diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,64469 @@ +{ + "best_metric": 0.16217102110385895, + "best_model_checkpoint": "wyluilipe/deabuse/checkpoint-9000", + "epoch": 2.742032471437162, + "eval_steps": 120, + "global_step": 9120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 13.701467514038086, + "learning_rate": 2e-05, + "loss": 6.6206, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 39.5434684753418, + "learning_rate": 1.9997995389395612e-05, + "loss": 4.445, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 83.92737579345703, + "learning_rate": 1.9995990778791222e-05, + "loss": 4.277, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 19.87366485595703, + "learning_rate": 1.9993986168186832e-05, + "loss": 4.9229, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 22.43378448486328, + "learning_rate": 1.9991981557582442e-05, + "loss": 4.7554, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 35.35092544555664, + "learning_rate": 1.9989976946978052e-05, + "loss": 4.1126, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 133.4622344970703, + "learning_rate": 1.998797233637366e-05, + "loss": 4.5409, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 20.33665657043457, + "learning_rate": 1.9985967725769272e-05, + "loss": 4.0595, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 85.67984008789062, + "learning_rate": 1.9983963115164882e-05, + "loss": 4.9032, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 34.769309997558594, + "learning_rate": 1.998195850456049e-05, + "loss": 5.5253, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 40.53438949584961, + "learning_rate": 1.9979953893956102e-05, + "loss": 4.3671, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 21.633525848388672, + "learning_rate": 1.997794928335171e-05, + "loss": 3.9831, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 20.7197322845459, + "learning_rate": 1.997594467274732e-05, + "loss": 4.4466, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 110.01347351074219, + "learning_rate": 1.9973940062142933e-05, + "loss": 3.8518, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 19.917098999023438, + "learning_rate": 1.997193545153854e-05, + "loss": 3.7986, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 31.238054275512695, + "learning_rate": 1.996993084093415e-05, + "loss": 3.9456, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 20.226728439331055, + "learning_rate": 1.996792623032976e-05, + "loss": 4.1858, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 28.916229248046875, + "learning_rate": 1.996592161972537e-05, + "loss": 4.2707, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 20.103069305419922, + "learning_rate": 1.996391700912098e-05, + "loss": 5.2733, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 30.481670379638672, + "learning_rate": 1.996191239851659e-05, + "loss": 3.6844, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 27.793643951416016, + "learning_rate": 1.99599077879122e-05, + "loss": 4.3403, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 32.221065521240234, + "learning_rate": 1.995790317730781e-05, + "loss": 3.9117, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 23.525253295898438, + "learning_rate": 1.995589856670342e-05, + "loss": 3.804, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 16.033878326416016, + "learning_rate": 1.995389395609903e-05, + "loss": 5.166, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 22.915647506713867, + "learning_rate": 1.995188934549464e-05, + "loss": 3.5634, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 18.16558837890625, + "learning_rate": 1.994988473489025e-05, + "loss": 4.9719, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 31.655603408813477, + "learning_rate": 1.994788012428586e-05, + "loss": 5.1661, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 18.09580421447754, + "learning_rate": 1.994587551368147e-05, + "loss": 4.4041, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 32.707950592041016, + "learning_rate": 1.9943870903077077e-05, + "loss": 4.5414, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 53.21809005737305, + "learning_rate": 1.994186629247269e-05, + "loss": 4.2779, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 36.61603927612305, + "learning_rate": 1.9939861681868297e-05, + "loss": 4.0579, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 21.24146842956543, + "learning_rate": 1.9937857071263907e-05, + "loss": 4.4486, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 45.33989715576172, + "learning_rate": 1.993585246065952e-05, + "loss": 4.7946, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 36.568145751953125, + "learning_rate": 1.9933847850055128e-05, + "loss": 4.6392, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 19.171186447143555, + "learning_rate": 1.9931843239450738e-05, + "loss": 4.0399, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 28.191804885864258, + "learning_rate": 1.9929838628846348e-05, + "loss": 4.9894, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 56.99246597290039, + "learning_rate": 1.9927834018241958e-05, + "loss": 3.7354, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 25.940950393676758, + "learning_rate": 1.9925829407637568e-05, + "loss": 3.4817, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 38.012001037597656, + "learning_rate": 1.9923824797033178e-05, + "loss": 3.5887, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 23.125755310058594, + "learning_rate": 1.9921820186428788e-05, + "loss": 4.8167, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 25.253414154052734, + "learning_rate": 1.9919815575824398e-05, + "loss": 3.8167, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 18.642539978027344, + "learning_rate": 1.991781096522001e-05, + "loss": 3.5483, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 102.15142059326172, + "learning_rate": 1.9915806354615615e-05, + "loss": 5.1262, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 24.280908584594727, + "learning_rate": 1.991380174401123e-05, + "loss": 4.2309, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 18.596893310546875, + "learning_rate": 1.991179713340684e-05, + "loss": 3.3356, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 21.592117309570312, + "learning_rate": 1.9909792522802445e-05, + "loss": 4.8101, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 19.77571678161621, + "learning_rate": 1.990778791219806e-05, + "loss": 3.4436, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 47.20704650878906, + "learning_rate": 1.9905783301593665e-05, + "loss": 4.077, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 21.739601135253906, + "learning_rate": 1.9903778690989276e-05, + "loss": 4.5314, + "step": 49 + }, + { + "epoch": 0.02, + "grad_norm": 22.48163604736328, + "learning_rate": 1.9901774080384886e-05, + "loss": 4.1253, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 26.019960403442383, + "learning_rate": 1.9899769469780496e-05, + "loss": 4.4319, + "step": 51 + }, + { + "epoch": 0.02, + "grad_norm": 21.49262046813965, + "learning_rate": 1.9897764859176106e-05, + "loss": 4.7591, + "step": 52 + }, + { + "epoch": 0.02, + "grad_norm": 21.852209091186523, + "learning_rate": 1.9895760248571716e-05, + "loss": 4.5679, + "step": 53 + }, + { + "epoch": 0.02, + "grad_norm": 17.225950241088867, + "learning_rate": 1.9893755637967326e-05, + "loss": 3.5482, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 17.815263748168945, + "learning_rate": 1.9891751027362936e-05, + "loss": 3.6293, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 19.188623428344727, + "learning_rate": 1.9889746416758546e-05, + "loss": 4.1118, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 21.920475006103516, + "learning_rate": 1.9887741806154156e-05, + "loss": 3.8108, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 24.557518005371094, + "learning_rate": 1.9885737195549766e-05, + "loss": 3.8317, + "step": 58 + }, + { + "epoch": 0.02, + "grad_norm": 90.77320098876953, + "learning_rate": 1.9883732584945376e-05, + "loss": 5.1687, + "step": 59 + }, + { + "epoch": 0.02, + "grad_norm": 25.83659553527832, + "learning_rate": 1.9881727974340986e-05, + "loss": 5.0576, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 19.337127685546875, + "learning_rate": 1.9879723363736597e-05, + "loss": 3.364, + "step": 61 + }, + { + "epoch": 0.02, + "grad_norm": 19.83724594116211, + "learning_rate": 1.9877718753132203e-05, + "loss": 3.5426, + "step": 62 + }, + { + "epoch": 0.02, + "grad_norm": 14.76127815246582, + "learning_rate": 1.9875714142527817e-05, + "loss": 3.8213, + "step": 63 + }, + { + "epoch": 0.02, + "grad_norm": 21.111013412475586, + "learning_rate": 1.9873709531923427e-05, + "loss": 3.4596, + "step": 64 + }, + { + "epoch": 0.02, + "grad_norm": 63.46014404296875, + "learning_rate": 1.9871704921319033e-05, + "loss": 4.4058, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 19.21253776550293, + "learning_rate": 1.9869700310714647e-05, + "loss": 3.8774, + "step": 66 + }, + { + "epoch": 0.02, + "grad_norm": 23.148107528686523, + "learning_rate": 1.9867695700110254e-05, + "loss": 3.9923, + "step": 67 + }, + { + "epoch": 0.02, + "grad_norm": 25.991294860839844, + "learning_rate": 1.9865691089505864e-05, + "loss": 4.5205, + "step": 68 + }, + { + "epoch": 0.02, + "grad_norm": 18.949478149414062, + "learning_rate": 1.9863686478901477e-05, + "loss": 4.678, + "step": 69 + }, + { + "epoch": 0.02, + "grad_norm": 17.614856719970703, + "learning_rate": 1.9861681868297084e-05, + "loss": 3.758, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 20.783035278320312, + "learning_rate": 1.9859677257692694e-05, + "loss": 4.5238, + "step": 71 + }, + { + "epoch": 0.02, + "grad_norm": 27.744810104370117, + "learning_rate": 1.9857672647088304e-05, + "loss": 4.7011, + "step": 72 + }, + { + "epoch": 0.02, + "grad_norm": 22.72287368774414, + "learning_rate": 1.9855668036483914e-05, + "loss": 3.6715, + "step": 73 + }, + { + "epoch": 0.02, + "grad_norm": 23.866741180419922, + "learning_rate": 1.9853663425879524e-05, + "loss": 5.4669, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 16.936256408691406, + "learning_rate": 1.9851658815275134e-05, + "loss": 3.3014, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 22.439456939697266, + "learning_rate": 1.9849654204670744e-05, + "loss": 3.202, + "step": 76 + }, + { + "epoch": 0.02, + "grad_norm": 16.06572723388672, + "learning_rate": 1.9847649594066354e-05, + "loss": 3.3253, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 29.42819595336914, + "learning_rate": 1.9845644983461965e-05, + "loss": 4.1956, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 20.668291091918945, + "learning_rate": 1.9843640372857575e-05, + "loss": 4.4557, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 21.241540908813477, + "learning_rate": 1.9841635762253185e-05, + "loss": 2.7819, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 39.1732292175293, + "learning_rate": 1.9839631151648795e-05, + "loss": 4.044, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 17.961164474487305, + "learning_rate": 1.9837626541044405e-05, + "loss": 4.0641, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 23.27247428894043, + "learning_rate": 1.9835621930440015e-05, + "loss": 4.1457, + "step": 83 + }, + { + "epoch": 0.03, + "grad_norm": 33.87324523925781, + "learning_rate": 1.983361731983562e-05, + "loss": 4.4233, + "step": 84 + }, + { + "epoch": 0.03, + "grad_norm": 27.01508903503418, + "learning_rate": 1.9831612709231235e-05, + "loss": 3.5391, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 18.453147888183594, + "learning_rate": 1.9829608098626842e-05, + "loss": 3.8098, + "step": 86 + }, + { + "epoch": 0.03, + "grad_norm": 551.41259765625, + "learning_rate": 1.9827603488022452e-05, + "loss": 3.3489, + "step": 87 + }, + { + "epoch": 0.03, + "grad_norm": 33.0008544921875, + "learning_rate": 1.9825598877418065e-05, + "loss": 3.8738, + "step": 88 + }, + { + "epoch": 0.03, + "grad_norm": 80.79717254638672, + "learning_rate": 1.9823594266813672e-05, + "loss": 3.9937, + "step": 89 + }, + { + "epoch": 0.03, + "grad_norm": 20.0161075592041, + "learning_rate": 1.9821589656209282e-05, + "loss": 3.2797, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 25.17633056640625, + "learning_rate": 1.9819585045604892e-05, + "loss": 3.3241, + "step": 91 + }, + { + "epoch": 0.03, + "grad_norm": 50.05910873413086, + "learning_rate": 1.9817580435000502e-05, + "loss": 5.4063, + "step": 92 + }, + { + "epoch": 0.03, + "grad_norm": 19.56489372253418, + "learning_rate": 1.9815575824396112e-05, + "loss": 3.3133, + "step": 93 + }, + { + "epoch": 0.03, + "grad_norm": 66.2906265258789, + "learning_rate": 1.9813571213791723e-05, + "loss": 4.3678, + "step": 94 + }, + { + "epoch": 0.03, + "grad_norm": 17.404077529907227, + "learning_rate": 1.9811566603187333e-05, + "loss": 3.7368, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 17.154563903808594, + "learning_rate": 1.9809561992582943e-05, + "loss": 3.6814, + "step": 96 + }, + { + "epoch": 0.03, + "grad_norm": 21.113162994384766, + "learning_rate": 1.9807557381978553e-05, + "loss": 4.3332, + "step": 97 + }, + { + "epoch": 0.03, + "grad_norm": 35.284725189208984, + "learning_rate": 1.9805552771374163e-05, + "loss": 3.2572, + "step": 98 + }, + { + "epoch": 0.03, + "grad_norm": 19.46085548400879, + "learning_rate": 1.9803548160769773e-05, + "loss": 3.5422, + "step": 99 + }, + { + "epoch": 0.03, + "grad_norm": 24.549236297607422, + "learning_rate": 1.9801543550165383e-05, + "loss": 3.071, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 24.21219253540039, + "learning_rate": 1.9799538939560993e-05, + "loss": 3.7529, + "step": 101 + }, + { + "epoch": 0.03, + "grad_norm": 19.04281234741211, + "learning_rate": 1.9797534328956603e-05, + "loss": 3.5407, + "step": 102 + }, + { + "epoch": 0.03, + "grad_norm": 46.35712814331055, + "learning_rate": 1.979552971835221e-05, + "loss": 3.0938, + "step": 103 + }, + { + "epoch": 0.03, + "grad_norm": 32.1992073059082, + "learning_rate": 1.9793525107747823e-05, + "loss": 4.1555, + "step": 104 + }, + { + "epoch": 0.03, + "grad_norm": 18.388561248779297, + "learning_rate": 1.979152049714343e-05, + "loss": 3.25, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 25.422645568847656, + "learning_rate": 1.978951588653904e-05, + "loss": 3.7947, + "step": 106 + }, + { + "epoch": 0.03, + "grad_norm": 20.530521392822266, + "learning_rate": 1.9787511275934654e-05, + "loss": 3.3529, + "step": 107 + }, + { + "epoch": 0.03, + "grad_norm": 40.206138610839844, + "learning_rate": 1.978550666533026e-05, + "loss": 4.6079, + "step": 108 + }, + { + "epoch": 0.03, + "grad_norm": 43.84805679321289, + "learning_rate": 1.978350205472587e-05, + "loss": 3.6345, + "step": 109 + }, + { + "epoch": 0.03, + "grad_norm": 26.29327392578125, + "learning_rate": 1.978149744412148e-05, + "loss": 4.9385, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 24.576719284057617, + "learning_rate": 1.977949283351709e-05, + "loss": 3.7024, + "step": 111 + }, + { + "epoch": 0.03, + "grad_norm": 21.797935485839844, + "learning_rate": 1.97774882229127e-05, + "loss": 4.0716, + "step": 112 + }, + { + "epoch": 0.03, + "grad_norm": 51.30803298950195, + "learning_rate": 1.977548361230831e-05, + "loss": 4.4803, + "step": 113 + }, + { + "epoch": 0.03, + "grad_norm": 17.133638381958008, + "learning_rate": 1.977347900170392e-05, + "loss": 2.996, + "step": 114 + }, + { + "epoch": 0.03, + "grad_norm": 23.30716323852539, + "learning_rate": 1.977147439109953e-05, + "loss": 3.6316, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 21.08536720275879, + "learning_rate": 1.976946978049514e-05, + "loss": 3.2101, + "step": 116 + }, + { + "epoch": 0.04, + "grad_norm": 40.39435577392578, + "learning_rate": 1.9767465169890748e-05, + "loss": 3.9427, + "step": 117 + }, + { + "epoch": 0.04, + "grad_norm": 20.827363967895508, + "learning_rate": 1.976546055928636e-05, + "loss": 3.7911, + "step": 118 + }, + { + "epoch": 0.04, + "grad_norm": 31.05581283569336, + "learning_rate": 1.976345594868197e-05, + "loss": 4.0686, + "step": 119 + }, + { + "epoch": 0.04, + "grad_norm": 75.66057586669922, + "learning_rate": 1.976145133807758e-05, + "loss": 3.659, + "step": 120 + }, + { + "epoch": 0.04, + "eval_loss": 1.6635291576385498, + "eval_runtime": 43.6414, + "eval_samples_per_second": 33.89, + "eval_steps_per_second": 33.89, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 14.022979736328125, + "learning_rate": 1.975944672747319e-05, + "loss": 3.0417, + "step": 121 + }, + { + "epoch": 0.04, + "grad_norm": 38.61516189575195, + "learning_rate": 1.9757442116868798e-05, + "loss": 4.3155, + "step": 122 + }, + { + "epoch": 0.04, + "grad_norm": 19.140295028686523, + "learning_rate": 1.975543750626441e-05, + "loss": 4.6368, + "step": 123 + }, + { + "epoch": 0.04, + "grad_norm": 24.628950119018555, + "learning_rate": 1.975343289566002e-05, + "loss": 4.131, + "step": 124 + }, + { + "epoch": 0.04, + "grad_norm": 32.278045654296875, + "learning_rate": 1.975142828505563e-05, + "loss": 4.4181, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 25.219432830810547, + "learning_rate": 1.9749423674451242e-05, + "loss": 3.8382, + "step": 126 + }, + { + "epoch": 0.04, + "grad_norm": 20.332189559936523, + "learning_rate": 1.974741906384685e-05, + "loss": 2.7729, + "step": 127 + }, + { + "epoch": 0.04, + "grad_norm": 24.03636932373047, + "learning_rate": 1.974541445324246e-05, + "loss": 3.4281, + "step": 128 + }, + { + "epoch": 0.04, + "grad_norm": 48.14682388305664, + "learning_rate": 1.974340984263807e-05, + "loss": 3.3327, + "step": 129 + }, + { + "epoch": 0.04, + "grad_norm": 22.612560272216797, + "learning_rate": 1.974140523203368e-05, + "loss": 2.8426, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 18.185649871826172, + "learning_rate": 1.973940062142929e-05, + "loss": 3.8089, + "step": 131 + }, + { + "epoch": 0.04, + "grad_norm": 39.714744567871094, + "learning_rate": 1.97373960108249e-05, + "loss": 3.5672, + "step": 132 + }, + { + "epoch": 0.04, + "grad_norm": 63.07310485839844, + "learning_rate": 1.973539140022051e-05, + "loss": 3.1297, + "step": 133 + }, + { + "epoch": 0.04, + "grad_norm": 22.210826873779297, + "learning_rate": 1.973338678961612e-05, + "loss": 5.2622, + "step": 134 + }, + { + "epoch": 0.04, + "grad_norm": 15.51650619506836, + "learning_rate": 1.973138217901173e-05, + "loss": 3.0659, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 20.847930908203125, + "learning_rate": 1.9729377568407336e-05, + "loss": 3.1732, + "step": 136 + }, + { + "epoch": 0.04, + "grad_norm": 18.530948638916016, + "learning_rate": 1.972737295780295e-05, + "loss": 2.7667, + "step": 137 + }, + { + "epoch": 0.04, + "grad_norm": 47.08240509033203, + "learning_rate": 1.972536834719856e-05, + "loss": 2.5596, + "step": 138 + }, + { + "epoch": 0.04, + "grad_norm": 26.97058868408203, + "learning_rate": 1.9723363736594166e-05, + "loss": 3.129, + "step": 139 + }, + { + "epoch": 0.04, + "grad_norm": 20.2777099609375, + "learning_rate": 1.972135912598978e-05, + "loss": 3.4919, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 21.72832679748535, + "learning_rate": 1.9719354515385386e-05, + "loss": 4.1341, + "step": 141 + }, + { + "epoch": 0.04, + "grad_norm": 17.30972671508789, + "learning_rate": 1.9717349904780996e-05, + "loss": 3.2444, + "step": 142 + }, + { + "epoch": 0.04, + "grad_norm": 49.03208923339844, + "learning_rate": 1.971534529417661e-05, + "loss": 4.314, + "step": 143 + }, + { + "epoch": 0.04, + "grad_norm": 26.420650482177734, + "learning_rate": 1.9713340683572217e-05, + "loss": 3.5411, + "step": 144 + }, + { + "epoch": 0.04, + "grad_norm": 25.890357971191406, + "learning_rate": 1.9711336072967827e-05, + "loss": 3.741, + "step": 145 + }, + { + "epoch": 0.04, + "grad_norm": 22.66683578491211, + "learning_rate": 1.9709331462363437e-05, + "loss": 3.116, + "step": 146 + }, + { + "epoch": 0.04, + "grad_norm": 31.934288024902344, + "learning_rate": 1.9707326851759047e-05, + "loss": 3.8739, + "step": 147 + }, + { + "epoch": 0.04, + "grad_norm": 29.55501937866211, + "learning_rate": 1.9705322241154657e-05, + "loss": 3.7167, + "step": 148 + }, + { + "epoch": 0.04, + "grad_norm": 21.13062286376953, + "learning_rate": 1.9703317630550267e-05, + "loss": 3.0495, + "step": 149 + }, + { + "epoch": 0.05, + "grad_norm": 27.444517135620117, + "learning_rate": 1.9701313019945877e-05, + "loss": 3.2852, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 19.37506675720215, + "learning_rate": 1.9699308409341487e-05, + "loss": 5.2696, + "step": 151 + }, + { + "epoch": 0.05, + "grad_norm": 49.605247497558594, + "learning_rate": 1.9697303798737097e-05, + "loss": 3.7375, + "step": 152 + }, + { + "epoch": 0.05, + "grad_norm": 19.60191535949707, + "learning_rate": 1.9695299188132707e-05, + "loss": 3.6209, + "step": 153 + }, + { + "epoch": 0.05, + "grad_norm": 18.86824607849121, + "learning_rate": 1.9693294577528317e-05, + "loss": 3.1842, + "step": 154 + }, + { + "epoch": 0.05, + "grad_norm": 30.93740463256836, + "learning_rate": 1.9691289966923928e-05, + "loss": 3.555, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 24.22440528869629, + "learning_rate": 1.9689285356319538e-05, + "loss": 3.5906, + "step": 156 + }, + { + "epoch": 0.05, + "grad_norm": 43.011539459228516, + "learning_rate": 1.9687280745715148e-05, + "loss": 4.5245, + "step": 157 + }, + { + "epoch": 0.05, + "grad_norm": 24.332199096679688, + "learning_rate": 1.9685276135110754e-05, + "loss": 3.9531, + "step": 158 + }, + { + "epoch": 0.05, + "grad_norm": 37.94855880737305, + "learning_rate": 1.9683271524506368e-05, + "loss": 4.7006, + "step": 159 + }, + { + "epoch": 0.05, + "grad_norm": 20.99969482421875, + "learning_rate": 1.9681266913901975e-05, + "loss": 4.7367, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 34.952232360839844, + "learning_rate": 1.9679262303297585e-05, + "loss": 3.2787, + "step": 161 + }, + { + "epoch": 0.05, + "grad_norm": 25.392833709716797, + "learning_rate": 1.9677257692693198e-05, + "loss": 5.474, + "step": 162 + }, + { + "epoch": 0.05, + "grad_norm": 20.959766387939453, + "learning_rate": 1.9675253082088805e-05, + "loss": 4.8595, + "step": 163 + }, + { + "epoch": 0.05, + "grad_norm": 49.37668228149414, + "learning_rate": 1.9673248471484415e-05, + "loss": 4.2558, + "step": 164 + }, + { + "epoch": 0.05, + "grad_norm": 23.258516311645508, + "learning_rate": 1.9671243860880025e-05, + "loss": 4.362, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 23.190126419067383, + "learning_rate": 1.9669239250275635e-05, + "loss": 3.2228, + "step": 166 + }, + { + "epoch": 0.05, + "grad_norm": 18.251544952392578, + "learning_rate": 1.9667234639671245e-05, + "loss": 4.7009, + "step": 167 + }, + { + "epoch": 0.05, + "grad_norm": 19.142702102661133, + "learning_rate": 1.9665230029066855e-05, + "loss": 3.9582, + "step": 168 + }, + { + "epoch": 0.05, + "grad_norm": 21.59228515625, + "learning_rate": 1.9663225418462465e-05, + "loss": 5.7372, + "step": 169 + }, + { + "epoch": 0.05, + "grad_norm": 55.695281982421875, + "learning_rate": 1.9661220807858075e-05, + "loss": 3.9148, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 68.97881317138672, + "learning_rate": 1.9659216197253685e-05, + "loss": 4.3682, + "step": 171 + }, + { + "epoch": 0.05, + "grad_norm": 20.77494239807129, + "learning_rate": 1.9657211586649296e-05, + "loss": 4.0729, + "step": 172 + }, + { + "epoch": 0.05, + "grad_norm": 48.90683364868164, + "learning_rate": 1.9655206976044906e-05, + "loss": 3.9279, + "step": 173 + }, + { + "epoch": 0.05, + "grad_norm": 27.227113723754883, + "learning_rate": 1.9653202365440516e-05, + "loss": 4.6083, + "step": 174 + }, + { + "epoch": 0.05, + "grad_norm": 25.539125442504883, + "learning_rate": 1.9651197754836126e-05, + "loss": 3.8157, + "step": 175 + }, + { + "epoch": 0.05, + "grad_norm": 21.638704299926758, + "learning_rate": 1.9649193144231736e-05, + "loss": 3.8649, + "step": 176 + }, + { + "epoch": 0.05, + "grad_norm": 29.82769775390625, + "learning_rate": 1.9647188533627343e-05, + "loss": 4.5611, + "step": 177 + }, + { + "epoch": 0.05, + "grad_norm": 22.91179656982422, + "learning_rate": 1.9645183923022956e-05, + "loss": 3.083, + "step": 178 + }, + { + "epoch": 0.05, + "grad_norm": 19.676715850830078, + "learning_rate": 1.9643179312418563e-05, + "loss": 3.8661, + "step": 179 + }, + { + "epoch": 0.05, + "grad_norm": 18.871702194213867, + "learning_rate": 1.9641174701814173e-05, + "loss": 3.8232, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 33.633079528808594, + "learning_rate": 1.9639170091209786e-05, + "loss": 3.9483, + "step": 181 + }, + { + "epoch": 0.05, + "grad_norm": 17.3801212310791, + "learning_rate": 1.9637165480605393e-05, + "loss": 3.4511, + "step": 182 + }, + { + "epoch": 0.06, + "grad_norm": 30.476238250732422, + "learning_rate": 1.9635160870001003e-05, + "loss": 4.5803, + "step": 183 + }, + { + "epoch": 0.06, + "grad_norm": 20.700347900390625, + "learning_rate": 1.9633156259396613e-05, + "loss": 3.6991, + "step": 184 + }, + { + "epoch": 0.06, + "grad_norm": 16.463884353637695, + "learning_rate": 1.9631151648792223e-05, + "loss": 3.2284, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 22.347118377685547, + "learning_rate": 1.9629147038187833e-05, + "loss": 4.603, + "step": 186 + }, + { + "epoch": 0.06, + "grad_norm": 22.56680679321289, + "learning_rate": 1.9627142427583443e-05, + "loss": 4.332, + "step": 187 + }, + { + "epoch": 0.06, + "grad_norm": 31.80207061767578, + "learning_rate": 1.9625137816979054e-05, + "loss": 3.8008, + "step": 188 + }, + { + "epoch": 0.06, + "grad_norm": 22.534713745117188, + "learning_rate": 1.9623133206374664e-05, + "loss": 3.7199, + "step": 189 + }, + { + "epoch": 0.06, + "grad_norm": 22.314205169677734, + "learning_rate": 1.9621128595770274e-05, + "loss": 4.3184, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 24.710988998413086, + "learning_rate": 1.9619123985165884e-05, + "loss": 4.7067, + "step": 191 + }, + { + "epoch": 0.06, + "grad_norm": 34.565242767333984, + "learning_rate": 1.9617119374561494e-05, + "loss": 3.0013, + "step": 192 + }, + { + "epoch": 0.06, + "grad_norm": 21.252920150756836, + "learning_rate": 1.9615114763957104e-05, + "loss": 5.5017, + "step": 193 + }, + { + "epoch": 0.06, + "grad_norm": 18.080705642700195, + "learning_rate": 1.9613110153352714e-05, + "loss": 3.4748, + "step": 194 + }, + { + "epoch": 0.06, + "grad_norm": 20.119901657104492, + "learning_rate": 1.9611105542748324e-05, + "loss": 4.6922, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 20.794160842895508, + "learning_rate": 1.960910093214393e-05, + "loss": 4.3524, + "step": 196 + }, + { + "epoch": 0.06, + "grad_norm": 16.995960235595703, + "learning_rate": 1.9607096321539544e-05, + "loss": 5.311, + "step": 197 + }, + { + "epoch": 0.06, + "grad_norm": 28.07927131652832, + "learning_rate": 1.9605091710935154e-05, + "loss": 4.5386, + "step": 198 + }, + { + "epoch": 0.06, + "grad_norm": 20.054431915283203, + "learning_rate": 1.960308710033076e-05, + "loss": 3.8613, + "step": 199 + }, + { + "epoch": 0.06, + "grad_norm": 19.08523941040039, + "learning_rate": 1.9601082489726375e-05, + "loss": 4.5862, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 20.86153793334961, + "learning_rate": 1.959907787912198e-05, + "loss": 4.4943, + "step": 201 + }, + { + "epoch": 0.06, + "grad_norm": 17.117141723632812, + "learning_rate": 1.959707326851759e-05, + "loss": 3.1455, + "step": 202 + }, + { + "epoch": 0.06, + "grad_norm": 20.701730728149414, + "learning_rate": 1.95950686579132e-05, + "loss": 4.1613, + "step": 203 + }, + { + "epoch": 0.06, + "grad_norm": 17.824867248535156, + "learning_rate": 1.959306404730881e-05, + "loss": 3.764, + "step": 204 + }, + { + "epoch": 0.06, + "grad_norm": 23.23500633239746, + "learning_rate": 1.959105943670442e-05, + "loss": 3.647, + "step": 205 + }, + { + "epoch": 0.06, + "grad_norm": 19.766569137573242, + "learning_rate": 1.958905482610003e-05, + "loss": 3.9145, + "step": 206 + }, + { + "epoch": 0.06, + "grad_norm": 22.38438606262207, + "learning_rate": 1.9587050215495642e-05, + "loss": 4.4528, + "step": 207 + }, + { + "epoch": 0.06, + "grad_norm": 17.628910064697266, + "learning_rate": 1.9585045604891252e-05, + "loss": 3.7731, + "step": 208 + }, + { + "epoch": 0.06, + "grad_norm": 18.171613693237305, + "learning_rate": 1.9583040994286862e-05, + "loss": 4.1904, + "step": 209 + }, + { + "epoch": 0.06, + "grad_norm": 27.142881393432617, + "learning_rate": 1.958103638368247e-05, + "loss": 4.3347, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 13.47857666015625, + "learning_rate": 1.9579031773078082e-05, + "loss": 3.2086, + "step": 211 + }, + { + "epoch": 0.06, + "grad_norm": 18.275033950805664, + "learning_rate": 1.9577027162473692e-05, + "loss": 4.6453, + "step": 212 + }, + { + "epoch": 0.06, + "grad_norm": 17.69455337524414, + "learning_rate": 1.95750225518693e-05, + "loss": 3.5041, + "step": 213 + }, + { + "epoch": 0.06, + "grad_norm": 16.023883819580078, + "learning_rate": 1.9573017941264912e-05, + "loss": 4.2656, + "step": 214 + }, + { + "epoch": 0.06, + "grad_norm": 25.941577911376953, + "learning_rate": 1.957101333066052e-05, + "loss": 3.9683, + "step": 215 + }, + { + "epoch": 0.06, + "grad_norm": 28.56459617614746, + "learning_rate": 1.956900872005613e-05, + "loss": 4.1389, + "step": 216 + }, + { + "epoch": 0.07, + "grad_norm": 18.201026916503906, + "learning_rate": 1.9567004109451743e-05, + "loss": 4.2534, + "step": 217 + }, + { + "epoch": 0.07, + "grad_norm": 17.178457260131836, + "learning_rate": 1.956499949884735e-05, + "loss": 3.8277, + "step": 218 + }, + { + "epoch": 0.07, + "grad_norm": 38.801876068115234, + "learning_rate": 1.956299488824296e-05, + "loss": 4.1429, + "step": 219 + }, + { + "epoch": 0.07, + "grad_norm": 23.956295013427734, + "learning_rate": 1.956099027763857e-05, + "loss": 5.4837, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 16.060426712036133, + "learning_rate": 1.955898566703418e-05, + "loss": 3.7044, + "step": 221 + }, + { + "epoch": 0.07, + "grad_norm": 18.859159469604492, + "learning_rate": 1.955698105642979e-05, + "loss": 4.168, + "step": 222 + }, + { + "epoch": 0.07, + "grad_norm": 21.508644104003906, + "learning_rate": 1.95549764458254e-05, + "loss": 4.5285, + "step": 223 + }, + { + "epoch": 0.07, + "grad_norm": 25.023488998413086, + "learning_rate": 1.955297183522101e-05, + "loss": 3.2762, + "step": 224 + }, + { + "epoch": 0.07, + "grad_norm": 16.727890014648438, + "learning_rate": 1.955096722461662e-05, + "loss": 3.3409, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 20.456737518310547, + "learning_rate": 1.954896261401223e-05, + "loss": 4.1665, + "step": 226 + }, + { + "epoch": 0.07, + "grad_norm": 17.329370498657227, + "learning_rate": 1.954695800340784e-05, + "loss": 3.0992, + "step": 227 + }, + { + "epoch": 0.07, + "grad_norm": 39.43778610229492, + "learning_rate": 1.954495339280345e-05, + "loss": 4.5199, + "step": 228 + }, + { + "epoch": 0.07, + "grad_norm": 20.120681762695312, + "learning_rate": 1.954294878219906e-05, + "loss": 4.6025, + "step": 229 + }, + { + "epoch": 0.07, + "grad_norm": 22.665719985961914, + "learning_rate": 1.954094417159467e-05, + "loss": 4.3815, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 24.487995147705078, + "learning_rate": 1.953893956099028e-05, + "loss": 4.2686, + "step": 231 + }, + { + "epoch": 0.07, + "grad_norm": 32.88031768798828, + "learning_rate": 1.9536934950385887e-05, + "loss": 4.3775, + "step": 232 + }, + { + "epoch": 0.07, + "grad_norm": 22.327272415161133, + "learning_rate": 1.95349303397815e-05, + "loss": 3.7784, + "step": 233 + }, + { + "epoch": 0.07, + "grad_norm": 33.93623352050781, + "learning_rate": 1.9532925729177107e-05, + "loss": 3.2231, + "step": 234 + }, + { + "epoch": 0.07, + "grad_norm": 24.04096794128418, + "learning_rate": 1.9530921118572717e-05, + "loss": 4.1805, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 18.985858917236328, + "learning_rate": 1.952891650796833e-05, + "loss": 4.1365, + "step": 236 + }, + { + "epoch": 0.07, + "grad_norm": 21.616893768310547, + "learning_rate": 1.9526911897363937e-05, + "loss": 2.4831, + "step": 237 + }, + { + "epoch": 0.07, + "grad_norm": 26.768409729003906, + "learning_rate": 1.9524907286759548e-05, + "loss": 4.6538, + "step": 238 + }, + { + "epoch": 0.07, + "grad_norm": 21.572690963745117, + "learning_rate": 1.9522902676155158e-05, + "loss": 3.7805, + "step": 239 + }, + { + "epoch": 0.07, + "grad_norm": 14.675576210021973, + "learning_rate": 1.9520898065550768e-05, + "loss": 3.3591, + "step": 240 + }, + { + "epoch": 0.07, + "eval_loss": 1.399770975112915, + "eval_runtime": 43.6698, + "eval_samples_per_second": 33.868, + "eval_steps_per_second": 33.868, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 39.946598052978516, + "learning_rate": 1.9518893454946378e-05, + "loss": 3.3287, + "step": 241 + }, + { + "epoch": 0.07, + "grad_norm": 22.521690368652344, + "learning_rate": 1.9516888844341988e-05, + "loss": 3.3806, + "step": 242 + }, + { + "epoch": 0.07, + "grad_norm": 18.76473045349121, + "learning_rate": 1.9514884233737598e-05, + "loss": 4.1907, + "step": 243 + }, + { + "epoch": 0.07, + "grad_norm": 17.154766082763672, + "learning_rate": 1.9512879623133208e-05, + "loss": 3.7032, + "step": 244 + }, + { + "epoch": 0.07, + "grad_norm": 25.60683250427246, + "learning_rate": 1.9510875012528818e-05, + "loss": 4.4485, + "step": 245 + }, + { + "epoch": 0.07, + "grad_norm": 29.663965225219727, + "learning_rate": 1.9508870401924428e-05, + "loss": 4.5449, + "step": 246 + }, + { + "epoch": 0.07, + "grad_norm": 24.08624267578125, + "learning_rate": 1.950686579132004e-05, + "loss": 4.6023, + "step": 247 + }, + { + "epoch": 0.07, + "grad_norm": 17.730571746826172, + "learning_rate": 1.950486118071565e-05, + "loss": 4.7885, + "step": 248 + }, + { + "epoch": 0.07, + "grad_norm": 16.919239044189453, + "learning_rate": 1.950285657011126e-05, + "loss": 3.2188, + "step": 249 + }, + { + "epoch": 0.08, + "grad_norm": 16.421016693115234, + "learning_rate": 1.950085195950687e-05, + "loss": 4.6058, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 26.03248405456543, + "learning_rate": 1.9498847348902475e-05, + "loss": 5.1777, + "step": 251 + }, + { + "epoch": 0.08, + "grad_norm": 24.347097396850586, + "learning_rate": 1.949684273829809e-05, + "loss": 4.5525, + "step": 252 + }, + { + "epoch": 0.08, + "grad_norm": 16.834897994995117, + "learning_rate": 1.9494838127693695e-05, + "loss": 3.7237, + "step": 253 + }, + { + "epoch": 0.08, + "grad_norm": 14.267677307128906, + "learning_rate": 1.9492833517089306e-05, + "loss": 4.2919, + "step": 254 + }, + { + "epoch": 0.08, + "grad_norm": 14.566679954528809, + "learning_rate": 1.949082890648492e-05, + "loss": 4.2664, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 20.296588897705078, + "learning_rate": 1.9488824295880526e-05, + "loss": 4.1636, + "step": 256 + }, + { + "epoch": 0.08, + "grad_norm": 19.508743286132812, + "learning_rate": 1.9486819685276136e-05, + "loss": 3.5817, + "step": 257 + }, + { + "epoch": 0.08, + "grad_norm": 22.892982482910156, + "learning_rate": 1.9484815074671746e-05, + "loss": 4.1911, + "step": 258 + }, + { + "epoch": 0.08, + "grad_norm": 20.90638542175293, + "learning_rate": 1.9482810464067356e-05, + "loss": 3.3314, + "step": 259 + }, + { + "epoch": 0.08, + "grad_norm": 16.314008712768555, + "learning_rate": 1.9480805853462966e-05, + "loss": 3.5795, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 19.024072647094727, + "learning_rate": 1.9478801242858576e-05, + "loss": 3.4856, + "step": 261 + }, + { + "epoch": 0.08, + "grad_norm": 15.41294002532959, + "learning_rate": 1.9476796632254186e-05, + "loss": 3.9168, + "step": 262 + }, + { + "epoch": 0.08, + "grad_norm": 27.279674530029297, + "learning_rate": 1.9474792021649796e-05, + "loss": 4.4373, + "step": 263 + }, + { + "epoch": 0.08, + "grad_norm": 26.365346908569336, + "learning_rate": 1.9472787411045406e-05, + "loss": 4.3507, + "step": 264 + }, + { + "epoch": 0.08, + "grad_norm": 18.319107055664062, + "learning_rate": 1.9470782800441016e-05, + "loss": 4.1936, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 16.385744094848633, + "learning_rate": 1.9468778189836627e-05, + "loss": 4.3412, + "step": 266 + }, + { + "epoch": 0.08, + "grad_norm": 17.853548049926758, + "learning_rate": 1.9466773579232237e-05, + "loss": 4.4193, + "step": 267 + }, + { + "epoch": 0.08, + "grad_norm": 23.52976417541504, + "learning_rate": 1.9464768968627847e-05, + "loss": 5.0003, + "step": 268 + }, + { + "epoch": 0.08, + "grad_norm": 20.375717163085938, + "learning_rate": 1.9462764358023457e-05, + "loss": 3.6193, + "step": 269 + }, + { + "epoch": 0.08, + "grad_norm": 19.429637908935547, + "learning_rate": 1.9460759747419063e-05, + "loss": 4.0949, + "step": 270 + }, + { + "epoch": 0.08, + "grad_norm": 16.466793060302734, + "learning_rate": 1.9458755136814677e-05, + "loss": 3.3825, + "step": 271 + }, + { + "epoch": 0.08, + "grad_norm": 28.024784088134766, + "learning_rate": 1.9456750526210287e-05, + "loss": 4.1028, + "step": 272 + }, + { + "epoch": 0.08, + "grad_norm": 20.214069366455078, + "learning_rate": 1.9454745915605894e-05, + "loss": 3.611, + "step": 273 + }, + { + "epoch": 0.08, + "grad_norm": 17.36444854736328, + "learning_rate": 1.9452741305001507e-05, + "loss": 3.7358, + "step": 274 + }, + { + "epoch": 0.08, + "grad_norm": 17.746870040893555, + "learning_rate": 1.9450736694397114e-05, + "loss": 3.527, + "step": 275 + }, + { + "epoch": 0.08, + "grad_norm": 15.168383598327637, + "learning_rate": 1.9448732083792724e-05, + "loss": 3.3192, + "step": 276 + }, + { + "epoch": 0.08, + "grad_norm": 24.655101776123047, + "learning_rate": 1.9446727473188334e-05, + "loss": 5.3757, + "step": 277 + }, + { + "epoch": 0.08, + "grad_norm": 22.04741668701172, + "learning_rate": 1.9444722862583944e-05, + "loss": 3.5021, + "step": 278 + }, + { + "epoch": 0.08, + "grad_norm": 15.345871925354004, + "learning_rate": 1.9442718251979554e-05, + "loss": 4.5815, + "step": 279 + }, + { + "epoch": 0.08, + "grad_norm": 17.076583862304688, + "learning_rate": 1.9440713641375164e-05, + "loss": 4.7359, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 17.223661422729492, + "learning_rate": 1.9438709030770774e-05, + "loss": 3.6285, + "step": 281 + }, + { + "epoch": 0.08, + "grad_norm": 19.869312286376953, + "learning_rate": 1.9436704420166385e-05, + "loss": 3.9027, + "step": 282 + }, + { + "epoch": 0.09, + "grad_norm": 20.547405242919922, + "learning_rate": 1.9434699809561995e-05, + "loss": 3.6283, + "step": 283 + }, + { + "epoch": 0.09, + "grad_norm": 18.522144317626953, + "learning_rate": 1.9432695198957605e-05, + "loss": 4.2172, + "step": 284 + }, + { + "epoch": 0.09, + "grad_norm": 30.722158432006836, + "learning_rate": 1.9430690588353215e-05, + "loss": 3.6669, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 16.493730545043945, + "learning_rate": 1.9428685977748825e-05, + "loss": 4.1544, + "step": 286 + }, + { + "epoch": 0.09, + "grad_norm": 15.535743713378906, + "learning_rate": 1.942668136714443e-05, + "loss": 4.1201, + "step": 287 + }, + { + "epoch": 0.09, + "grad_norm": 136.23187255859375, + "learning_rate": 1.9424676756540045e-05, + "loss": 4.5668, + "step": 288 + }, + { + "epoch": 0.09, + "grad_norm": 15.681511878967285, + "learning_rate": 1.9422672145935652e-05, + "loss": 4.1992, + "step": 289 + }, + { + "epoch": 0.09, + "grad_norm": 12.06187915802002, + "learning_rate": 1.9420667535331262e-05, + "loss": 2.6199, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 14.176081657409668, + "learning_rate": 1.9418662924726875e-05, + "loss": 2.6663, + "step": 291 + }, + { + "epoch": 0.09, + "grad_norm": 16.331350326538086, + "learning_rate": 1.9416658314122482e-05, + "loss": 4.1395, + "step": 292 + }, + { + "epoch": 0.09, + "grad_norm": 25.406064987182617, + "learning_rate": 1.9414653703518095e-05, + "loss": 3.2631, + "step": 293 + }, + { + "epoch": 0.09, + "grad_norm": 25.89323616027832, + "learning_rate": 1.9412649092913702e-05, + "loss": 4.4711, + "step": 294 + }, + { + "epoch": 0.09, + "grad_norm": 12.358001708984375, + "learning_rate": 1.9410644482309312e-05, + "loss": 4.0686, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 17.452009201049805, + "learning_rate": 1.9408639871704922e-05, + "loss": 3.6253, + "step": 296 + }, + { + "epoch": 0.09, + "grad_norm": 28.617233276367188, + "learning_rate": 1.9406635261100532e-05, + "loss": 4.9357, + "step": 297 + }, + { + "epoch": 0.09, + "grad_norm": 17.00152587890625, + "learning_rate": 1.9404630650496142e-05, + "loss": 3.7579, + "step": 298 + }, + { + "epoch": 0.09, + "grad_norm": 17.73528289794922, + "learning_rate": 1.9402626039891753e-05, + "loss": 3.7798, + "step": 299 + }, + { + "epoch": 0.09, + "grad_norm": 15.016806602478027, + "learning_rate": 1.9400621429287363e-05, + "loss": 4.3192, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 12.723051071166992, + "learning_rate": 1.9398616818682973e-05, + "loss": 3.8939, + "step": 301 + }, + { + "epoch": 0.09, + "grad_norm": 14.875953674316406, + "learning_rate": 1.9396612208078583e-05, + "loss": 2.8365, + "step": 302 + }, + { + "epoch": 0.09, + "grad_norm": 16.63105583190918, + "learning_rate": 1.9394607597474193e-05, + "loss": 3.7121, + "step": 303 + }, + { + "epoch": 0.09, + "grad_norm": 22.02267074584961, + "learning_rate": 1.9392602986869803e-05, + "loss": 5.0273, + "step": 304 + }, + { + "epoch": 0.09, + "grad_norm": 44.309234619140625, + "learning_rate": 1.9390598376265413e-05, + "loss": 4.5806, + "step": 305 + }, + { + "epoch": 0.09, + "grad_norm": 20.403287887573242, + "learning_rate": 1.938859376566102e-05, + "loss": 3.2671, + "step": 306 + }, + { + "epoch": 0.09, + "grad_norm": 15.081746101379395, + "learning_rate": 1.9386589155056633e-05, + "loss": 4.2114, + "step": 307 + }, + { + "epoch": 0.09, + "grad_norm": 16.417646408081055, + "learning_rate": 1.938458454445224e-05, + "loss": 3.878, + "step": 308 + }, + { + "epoch": 0.09, + "grad_norm": 20.749244689941406, + "learning_rate": 1.938257993384785e-05, + "loss": 3.3179, + "step": 309 + }, + { + "epoch": 0.09, + "grad_norm": 21.518075942993164, + "learning_rate": 1.9380575323243463e-05, + "loss": 3.4908, + "step": 310 + }, + { + "epoch": 0.09, + "grad_norm": 17.475610733032227, + "learning_rate": 1.937857071263907e-05, + "loss": 4.1835, + "step": 311 + }, + { + "epoch": 0.09, + "grad_norm": 20.958995819091797, + "learning_rate": 1.937656610203468e-05, + "loss": 4.5692, + "step": 312 + }, + { + "epoch": 0.09, + "grad_norm": 16.16900062561035, + "learning_rate": 1.937456149143029e-05, + "loss": 2.9379, + "step": 313 + }, + { + "epoch": 0.09, + "grad_norm": 29.76364517211914, + "learning_rate": 1.93725568808259e-05, + "loss": 4.0285, + "step": 314 + }, + { + "epoch": 0.09, + "grad_norm": 14.153215408325195, + "learning_rate": 1.937055227022151e-05, + "loss": 3.6143, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 25.9129581451416, + "learning_rate": 1.936854765961712e-05, + "loss": 4.9313, + "step": 316 + }, + { + "epoch": 0.1, + "grad_norm": 12.36880874633789, + "learning_rate": 1.936654304901273e-05, + "loss": 3.5271, + "step": 317 + }, + { + "epoch": 0.1, + "grad_norm": 20.27505874633789, + "learning_rate": 1.936453843840834e-05, + "loss": 3.8401, + "step": 318 + }, + { + "epoch": 0.1, + "grad_norm": 20.562847137451172, + "learning_rate": 1.936253382780395e-05, + "loss": 4.0286, + "step": 319 + }, + { + "epoch": 0.1, + "grad_norm": 23.03611183166504, + "learning_rate": 1.936052921719956e-05, + "loss": 3.1755, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 24.045217514038086, + "learning_rate": 1.935852460659517e-05, + "loss": 4.1921, + "step": 321 + }, + { + "epoch": 0.1, + "grad_norm": 15.012109756469727, + "learning_rate": 1.935651999599078e-05, + "loss": 3.9538, + "step": 322 + }, + { + "epoch": 0.1, + "grad_norm": 16.769580841064453, + "learning_rate": 1.935451538538639e-05, + "loss": 4.1792, + "step": 323 + }, + { + "epoch": 0.1, + "grad_norm": 15.990212440490723, + "learning_rate": 1.9352510774782e-05, + "loss": 3.3792, + "step": 324 + }, + { + "epoch": 0.1, + "grad_norm": 17.586837768554688, + "learning_rate": 1.9350506164177608e-05, + "loss": 3.8851, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 28.66331672668457, + "learning_rate": 1.934850155357322e-05, + "loss": 4.4007, + "step": 326 + }, + { + "epoch": 0.1, + "grad_norm": 19.1878604888916, + "learning_rate": 1.934649694296883e-05, + "loss": 3.7238, + "step": 327 + }, + { + "epoch": 0.1, + "grad_norm": 27.458173751831055, + "learning_rate": 1.9344492332364438e-05, + "loss": 3.6198, + "step": 328 + }, + { + "epoch": 0.1, + "grad_norm": 23.124168395996094, + "learning_rate": 1.934248772176005e-05, + "loss": 3.9671, + "step": 329 + }, + { + "epoch": 0.1, + "grad_norm": 22.697099685668945, + "learning_rate": 1.934048311115566e-05, + "loss": 4.6046, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 16.66628074645996, + "learning_rate": 1.933847850055127e-05, + "loss": 3.4079, + "step": 331 + }, + { + "epoch": 0.1, + "grad_norm": 21.602243423461914, + "learning_rate": 1.933647388994688e-05, + "loss": 2.6947, + "step": 332 + }, + { + "epoch": 0.1, + "grad_norm": 19.987550735473633, + "learning_rate": 1.933446927934249e-05, + "loss": 4.0264, + "step": 333 + }, + { + "epoch": 0.1, + "grad_norm": 27.548095703125, + "learning_rate": 1.93324646687381e-05, + "loss": 3.6234, + "step": 334 + }, + { + "epoch": 0.1, + "grad_norm": 16.985288619995117, + "learning_rate": 1.933046005813371e-05, + "loss": 3.5273, + "step": 335 + }, + { + "epoch": 0.1, + "grad_norm": 249.91415405273438, + "learning_rate": 1.932845544752932e-05, + "loss": 3.0628, + "step": 336 + }, + { + "epoch": 0.1, + "grad_norm": 22.47792625427246, + "learning_rate": 1.932645083692493e-05, + "loss": 3.7402, + "step": 337 + }, + { + "epoch": 0.1, + "grad_norm": 28.15104866027832, + "learning_rate": 1.932444622632054e-05, + "loss": 3.5902, + "step": 338 + }, + { + "epoch": 0.1, + "grad_norm": 15.687856674194336, + "learning_rate": 1.932244161571615e-05, + "loss": 3.4816, + "step": 339 + }, + { + "epoch": 0.1, + "grad_norm": 15.60968017578125, + "learning_rate": 1.932043700511176e-05, + "loss": 3.8098, + "step": 340 + }, + { + "epoch": 0.1, + "grad_norm": 21.058176040649414, + "learning_rate": 1.931843239450737e-05, + "loss": 4.0355, + "step": 341 + }, + { + "epoch": 0.1, + "grad_norm": 52.26857376098633, + "learning_rate": 1.931642778390298e-05, + "loss": 5.9303, + "step": 342 + }, + { + "epoch": 0.1, + "grad_norm": 14.664571762084961, + "learning_rate": 1.931442317329859e-05, + "loss": 3.5722, + "step": 343 + }, + { + "epoch": 0.1, + "grad_norm": 11.48156452178955, + "learning_rate": 1.9312418562694196e-05, + "loss": 3.8408, + "step": 344 + }, + { + "epoch": 0.1, + "grad_norm": 18.76517677307129, + "learning_rate": 1.931041395208981e-05, + "loss": 3.5406, + "step": 345 + }, + { + "epoch": 0.1, + "grad_norm": 15.111738204956055, + "learning_rate": 1.930840934148542e-05, + "loss": 3.6611, + "step": 346 + }, + { + "epoch": 0.1, + "grad_norm": 11.203923225402832, + "learning_rate": 1.9306404730881026e-05, + "loss": 3.0199, + "step": 347 + }, + { + "epoch": 0.1, + "grad_norm": 21.51028060913086, + "learning_rate": 1.930440012027664e-05, + "loss": 3.1991, + "step": 348 + }, + { + "epoch": 0.1, + "grad_norm": 21.56675910949707, + "learning_rate": 1.9302395509672247e-05, + "loss": 3.6963, + "step": 349 + }, + { + "epoch": 0.11, + "grad_norm": 13.743776321411133, + "learning_rate": 1.9300390899067857e-05, + "loss": 3.6142, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 15.742630004882812, + "learning_rate": 1.9298386288463467e-05, + "loss": 2.8555, + "step": 351 + }, + { + "epoch": 0.11, + "grad_norm": 14.802862167358398, + "learning_rate": 1.9296381677859077e-05, + "loss": 2.9698, + "step": 352 + }, + { + "epoch": 0.11, + "grad_norm": 16.21837615966797, + "learning_rate": 1.9294377067254687e-05, + "loss": 5.3529, + "step": 353 + }, + { + "epoch": 0.11, + "grad_norm": 14.180800437927246, + "learning_rate": 1.9292372456650297e-05, + "loss": 3.7083, + "step": 354 + }, + { + "epoch": 0.11, + "grad_norm": 16.493022918701172, + "learning_rate": 1.9290367846045907e-05, + "loss": 2.6302, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 19.73296356201172, + "learning_rate": 1.9288363235441517e-05, + "loss": 3.4556, + "step": 356 + }, + { + "epoch": 0.11, + "grad_norm": 20.31797218322754, + "learning_rate": 1.9286358624837127e-05, + "loss": 4.2031, + "step": 357 + }, + { + "epoch": 0.11, + "grad_norm": 24.127195358276367, + "learning_rate": 1.9284354014232737e-05, + "loss": 4.2493, + "step": 358 + }, + { + "epoch": 0.11, + "grad_norm": 33.403282165527344, + "learning_rate": 1.9282349403628347e-05, + "loss": 4.6944, + "step": 359 + }, + { + "epoch": 0.11, + "grad_norm": 52.27415466308594, + "learning_rate": 1.9280344793023958e-05, + "loss": 3.5661, + "step": 360 + }, + { + "epoch": 0.11, + "eval_loss": 1.2527186870574951, + "eval_runtime": 43.7248, + "eval_samples_per_second": 33.825, + "eval_steps_per_second": 33.825, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 18.096208572387695, + "learning_rate": 1.9278340182419568e-05, + "loss": 4.3089, + "step": 361 + }, + { + "epoch": 0.11, + "grad_norm": 12.513582229614258, + "learning_rate": 1.9276335571815178e-05, + "loss": 3.7448, + "step": 362 + }, + { + "epoch": 0.11, + "grad_norm": 18.849184036254883, + "learning_rate": 1.9274330961210784e-05, + "loss": 4.8774, + "step": 363 + }, + { + "epoch": 0.11, + "grad_norm": 14.817676544189453, + "learning_rate": 1.9272326350606398e-05, + "loss": 4.2751, + "step": 364 + }, + { + "epoch": 0.11, + "grad_norm": 24.063844680786133, + "learning_rate": 1.9270321740002008e-05, + "loss": 3.8558, + "step": 365 + }, + { + "epoch": 0.11, + "grad_norm": 18.589746475219727, + "learning_rate": 1.9268317129397615e-05, + "loss": 2.9172, + "step": 366 + }, + { + "epoch": 0.11, + "grad_norm": 51.5893669128418, + "learning_rate": 1.9266312518793228e-05, + "loss": 3.2441, + "step": 367 + }, + { + "epoch": 0.11, + "grad_norm": 18.138118743896484, + "learning_rate": 1.9264307908188835e-05, + "loss": 2.9894, + "step": 368 + }, + { + "epoch": 0.11, + "grad_norm": 14.857394218444824, + "learning_rate": 1.9262303297584445e-05, + "loss": 3.1436, + "step": 369 + }, + { + "epoch": 0.11, + "grad_norm": 17.468170166015625, + "learning_rate": 1.926029868698006e-05, + "loss": 3.2683, + "step": 370 + }, + { + "epoch": 0.11, + "grad_norm": 17.41706657409668, + "learning_rate": 1.9258294076375665e-05, + "loss": 3.6597, + "step": 371 + }, + { + "epoch": 0.11, + "grad_norm": 15.607809066772461, + "learning_rate": 1.9256289465771275e-05, + "loss": 3.5317, + "step": 372 + }, + { + "epoch": 0.11, + "grad_norm": 25.48333168029785, + "learning_rate": 1.9254284855166885e-05, + "loss": 4.017, + "step": 373 + }, + { + "epoch": 0.11, + "grad_norm": 26.36655044555664, + "learning_rate": 1.9252280244562495e-05, + "loss": 3.1871, + "step": 374 + }, + { + "epoch": 0.11, + "grad_norm": 29.703283309936523, + "learning_rate": 1.9250275633958105e-05, + "loss": 3.56, + "step": 375 + }, + { + "epoch": 0.11, + "grad_norm": 12.352496147155762, + "learning_rate": 1.9248271023353715e-05, + "loss": 3.2961, + "step": 376 + }, + { + "epoch": 0.11, + "grad_norm": 24.971912384033203, + "learning_rate": 1.9246266412749326e-05, + "loss": 3.8872, + "step": 377 + }, + { + "epoch": 0.11, + "grad_norm": 20.017274856567383, + "learning_rate": 1.9244261802144936e-05, + "loss": 4.3486, + "step": 378 + }, + { + "epoch": 0.11, + "grad_norm": 16.611631393432617, + "learning_rate": 1.9242257191540546e-05, + "loss": 4.1083, + "step": 379 + }, + { + "epoch": 0.11, + "grad_norm": 94.53077697753906, + "learning_rate": 1.9240252580936152e-05, + "loss": 3.7256, + "step": 380 + }, + { + "epoch": 0.11, + "grad_norm": 14.142595291137695, + "learning_rate": 1.9238247970331766e-05, + "loss": 3.9881, + "step": 381 + }, + { + "epoch": 0.11, + "grad_norm": 15.951706886291504, + "learning_rate": 1.9236243359727373e-05, + "loss": 3.6539, + "step": 382 + }, + { + "epoch": 0.12, + "grad_norm": 16.317209243774414, + "learning_rate": 1.9234238749122983e-05, + "loss": 4.028, + "step": 383 + }, + { + "epoch": 0.12, + "grad_norm": 15.425968170166016, + "learning_rate": 1.9232234138518596e-05, + "loss": 2.9222, + "step": 384 + }, + { + "epoch": 0.12, + "grad_norm": 15.79262924194336, + "learning_rate": 1.9230229527914203e-05, + "loss": 3.1179, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 19.95915985107422, + "learning_rate": 1.9228224917309813e-05, + "loss": 3.9919, + "step": 386 + }, + { + "epoch": 0.12, + "grad_norm": 14.439689636230469, + "learning_rate": 1.9226220306705423e-05, + "loss": 3.257, + "step": 387 + }, + { + "epoch": 0.12, + "grad_norm": 16.699209213256836, + "learning_rate": 1.9224215696101033e-05, + "loss": 3.8755, + "step": 388 + }, + { + "epoch": 0.12, + "grad_norm": 24.074472427368164, + "learning_rate": 1.9222211085496643e-05, + "loss": 4.0977, + "step": 389 + }, + { + "epoch": 0.12, + "grad_norm": 15.57771110534668, + "learning_rate": 1.9220206474892253e-05, + "loss": 4.4361, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 13.52245807647705, + "learning_rate": 1.9218201864287863e-05, + "loss": 3.7686, + "step": 391 + }, + { + "epoch": 0.12, + "grad_norm": 19.746614456176758, + "learning_rate": 1.9216197253683473e-05, + "loss": 3.819, + "step": 392 + }, + { + "epoch": 0.12, + "grad_norm": 14.887005805969238, + "learning_rate": 1.9214192643079084e-05, + "loss": 3.8956, + "step": 393 + }, + { + "epoch": 0.12, + "grad_norm": 18.658023834228516, + "learning_rate": 1.9212188032474694e-05, + "loss": 3.9219, + "step": 394 + }, + { + "epoch": 0.12, + "grad_norm": 14.493846893310547, + "learning_rate": 1.9210183421870304e-05, + "loss": 4.26, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 16.01069450378418, + "learning_rate": 1.9208178811265914e-05, + "loss": 2.9605, + "step": 396 + }, + { + "epoch": 0.12, + "grad_norm": 24.826866149902344, + "learning_rate": 1.9206174200661524e-05, + "loss": 4.3274, + "step": 397 + }, + { + "epoch": 0.12, + "grad_norm": 10.156407356262207, + "learning_rate": 1.9204169590057134e-05, + "loss": 3.2431, + "step": 398 + }, + { + "epoch": 0.12, + "grad_norm": 21.510562896728516, + "learning_rate": 1.920216497945274e-05, + "loss": 4.4685, + "step": 399 + }, + { + "epoch": 0.12, + "grad_norm": 25.705717086791992, + "learning_rate": 1.9200160368848354e-05, + "loss": 4.3723, + "step": 400 + }, + { + "epoch": 0.12, + "grad_norm": 15.125773429870605, + "learning_rate": 1.9198155758243964e-05, + "loss": 3.9991, + "step": 401 + }, + { + "epoch": 0.12, + "grad_norm": 11.93122386932373, + "learning_rate": 1.919615114763957e-05, + "loss": 3.5661, + "step": 402 + }, + { + "epoch": 0.12, + "grad_norm": 9.818219184875488, + "learning_rate": 1.9194146537035184e-05, + "loss": 3.4324, + "step": 403 + }, + { + "epoch": 0.12, + "grad_norm": 23.427623748779297, + "learning_rate": 1.919214192643079e-05, + "loss": 4.2537, + "step": 404 + }, + { + "epoch": 0.12, + "grad_norm": 18.164058685302734, + "learning_rate": 1.91901373158264e-05, + "loss": 3.2003, + "step": 405 + }, + { + "epoch": 0.12, + "grad_norm": 20.42031478881836, + "learning_rate": 1.918813270522201e-05, + "loss": 2.4072, + "step": 406 + }, + { + "epoch": 0.12, + "grad_norm": 21.685937881469727, + "learning_rate": 1.918612809461762e-05, + "loss": 3.5641, + "step": 407 + }, + { + "epoch": 0.12, + "grad_norm": 19.072772979736328, + "learning_rate": 1.918412348401323e-05, + "loss": 3.8314, + "step": 408 + }, + { + "epoch": 0.12, + "grad_norm": 11.509551048278809, + "learning_rate": 1.918211887340884e-05, + "loss": 2.837, + "step": 409 + }, + { + "epoch": 0.12, + "grad_norm": 14.759239196777344, + "learning_rate": 1.918011426280445e-05, + "loss": 3.501, + "step": 410 + }, + { + "epoch": 0.12, + "grad_norm": 25.07742691040039, + "learning_rate": 1.917810965220006e-05, + "loss": 3.0863, + "step": 411 + }, + { + "epoch": 0.12, + "grad_norm": 17.365577697753906, + "learning_rate": 1.9176105041595672e-05, + "loss": 3.6191, + "step": 412 + }, + { + "epoch": 0.12, + "grad_norm": 15.148959159851074, + "learning_rate": 1.9174100430991282e-05, + "loss": 3.6065, + "step": 413 + }, + { + "epoch": 0.12, + "grad_norm": 22.2659854888916, + "learning_rate": 1.9172095820386892e-05, + "loss": 3.7687, + "step": 414 + }, + { + "epoch": 0.12, + "grad_norm": 10.9700927734375, + "learning_rate": 1.9170091209782502e-05, + "loss": 3.6995, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 16.873733520507812, + "learning_rate": 1.9168086599178112e-05, + "loss": 2.3494, + "step": 416 + }, + { + "epoch": 0.13, + "grad_norm": 22.604717254638672, + "learning_rate": 1.9166081988573722e-05, + "loss": 3.417, + "step": 417 + }, + { + "epoch": 0.13, + "grad_norm": 16.342870712280273, + "learning_rate": 1.916407737796933e-05, + "loss": 3.7907, + "step": 418 + }, + { + "epoch": 0.13, + "grad_norm": 43.08720016479492, + "learning_rate": 1.9162072767364942e-05, + "loss": 2.4359, + "step": 419 + }, + { + "epoch": 0.13, + "grad_norm": 18.63433074951172, + "learning_rate": 1.9160068156760552e-05, + "loss": 4.5936, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 20.13007164001465, + "learning_rate": 1.915806354615616e-05, + "loss": 4.5972, + "step": 421 + }, + { + "epoch": 0.13, + "grad_norm": 17.850637435913086, + "learning_rate": 1.9156058935551773e-05, + "loss": 3.1744, + "step": 422 + }, + { + "epoch": 0.13, + "grad_norm": 15.922115325927734, + "learning_rate": 1.915405432494738e-05, + "loss": 3.862, + "step": 423 + }, + { + "epoch": 0.13, + "grad_norm": 20.92375946044922, + "learning_rate": 1.915204971434299e-05, + "loss": 2.907, + "step": 424 + }, + { + "epoch": 0.13, + "grad_norm": 18.767166137695312, + "learning_rate": 1.91500451037386e-05, + "loss": 4.3281, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 16.2836856842041, + "learning_rate": 1.914804049313421e-05, + "loss": 3.5196, + "step": 426 + }, + { + "epoch": 0.13, + "grad_norm": 13.612652778625488, + "learning_rate": 1.914603588252982e-05, + "loss": 4.0671, + "step": 427 + }, + { + "epoch": 0.13, + "grad_norm": 20.497167587280273, + "learning_rate": 1.914403127192543e-05, + "loss": 4.5793, + "step": 428 + }, + { + "epoch": 0.13, + "grad_norm": 16.230653762817383, + "learning_rate": 1.914202666132104e-05, + "loss": 4.3463, + "step": 429 + }, + { + "epoch": 0.13, + "grad_norm": 17.90238380432129, + "learning_rate": 1.914002205071665e-05, + "loss": 3.6869, + "step": 430 + }, + { + "epoch": 0.13, + "grad_norm": 23.716754913330078, + "learning_rate": 1.913801744011226e-05, + "loss": 3.8698, + "step": 431 + }, + { + "epoch": 0.13, + "grad_norm": 17.085433959960938, + "learning_rate": 1.913601282950787e-05, + "loss": 2.4927, + "step": 432 + }, + { + "epoch": 0.13, + "grad_norm": 19.192747116088867, + "learning_rate": 1.913400821890348e-05, + "loss": 3.9132, + "step": 433 + }, + { + "epoch": 0.13, + "grad_norm": 13.243489265441895, + "learning_rate": 1.913200360829909e-05, + "loss": 2.9527, + "step": 434 + }, + { + "epoch": 0.13, + "grad_norm": 15.914332389831543, + "learning_rate": 1.91299989976947e-05, + "loss": 3.9814, + "step": 435 + }, + { + "epoch": 0.13, + "grad_norm": 16.82046127319336, + "learning_rate": 1.912799438709031e-05, + "loss": 3.2976, + "step": 436 + }, + { + "epoch": 0.13, + "grad_norm": 30.610519409179688, + "learning_rate": 1.9125989776485917e-05, + "loss": 2.8746, + "step": 437 + }, + { + "epoch": 0.13, + "grad_norm": 14.973118782043457, + "learning_rate": 1.912398516588153e-05, + "loss": 3.4231, + "step": 438 + }, + { + "epoch": 0.13, + "grad_norm": 17.690595626831055, + "learning_rate": 1.912198055527714e-05, + "loss": 2.8452, + "step": 439 + }, + { + "epoch": 0.13, + "grad_norm": 13.494382858276367, + "learning_rate": 1.9119975944672747e-05, + "loss": 3.4656, + "step": 440 + }, + { + "epoch": 0.13, + "grad_norm": 15.88420581817627, + "learning_rate": 1.911797133406836e-05, + "loss": 3.4494, + "step": 441 + }, + { + "epoch": 0.13, + "grad_norm": 15.013689041137695, + "learning_rate": 1.9115966723463967e-05, + "loss": 3.7962, + "step": 442 + }, + { + "epoch": 0.13, + "grad_norm": 19.49163246154785, + "learning_rate": 1.9113962112859578e-05, + "loss": 3.5861, + "step": 443 + }, + { + "epoch": 0.13, + "grad_norm": 14.526626586914062, + "learning_rate": 1.911195750225519e-05, + "loss": 4.3179, + "step": 444 + }, + { + "epoch": 0.13, + "grad_norm": 14.432934761047363, + "learning_rate": 1.9109952891650798e-05, + "loss": 3.6713, + "step": 445 + }, + { + "epoch": 0.13, + "grad_norm": 17.720027923583984, + "learning_rate": 1.9107948281046408e-05, + "loss": 2.9935, + "step": 446 + }, + { + "epoch": 0.13, + "grad_norm": 30.397794723510742, + "learning_rate": 1.9105943670442018e-05, + "loss": 3.4623, + "step": 447 + }, + { + "epoch": 0.13, + "grad_norm": 14.584136009216309, + "learning_rate": 1.9103939059837628e-05, + "loss": 3.4608, + "step": 448 + }, + { + "epoch": 0.13, + "grad_norm": 10.301027297973633, + "learning_rate": 1.9101934449233238e-05, + "loss": 3.1884, + "step": 449 + }, + { + "epoch": 0.14, + "grad_norm": 17.015613555908203, + "learning_rate": 1.9099929838628848e-05, + "loss": 3.9679, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 13.941701889038086, + "learning_rate": 1.9097925228024458e-05, + "loss": 3.3073, + "step": 451 + }, + { + "epoch": 0.14, + "grad_norm": 16.194982528686523, + "learning_rate": 1.909592061742007e-05, + "loss": 3.6541, + "step": 452 + }, + { + "epoch": 0.14, + "grad_norm": 19.734464645385742, + "learning_rate": 1.909391600681568e-05, + "loss": 4.4298, + "step": 453 + }, + { + "epoch": 0.14, + "grad_norm": 12.222415924072266, + "learning_rate": 1.9091911396211285e-05, + "loss": 2.5833, + "step": 454 + }, + { + "epoch": 0.14, + "grad_norm": 21.09163475036621, + "learning_rate": 1.90899067856069e-05, + "loss": 3.8562, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 14.021400451660156, + "learning_rate": 1.908790217500251e-05, + "loss": 3.9132, + "step": 456 + }, + { + "epoch": 0.14, + "grad_norm": 19.492759704589844, + "learning_rate": 1.9085897564398115e-05, + "loss": 4.7376, + "step": 457 + }, + { + "epoch": 0.14, + "grad_norm": 13.180451393127441, + "learning_rate": 1.908389295379373e-05, + "loss": 2.8013, + "step": 458 + }, + { + "epoch": 0.14, + "grad_norm": 19.046419143676758, + "learning_rate": 1.9081888343189336e-05, + "loss": 3.2353, + "step": 459 + }, + { + "epoch": 0.14, + "grad_norm": 24.7363338470459, + "learning_rate": 1.9079883732584946e-05, + "loss": 4.2447, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 12.855481147766113, + "learning_rate": 1.9077879121980556e-05, + "loss": 3.3459, + "step": 461 + }, + { + "epoch": 0.14, + "grad_norm": 13.02658462524414, + "learning_rate": 1.9075874511376166e-05, + "loss": 4.0793, + "step": 462 + }, + { + "epoch": 0.14, + "grad_norm": 17.221975326538086, + "learning_rate": 1.907386990077178e-05, + "loss": 3.3399, + "step": 463 + }, + { + "epoch": 0.14, + "grad_norm": 22.799137115478516, + "learning_rate": 1.9071865290167386e-05, + "loss": 3.7892, + "step": 464 + }, + { + "epoch": 0.14, + "grad_norm": 21.327260971069336, + "learning_rate": 1.9069860679562996e-05, + "loss": 2.5584, + "step": 465 + }, + { + "epoch": 0.14, + "grad_norm": 26.7298583984375, + "learning_rate": 1.9067856068958606e-05, + "loss": 4.5134, + "step": 466 + }, + { + "epoch": 0.14, + "grad_norm": 25.994600296020508, + "learning_rate": 1.9065851458354216e-05, + "loss": 2.8459, + "step": 467 + }, + { + "epoch": 0.14, + "grad_norm": 22.67238998413086, + "learning_rate": 1.9063846847749826e-05, + "loss": 4.0606, + "step": 468 + }, + { + "epoch": 0.14, + "grad_norm": 16.200130462646484, + "learning_rate": 1.9061842237145436e-05, + "loss": 4.0715, + "step": 469 + }, + { + "epoch": 0.14, + "grad_norm": 12.668680191040039, + "learning_rate": 1.9059837626541046e-05, + "loss": 2.9427, + "step": 470 + }, + { + "epoch": 0.14, + "grad_norm": 24.67599868774414, + "learning_rate": 1.9057833015936657e-05, + "loss": 3.3925, + "step": 471 + }, + { + "epoch": 0.14, + "grad_norm": 26.510074615478516, + "learning_rate": 1.9055828405332267e-05, + "loss": 2.774, + "step": 472 + }, + { + "epoch": 0.14, + "grad_norm": 19.766721725463867, + "learning_rate": 1.9053823794727873e-05, + "loss": 3.1132, + "step": 473 + }, + { + "epoch": 0.14, + "grad_norm": 14.419114112854004, + "learning_rate": 1.9051819184123487e-05, + "loss": 2.8139, + "step": 474 + }, + { + "epoch": 0.14, + "grad_norm": 11.945268630981445, + "learning_rate": 1.9049814573519097e-05, + "loss": 3.4746, + "step": 475 + }, + { + "epoch": 0.14, + "grad_norm": 19.036060333251953, + "learning_rate": 1.9047809962914704e-05, + "loss": 4.0888, + "step": 476 + }, + { + "epoch": 0.14, + "grad_norm": 18.684444427490234, + "learning_rate": 1.9045805352310317e-05, + "loss": 3.8154, + "step": 477 + }, + { + "epoch": 0.14, + "grad_norm": 10.9110689163208, + "learning_rate": 1.9043800741705924e-05, + "loss": 3.7957, + "step": 478 + }, + { + "epoch": 0.14, + "grad_norm": 22.50055503845215, + "learning_rate": 1.9041796131101534e-05, + "loss": 4.0542, + "step": 479 + }, + { + "epoch": 0.14, + "grad_norm": 14.225284576416016, + "learning_rate": 1.9039791520497144e-05, + "loss": 2.9864, + "step": 480 + }, + { + "epoch": 0.14, + "eval_loss": 0.9622519612312317, + "eval_runtime": 43.4567, + "eval_samples_per_second": 34.034, + "eval_steps_per_second": 34.034, + "step": 480 + }, + { + "epoch": 0.14, + "grad_norm": 14.51484489440918, + "learning_rate": 1.9037786909892754e-05, + "loss": 3.9772, + "step": 481 + }, + { + "epoch": 0.14, + "grad_norm": 51.21427536010742, + "learning_rate": 1.9035782299288364e-05, + "loss": 3.8034, + "step": 482 + }, + { + "epoch": 0.15, + "grad_norm": 26.110877990722656, + "learning_rate": 1.9033777688683974e-05, + "loss": 3.6146, + "step": 483 + }, + { + "epoch": 0.15, + "grad_norm": 16.056385040283203, + "learning_rate": 1.9031773078079584e-05, + "loss": 3.8006, + "step": 484 + }, + { + "epoch": 0.15, + "grad_norm": 15.42993450164795, + "learning_rate": 1.9029768467475194e-05, + "loss": 3.0251, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 12.17597484588623, + "learning_rate": 1.9027763856870804e-05, + "loss": 3.4756, + "step": 486 + }, + { + "epoch": 0.15, + "grad_norm": 11.121559143066406, + "learning_rate": 1.9025759246266415e-05, + "loss": 2.7291, + "step": 487 + }, + { + "epoch": 0.15, + "grad_norm": 17.4909725189209, + "learning_rate": 1.9023754635662025e-05, + "loss": 3.6708, + "step": 488 + }, + { + "epoch": 0.15, + "grad_norm": 23.629941940307617, + "learning_rate": 1.9021750025057635e-05, + "loss": 3.0275, + "step": 489 + }, + { + "epoch": 0.15, + "grad_norm": 12.820405960083008, + "learning_rate": 1.9019745414453245e-05, + "loss": 3.0267, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 22.951032638549805, + "learning_rate": 1.9017740803848855e-05, + "loss": 2.393, + "step": 491 + }, + { + "epoch": 0.15, + "grad_norm": 11.71376895904541, + "learning_rate": 1.901573619324446e-05, + "loss": 2.9705, + "step": 492 + }, + { + "epoch": 0.15, + "grad_norm": 14.28131103515625, + "learning_rate": 1.9013731582640075e-05, + "loss": 2.727, + "step": 493 + }, + { + "epoch": 0.15, + "grad_norm": 16.59454345703125, + "learning_rate": 1.9011726972035685e-05, + "loss": 3.5118, + "step": 494 + }, + { + "epoch": 0.15, + "grad_norm": 20.26374053955078, + "learning_rate": 1.9009722361431292e-05, + "loss": 4.6275, + "step": 495 + }, + { + "epoch": 0.15, + "grad_norm": 13.29510498046875, + "learning_rate": 1.9007717750826905e-05, + "loss": 3.8611, + "step": 496 + }, + { + "epoch": 0.15, + "grad_norm": 14.79573917388916, + "learning_rate": 1.9005713140222512e-05, + "loss": 4.632, + "step": 497 + }, + { + "epoch": 0.15, + "grad_norm": 19.15884017944336, + "learning_rate": 1.9003708529618122e-05, + "loss": 3.5274, + "step": 498 + }, + { + "epoch": 0.15, + "grad_norm": 11.399006843566895, + "learning_rate": 1.9001703919013736e-05, + "loss": 4.0488, + "step": 499 + }, + { + "epoch": 0.15, + "grad_norm": 12.227916717529297, + "learning_rate": 1.8999699308409342e-05, + "loss": 3.8005, + "step": 500 + }, + { + "epoch": 0.15, + "grad_norm": 15.450165748596191, + "learning_rate": 1.8997694697804952e-05, + "loss": 2.9546, + "step": 501 + }, + { + "epoch": 0.15, + "grad_norm": 19.479894638061523, + "learning_rate": 1.8995690087200562e-05, + "loss": 3.056, + "step": 502 + }, + { + "epoch": 0.15, + "grad_norm": 18.609848022460938, + "learning_rate": 1.8993685476596172e-05, + "loss": 3.8573, + "step": 503 + }, + { + "epoch": 0.15, + "grad_norm": 19.30628204345703, + "learning_rate": 1.8991680865991783e-05, + "loss": 2.9196, + "step": 504 + }, + { + "epoch": 0.15, + "grad_norm": 15.707794189453125, + "learning_rate": 1.8989676255387393e-05, + "loss": 3.6089, + "step": 505 + }, + { + "epoch": 0.15, + "grad_norm": 18.78173065185547, + "learning_rate": 1.8987671644783003e-05, + "loss": 2.8028, + "step": 506 + }, + { + "epoch": 0.15, + "grad_norm": 17.811344146728516, + "learning_rate": 1.8985667034178613e-05, + "loss": 3.645, + "step": 507 + }, + { + "epoch": 0.15, + "grad_norm": 15.66889476776123, + "learning_rate": 1.8983662423574223e-05, + "loss": 3.5172, + "step": 508 + }, + { + "epoch": 0.15, + "grad_norm": 13.496955871582031, + "learning_rate": 1.8981657812969833e-05, + "loss": 3.2022, + "step": 509 + }, + { + "epoch": 0.15, + "grad_norm": 14.416739463806152, + "learning_rate": 1.8979653202365443e-05, + "loss": 3.7212, + "step": 510 + }, + { + "epoch": 0.15, + "grad_norm": 13.47453784942627, + "learning_rate": 1.897764859176105e-05, + "loss": 4.1187, + "step": 511 + }, + { + "epoch": 0.15, + "grad_norm": 18.787689208984375, + "learning_rate": 1.8975643981156663e-05, + "loss": 3.6262, + "step": 512 + }, + { + "epoch": 0.15, + "grad_norm": 10.850929260253906, + "learning_rate": 1.8973639370552273e-05, + "loss": 2.937, + "step": 513 + }, + { + "epoch": 0.15, + "grad_norm": 14.07578182220459, + "learning_rate": 1.897163475994788e-05, + "loss": 4.5924, + "step": 514 + }, + { + "epoch": 0.15, + "grad_norm": 13.420841217041016, + "learning_rate": 1.8969630149343493e-05, + "loss": 2.5988, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 16.54571533203125, + "learning_rate": 1.89676255387391e-05, + "loss": 3.835, + "step": 516 + }, + { + "epoch": 0.16, + "grad_norm": 68.53701782226562, + "learning_rate": 1.896562092813471e-05, + "loss": 5.1961, + "step": 517 + }, + { + "epoch": 0.16, + "grad_norm": 9.695450782775879, + "learning_rate": 1.8963616317530324e-05, + "loss": 3.4607, + "step": 518 + }, + { + "epoch": 0.16, + "grad_norm": 15.207024574279785, + "learning_rate": 1.896161170692593e-05, + "loss": 3.3355, + "step": 519 + }, + { + "epoch": 0.16, + "grad_norm": 14.207443237304688, + "learning_rate": 1.895960709632154e-05, + "loss": 3.5345, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 18.703203201293945, + "learning_rate": 1.895760248571715e-05, + "loss": 4.1013, + "step": 521 + }, + { + "epoch": 0.16, + "grad_norm": 22.39604949951172, + "learning_rate": 1.895559787511276e-05, + "loss": 4.8708, + "step": 522 + }, + { + "epoch": 0.16, + "grad_norm": 19.35679817199707, + "learning_rate": 1.895359326450837e-05, + "loss": 2.9551, + "step": 523 + }, + { + "epoch": 0.16, + "grad_norm": 13.426315307617188, + "learning_rate": 1.895158865390398e-05, + "loss": 2.9543, + "step": 524 + }, + { + "epoch": 0.16, + "grad_norm": 18.244346618652344, + "learning_rate": 1.894958404329959e-05, + "loss": 3.8422, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 16.004077911376953, + "learning_rate": 1.89475794326952e-05, + "loss": 2.7931, + "step": 526 + }, + { + "epoch": 0.16, + "grad_norm": 23.495067596435547, + "learning_rate": 1.894557482209081e-05, + "loss": 4.6575, + "step": 527 + }, + { + "epoch": 0.16, + "grad_norm": 16.585546493530273, + "learning_rate": 1.894357021148642e-05, + "loss": 3.2343, + "step": 528 + }, + { + "epoch": 0.16, + "grad_norm": 25.565746307373047, + "learning_rate": 1.894156560088203e-05, + "loss": 3.6168, + "step": 529 + }, + { + "epoch": 0.16, + "grad_norm": 17.877038955688477, + "learning_rate": 1.893956099027764e-05, + "loss": 4.1159, + "step": 530 + }, + { + "epoch": 0.16, + "grad_norm": 25.356380462646484, + "learning_rate": 1.893755637967325e-05, + "loss": 4.1038, + "step": 531 + }, + { + "epoch": 0.16, + "grad_norm": 28.306737899780273, + "learning_rate": 1.893555176906886e-05, + "loss": 3.7835, + "step": 532 + }, + { + "epoch": 0.16, + "grad_norm": 16.962968826293945, + "learning_rate": 1.8933547158464468e-05, + "loss": 2.9008, + "step": 533 + }, + { + "epoch": 0.16, + "grad_norm": 20.51753044128418, + "learning_rate": 1.8931542547860082e-05, + "loss": 4.6783, + "step": 534 + }, + { + "epoch": 0.16, + "grad_norm": 17.836177825927734, + "learning_rate": 1.892953793725569e-05, + "loss": 2.8979, + "step": 535 + }, + { + "epoch": 0.16, + "grad_norm": 14.355405807495117, + "learning_rate": 1.89275333266513e-05, + "loss": 3.6143, + "step": 536 + }, + { + "epoch": 0.16, + "grad_norm": 17.681434631347656, + "learning_rate": 1.8925528716046912e-05, + "loss": 2.4729, + "step": 537 + }, + { + "epoch": 0.16, + "grad_norm": 16.02050018310547, + "learning_rate": 1.892352410544252e-05, + "loss": 2.3059, + "step": 538 + }, + { + "epoch": 0.16, + "grad_norm": 12.686012268066406, + "learning_rate": 1.892151949483813e-05, + "loss": 3.307, + "step": 539 + }, + { + "epoch": 0.16, + "grad_norm": 13.235198974609375, + "learning_rate": 1.891951488423374e-05, + "loss": 4.249, + "step": 540 + }, + { + "epoch": 0.16, + "grad_norm": 16.329849243164062, + "learning_rate": 1.891751027362935e-05, + "loss": 3.9762, + "step": 541 + }, + { + "epoch": 0.16, + "grad_norm": 18.02645492553711, + "learning_rate": 1.891550566302496e-05, + "loss": 3.3566, + "step": 542 + }, + { + "epoch": 0.16, + "grad_norm": 15.097955703735352, + "learning_rate": 1.891350105242057e-05, + "loss": 3.683, + "step": 543 + }, + { + "epoch": 0.16, + "grad_norm": 14.13657283782959, + "learning_rate": 1.891149644181618e-05, + "loss": 2.5315, + "step": 544 + }, + { + "epoch": 0.16, + "grad_norm": 13.064208984375, + "learning_rate": 1.890949183121179e-05, + "loss": 3.7249, + "step": 545 + }, + { + "epoch": 0.16, + "grad_norm": 12.626958847045898, + "learning_rate": 1.89074872206074e-05, + "loss": 3.4621, + "step": 546 + }, + { + "epoch": 0.16, + "grad_norm": 15.529770851135254, + "learning_rate": 1.8905482610003006e-05, + "loss": 3.6776, + "step": 547 + }, + { + "epoch": 0.16, + "grad_norm": 16.237417221069336, + "learning_rate": 1.890347799939862e-05, + "loss": 5.0018, + "step": 548 + }, + { + "epoch": 0.17, + "grad_norm": 24.94883155822754, + "learning_rate": 1.890147338879423e-05, + "loss": 3.5904, + "step": 549 + }, + { + "epoch": 0.17, + "grad_norm": 18.61686134338379, + "learning_rate": 1.8899468778189836e-05, + "loss": 3.47, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 23.455402374267578, + "learning_rate": 1.889746416758545e-05, + "loss": 3.8038, + "step": 551 + }, + { + "epoch": 0.17, + "grad_norm": 14.99221420288086, + "learning_rate": 1.8895459556981056e-05, + "loss": 3.4294, + "step": 552 + }, + { + "epoch": 0.17, + "grad_norm": 15.767837524414062, + "learning_rate": 1.8893454946376667e-05, + "loss": 3.2229, + "step": 553 + }, + { + "epoch": 0.17, + "grad_norm": 12.863175392150879, + "learning_rate": 1.8891450335772277e-05, + "loss": 3.6184, + "step": 554 + }, + { + "epoch": 0.17, + "grad_norm": 14.412956237792969, + "learning_rate": 1.8889445725167887e-05, + "loss": 4.1913, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 24.851470947265625, + "learning_rate": 1.8887441114563497e-05, + "loss": 3.9418, + "step": 556 + }, + { + "epoch": 0.17, + "grad_norm": 47.26935958862305, + "learning_rate": 1.8885436503959107e-05, + "loss": 5.1826, + "step": 557 + }, + { + "epoch": 0.17, + "grad_norm": 15.844667434692383, + "learning_rate": 1.8883431893354717e-05, + "loss": 3.7824, + "step": 558 + }, + { + "epoch": 0.17, + "grad_norm": 11.842270851135254, + "learning_rate": 1.8881427282750327e-05, + "loss": 2.9349, + "step": 559 + }, + { + "epoch": 0.17, + "grad_norm": 21.0179386138916, + "learning_rate": 1.8879422672145937e-05, + "loss": 3.2554, + "step": 560 + }, + { + "epoch": 0.17, + "grad_norm": 19.371559143066406, + "learning_rate": 1.8877418061541547e-05, + "loss": 3.6965, + "step": 561 + }, + { + "epoch": 0.17, + "grad_norm": 17.8704833984375, + "learning_rate": 1.8875413450937157e-05, + "loss": 3.6738, + "step": 562 + }, + { + "epoch": 0.17, + "grad_norm": 11.61600112915039, + "learning_rate": 1.8873408840332767e-05, + "loss": 2.5997, + "step": 563 + }, + { + "epoch": 0.17, + "grad_norm": 17.82415199279785, + "learning_rate": 1.8871404229728377e-05, + "loss": 3.402, + "step": 564 + }, + { + "epoch": 0.17, + "grad_norm": 13.841615676879883, + "learning_rate": 1.8869399619123988e-05, + "loss": 2.5574, + "step": 565 + }, + { + "epoch": 0.17, + "grad_norm": 17.705219268798828, + "learning_rate": 1.8867395008519594e-05, + "loss": 3.4216, + "step": 566 + }, + { + "epoch": 0.17, + "grad_norm": 15.329522132873535, + "learning_rate": 1.8865390397915208e-05, + "loss": 2.5929, + "step": 567 + }, + { + "epoch": 0.17, + "grad_norm": 14.040759086608887, + "learning_rate": 1.8863385787310818e-05, + "loss": 3.7188, + "step": 568 + }, + { + "epoch": 0.17, + "grad_norm": 20.66543960571289, + "learning_rate": 1.8861381176706424e-05, + "loss": 3.3664, + "step": 569 + }, + { + "epoch": 0.17, + "grad_norm": 10.702879905700684, + "learning_rate": 1.8859376566102038e-05, + "loss": 3.0084, + "step": 570 + }, + { + "epoch": 0.17, + "grad_norm": 15.897429466247559, + "learning_rate": 1.8857371955497645e-05, + "loss": 3.7669, + "step": 571 + }, + { + "epoch": 0.17, + "grad_norm": 11.95020866394043, + "learning_rate": 1.8855367344893255e-05, + "loss": 3.9482, + "step": 572 + }, + { + "epoch": 0.17, + "grad_norm": 27.125240325927734, + "learning_rate": 1.8853362734288868e-05, + "loss": 2.7438, + "step": 573 + }, + { + "epoch": 0.17, + "grad_norm": 13.781254768371582, + "learning_rate": 1.8851358123684475e-05, + "loss": 3.3872, + "step": 574 + }, + { + "epoch": 0.17, + "grad_norm": 29.12127113342285, + "learning_rate": 1.8849353513080085e-05, + "loss": 4.5567, + "step": 575 + }, + { + "epoch": 0.17, + "grad_norm": 15.671697616577148, + "learning_rate": 1.8847348902475695e-05, + "loss": 2.5302, + "step": 576 + }, + { + "epoch": 0.17, + "grad_norm": 17.996654510498047, + "learning_rate": 1.8845344291871305e-05, + "loss": 3.1327, + "step": 577 + }, + { + "epoch": 0.17, + "grad_norm": 16.846689224243164, + "learning_rate": 1.8843339681266915e-05, + "loss": 2.5453, + "step": 578 + }, + { + "epoch": 0.17, + "grad_norm": 21.583017349243164, + "learning_rate": 1.8841335070662525e-05, + "loss": 3.5481, + "step": 579 + }, + { + "epoch": 0.17, + "grad_norm": 14.08506965637207, + "learning_rate": 1.8839330460058135e-05, + "loss": 3.8354, + "step": 580 + }, + { + "epoch": 0.17, + "grad_norm": 12.634489059448242, + "learning_rate": 1.8837325849453745e-05, + "loss": 3.1067, + "step": 581 + }, + { + "epoch": 0.17, + "grad_norm": 11.801117897033691, + "learning_rate": 1.8835321238849356e-05, + "loss": 3.4585, + "step": 582 + }, + { + "epoch": 0.18, + "grad_norm": 15.204583168029785, + "learning_rate": 1.8833316628244966e-05, + "loss": 3.3097, + "step": 583 + }, + { + "epoch": 0.18, + "grad_norm": 15.794861793518066, + "learning_rate": 1.8831312017640576e-05, + "loss": 3.0038, + "step": 584 + }, + { + "epoch": 0.18, + "grad_norm": 13.39342212677002, + "learning_rate": 1.8829307407036186e-05, + "loss": 3.4626, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 14.78641414642334, + "learning_rate": 1.8827302796431796e-05, + "loss": 3.4037, + "step": 586 + }, + { + "epoch": 0.18, + "grad_norm": 16.814594268798828, + "learning_rate": 1.8825298185827406e-05, + "loss": 3.7882, + "step": 587 + }, + { + "epoch": 0.18, + "grad_norm": 27.2209415435791, + "learning_rate": 1.8823293575223013e-05, + "loss": 3.2455, + "step": 588 + }, + { + "epoch": 0.18, + "grad_norm": 16.686363220214844, + "learning_rate": 1.8821288964618626e-05, + "loss": 3.932, + "step": 589 + }, + { + "epoch": 0.18, + "grad_norm": 17.841867446899414, + "learning_rate": 1.8819284354014233e-05, + "loss": 2.3729, + "step": 590 + }, + { + "epoch": 0.18, + "grad_norm": 13.670007705688477, + "learning_rate": 1.8817279743409843e-05, + "loss": 3.9139, + "step": 591 + }, + { + "epoch": 0.18, + "grad_norm": 12.825089454650879, + "learning_rate": 1.8815275132805456e-05, + "loss": 3.4298, + "step": 592 + }, + { + "epoch": 0.18, + "grad_norm": 21.140291213989258, + "learning_rate": 1.8813270522201063e-05, + "loss": 3.6672, + "step": 593 + }, + { + "epoch": 0.18, + "grad_norm": 19.407217025756836, + "learning_rate": 1.8811265911596673e-05, + "loss": 3.853, + "step": 594 + }, + { + "epoch": 0.18, + "grad_norm": 17.949951171875, + "learning_rate": 1.8809261300992283e-05, + "loss": 4.0853, + "step": 595 + }, + { + "epoch": 0.18, + "grad_norm": 14.42286205291748, + "learning_rate": 1.8807256690387893e-05, + "loss": 3.8624, + "step": 596 + }, + { + "epoch": 0.18, + "grad_norm": 13.524324417114258, + "learning_rate": 1.8805252079783503e-05, + "loss": 3.4537, + "step": 597 + }, + { + "epoch": 0.18, + "grad_norm": 16.790464401245117, + "learning_rate": 1.8803247469179114e-05, + "loss": 3.3715, + "step": 598 + }, + { + "epoch": 0.18, + "grad_norm": 28.930021286010742, + "learning_rate": 1.8801242858574724e-05, + "loss": 4.5601, + "step": 599 + }, + { + "epoch": 0.18, + "grad_norm": 17.764270782470703, + "learning_rate": 1.8799238247970334e-05, + "loss": 3.0025, + "step": 600 + }, + { + "epoch": 0.18, + "eval_loss": 0.8108684420585632, + "eval_runtime": 43.7575, + "eval_samples_per_second": 33.8, + "eval_steps_per_second": 33.8, + "step": 600 + }, + { + "epoch": 0.18, + "grad_norm": 12.37165641784668, + "learning_rate": 1.8797233637365944e-05, + "loss": 3.7558, + "step": 601 + }, + { + "epoch": 0.18, + "grad_norm": 31.21900177001953, + "learning_rate": 1.8795229026761554e-05, + "loss": 3.6418, + "step": 602 + }, + { + "epoch": 0.18, + "grad_norm": 12.556445121765137, + "learning_rate": 1.8793224416157164e-05, + "loss": 2.7222, + "step": 603 + }, + { + "epoch": 0.18, + "grad_norm": 12.976155281066895, + "learning_rate": 1.8791219805552774e-05, + "loss": 3.1556, + "step": 604 + }, + { + "epoch": 0.18, + "grad_norm": 14.281742095947266, + "learning_rate": 1.8789215194948384e-05, + "loss": 3.5987, + "step": 605 + }, + { + "epoch": 0.18, + "grad_norm": 17.95466423034668, + "learning_rate": 1.8787210584343994e-05, + "loss": 2.8724, + "step": 606 + }, + { + "epoch": 0.18, + "grad_norm": 11.985798835754395, + "learning_rate": 1.87852059737396e-05, + "loss": 3.3135, + "step": 607 + }, + { + "epoch": 0.18, + "grad_norm": 11.839190483093262, + "learning_rate": 1.8783201363135214e-05, + "loss": 2.7356, + "step": 608 + }, + { + "epoch": 0.18, + "grad_norm": 19.593158721923828, + "learning_rate": 1.878119675253082e-05, + "loss": 2.48, + "step": 609 + }, + { + "epoch": 0.18, + "grad_norm": 11.183398246765137, + "learning_rate": 1.877919214192643e-05, + "loss": 2.3999, + "step": 610 + }, + { + "epoch": 0.18, + "grad_norm": 16.24125862121582, + "learning_rate": 1.8777187531322045e-05, + "loss": 3.8464, + "step": 611 + }, + { + "epoch": 0.18, + "grad_norm": 17.56856346130371, + "learning_rate": 1.877518292071765e-05, + "loss": 3.5414, + "step": 612 + }, + { + "epoch": 0.18, + "grad_norm": 15.031785011291504, + "learning_rate": 1.877317831011326e-05, + "loss": 3.4706, + "step": 613 + }, + { + "epoch": 0.18, + "grad_norm": 14.207649230957031, + "learning_rate": 1.877117369950887e-05, + "loss": 3.0532, + "step": 614 + }, + { + "epoch": 0.18, + "grad_norm": 15.561878204345703, + "learning_rate": 1.876916908890448e-05, + "loss": 3.5544, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 15.425860404968262, + "learning_rate": 1.876716447830009e-05, + "loss": 3.2101, + "step": 616 + }, + { + "epoch": 0.19, + "grad_norm": 17.44911766052246, + "learning_rate": 1.8765159867695702e-05, + "loss": 4.0486, + "step": 617 + }, + { + "epoch": 0.19, + "grad_norm": 16.89653205871582, + "learning_rate": 1.8763155257091312e-05, + "loss": 3.1268, + "step": 618 + }, + { + "epoch": 0.19, + "grad_norm": 17.181591033935547, + "learning_rate": 1.8761150646486922e-05, + "loss": 3.0687, + "step": 619 + }, + { + "epoch": 0.19, + "grad_norm": 14.057995796203613, + "learning_rate": 1.8759146035882532e-05, + "loss": 2.7292, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 62.06224822998047, + "learning_rate": 1.875714142527814e-05, + "loss": 2.8716, + "step": 621 + }, + { + "epoch": 0.19, + "grad_norm": 15.26970100402832, + "learning_rate": 1.8755136814673752e-05, + "loss": 3.4212, + "step": 622 + }, + { + "epoch": 0.19, + "grad_norm": 12.904154777526855, + "learning_rate": 1.8753132204069362e-05, + "loss": 2.5465, + "step": 623 + }, + { + "epoch": 0.19, + "grad_norm": 16.669986724853516, + "learning_rate": 1.875112759346497e-05, + "loss": 2.8482, + "step": 624 + }, + { + "epoch": 0.19, + "grad_norm": 10.211058616638184, + "learning_rate": 1.8749122982860582e-05, + "loss": 2.5476, + "step": 625 + }, + { + "epoch": 0.19, + "grad_norm": 21.396862030029297, + "learning_rate": 1.874711837225619e-05, + "loss": 3.7639, + "step": 626 + }, + { + "epoch": 0.19, + "grad_norm": 20.889707565307617, + "learning_rate": 1.87451137616518e-05, + "loss": 3.3019, + "step": 627 + }, + { + "epoch": 0.19, + "grad_norm": 25.096691131591797, + "learning_rate": 1.874310915104741e-05, + "loss": 4.246, + "step": 628 + }, + { + "epoch": 0.19, + "grad_norm": 13.878023147583008, + "learning_rate": 1.874110454044302e-05, + "loss": 2.9438, + "step": 629 + }, + { + "epoch": 0.19, + "grad_norm": 18.34180450439453, + "learning_rate": 1.873909992983863e-05, + "loss": 4.3573, + "step": 630 + }, + { + "epoch": 0.19, + "grad_norm": 18.738731384277344, + "learning_rate": 1.873709531923424e-05, + "loss": 2.7748, + "step": 631 + }, + { + "epoch": 0.19, + "grad_norm": 15.163655281066895, + "learning_rate": 1.873509070862985e-05, + "loss": 3.4938, + "step": 632 + }, + { + "epoch": 0.19, + "grad_norm": 16.00517463684082, + "learning_rate": 1.873308609802546e-05, + "loss": 4.1243, + "step": 633 + }, + { + "epoch": 0.19, + "grad_norm": 15.933029174804688, + "learning_rate": 1.873108148742107e-05, + "loss": 3.8793, + "step": 634 + }, + { + "epoch": 0.19, + "grad_norm": 17.073196411132812, + "learning_rate": 1.872907687681668e-05, + "loss": 3.3813, + "step": 635 + }, + { + "epoch": 0.19, + "grad_norm": 16.49427604675293, + "learning_rate": 1.872707226621229e-05, + "loss": 3.9439, + "step": 636 + }, + { + "epoch": 0.19, + "grad_norm": 14.988619804382324, + "learning_rate": 1.87250676556079e-05, + "loss": 3.4706, + "step": 637 + }, + { + "epoch": 0.19, + "grad_norm": 14.796496391296387, + "learning_rate": 1.872306304500351e-05, + "loss": 2.572, + "step": 638 + }, + { + "epoch": 0.19, + "grad_norm": 14.737870216369629, + "learning_rate": 1.872105843439912e-05, + "loss": 4.6644, + "step": 639 + }, + { + "epoch": 0.19, + "grad_norm": 14.593988418579102, + "learning_rate": 1.8719053823794727e-05, + "loss": 2.9476, + "step": 640 + }, + { + "epoch": 0.19, + "grad_norm": 24.125364303588867, + "learning_rate": 1.871704921319034e-05, + "loss": 3.0569, + "step": 641 + }, + { + "epoch": 0.19, + "grad_norm": 12.783120155334473, + "learning_rate": 1.871504460258595e-05, + "loss": 2.7435, + "step": 642 + }, + { + "epoch": 0.19, + "grad_norm": 20.349702835083008, + "learning_rate": 1.8713039991981557e-05, + "loss": 3.924, + "step": 643 + }, + { + "epoch": 0.19, + "grad_norm": 18.848377227783203, + "learning_rate": 1.871103538137717e-05, + "loss": 3.6072, + "step": 644 + }, + { + "epoch": 0.19, + "grad_norm": 16.555147171020508, + "learning_rate": 1.8709030770772777e-05, + "loss": 2.6862, + "step": 645 + }, + { + "epoch": 0.19, + "grad_norm": 93.45834350585938, + "learning_rate": 1.8707026160168387e-05, + "loss": 3.1187, + "step": 646 + }, + { + "epoch": 0.19, + "grad_norm": 12.666962623596191, + "learning_rate": 1.8705021549564e-05, + "loss": 3.3528, + "step": 647 + }, + { + "epoch": 0.19, + "grad_norm": 13.548149108886719, + "learning_rate": 1.8703016938959608e-05, + "loss": 2.8839, + "step": 648 + }, + { + "epoch": 0.2, + "grad_norm": 20.26407241821289, + "learning_rate": 1.8701012328355218e-05, + "loss": 3.2999, + "step": 649 + }, + { + "epoch": 0.2, + "grad_norm": 16.36279296875, + "learning_rate": 1.8699007717750828e-05, + "loss": 2.7201, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 15.859626770019531, + "learning_rate": 1.8697003107146438e-05, + "loss": 3.3978, + "step": 651 + }, + { + "epoch": 0.2, + "grad_norm": 20.308744430541992, + "learning_rate": 1.8694998496542048e-05, + "loss": 3.7883, + "step": 652 + }, + { + "epoch": 0.2, + "grad_norm": 14.849808692932129, + "learning_rate": 1.8692993885937658e-05, + "loss": 3.0277, + "step": 653 + }, + { + "epoch": 0.2, + "grad_norm": 10.114745140075684, + "learning_rate": 1.8690989275333268e-05, + "loss": 2.4441, + "step": 654 + }, + { + "epoch": 0.2, + "grad_norm": 21.375455856323242, + "learning_rate": 1.8688984664728878e-05, + "loss": 2.9501, + "step": 655 + }, + { + "epoch": 0.2, + "grad_norm": 21.958810806274414, + "learning_rate": 1.8686980054124488e-05, + "loss": 2.8862, + "step": 656 + }, + { + "epoch": 0.2, + "grad_norm": 14.946088790893555, + "learning_rate": 1.86849754435201e-05, + "loss": 3.4693, + "step": 657 + }, + { + "epoch": 0.2, + "grad_norm": 13.369003295898438, + "learning_rate": 1.868297083291571e-05, + "loss": 4.4842, + "step": 658 + }, + { + "epoch": 0.2, + "grad_norm": 13.760272979736328, + "learning_rate": 1.868096622231132e-05, + "loss": 4.0259, + "step": 659 + }, + { + "epoch": 0.2, + "grad_norm": 18.290048599243164, + "learning_rate": 1.867896161170693e-05, + "loss": 2.4139, + "step": 660 + }, + { + "epoch": 0.2, + "grad_norm": 20.004146575927734, + "learning_rate": 1.867695700110254e-05, + "loss": 3.8439, + "step": 661 + }, + { + "epoch": 0.2, + "grad_norm": 15.349987983703613, + "learning_rate": 1.8674952390498145e-05, + "loss": 2.9132, + "step": 662 + }, + { + "epoch": 0.2, + "grad_norm": 14.10637092590332, + "learning_rate": 1.867294777989376e-05, + "loss": 2.9145, + "step": 663 + }, + { + "epoch": 0.2, + "grad_norm": 15.24035358428955, + "learning_rate": 1.8670943169289366e-05, + "loss": 2.748, + "step": 664 + }, + { + "epoch": 0.2, + "grad_norm": 23.505083084106445, + "learning_rate": 1.8668938558684976e-05, + "loss": 2.8452, + "step": 665 + }, + { + "epoch": 0.2, + "grad_norm": 82.25447845458984, + "learning_rate": 1.866693394808059e-05, + "loss": 2.8634, + "step": 666 + }, + { + "epoch": 0.2, + "grad_norm": 24.59209632873535, + "learning_rate": 1.8664929337476196e-05, + "loss": 2.9632, + "step": 667 + }, + { + "epoch": 0.2, + "grad_norm": 18.12932014465332, + "learning_rate": 1.8662924726871806e-05, + "loss": 3.9964, + "step": 668 + }, + { + "epoch": 0.2, + "grad_norm": 13.9307222366333, + "learning_rate": 1.8660920116267416e-05, + "loss": 3.37, + "step": 669 + }, + { + "epoch": 0.2, + "grad_norm": 15.804916381835938, + "learning_rate": 1.8658915505663026e-05, + "loss": 3.7911, + "step": 670 + }, + { + "epoch": 0.2, + "grad_norm": 12.954413414001465, + "learning_rate": 1.8656910895058636e-05, + "loss": 3.4076, + "step": 671 + }, + { + "epoch": 0.2, + "grad_norm": 19.96038055419922, + "learning_rate": 1.8654906284454246e-05, + "loss": 2.6706, + "step": 672 + }, + { + "epoch": 0.2, + "grad_norm": 14.459244728088379, + "learning_rate": 1.8652901673849856e-05, + "loss": 2.6447, + "step": 673 + }, + { + "epoch": 0.2, + "grad_norm": 14.168243408203125, + "learning_rate": 1.8650897063245466e-05, + "loss": 2.8916, + "step": 674 + }, + { + "epoch": 0.2, + "grad_norm": 12.294631004333496, + "learning_rate": 1.8648892452641076e-05, + "loss": 3.911, + "step": 675 + }, + { + "epoch": 0.2, + "grad_norm": 16.20514488220215, + "learning_rate": 1.8646887842036687e-05, + "loss": 4.1275, + "step": 676 + }, + { + "epoch": 0.2, + "grad_norm": 13.80794906616211, + "learning_rate": 1.8644883231432297e-05, + "loss": 2.5423, + "step": 677 + }, + { + "epoch": 0.2, + "grad_norm": 13.199609756469727, + "learning_rate": 1.8642878620827907e-05, + "loss": 3.119, + "step": 678 + }, + { + "epoch": 0.2, + "grad_norm": 17.830242156982422, + "learning_rate": 1.8640874010223517e-05, + "loss": 3.9047, + "step": 679 + }, + { + "epoch": 0.2, + "grad_norm": 17.04703140258789, + "learning_rate": 1.8638869399619127e-05, + "loss": 2.4903, + "step": 680 + }, + { + "epoch": 0.2, + "grad_norm": 16.005630493164062, + "learning_rate": 1.8636864789014734e-05, + "loss": 3.5141, + "step": 681 + }, + { + "epoch": 0.21, + "grad_norm": 12.92525577545166, + "learning_rate": 1.8634860178410347e-05, + "loss": 3.5435, + "step": 682 + }, + { + "epoch": 0.21, + "grad_norm": 13.173433303833008, + "learning_rate": 1.8632855567805954e-05, + "loss": 3.4652, + "step": 683 + }, + { + "epoch": 0.21, + "grad_norm": 15.04622745513916, + "learning_rate": 1.8630850957201564e-05, + "loss": 3.2746, + "step": 684 + }, + { + "epoch": 0.21, + "grad_norm": 16.044187545776367, + "learning_rate": 1.8628846346597177e-05, + "loss": 3.5208, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 12.709644317626953, + "learning_rate": 1.8626841735992784e-05, + "loss": 2.9898, + "step": 686 + }, + { + "epoch": 0.21, + "grad_norm": 62.63556671142578, + "learning_rate": 1.8624837125388394e-05, + "loss": 4.3599, + "step": 687 + }, + { + "epoch": 0.21, + "grad_norm": 14.936622619628906, + "learning_rate": 1.8622832514784004e-05, + "loss": 3.7374, + "step": 688 + }, + { + "epoch": 0.21, + "grad_norm": 18.37685203552246, + "learning_rate": 1.8620827904179614e-05, + "loss": 2.8972, + "step": 689 + }, + { + "epoch": 0.21, + "grad_norm": 18.769283294677734, + "learning_rate": 1.8618823293575224e-05, + "loss": 2.7669, + "step": 690 + }, + { + "epoch": 0.21, + "grad_norm": 14.026557922363281, + "learning_rate": 1.8616818682970834e-05, + "loss": 3.4005, + "step": 691 + }, + { + "epoch": 0.21, + "grad_norm": 15.341630935668945, + "learning_rate": 1.8614814072366445e-05, + "loss": 3.8892, + "step": 692 + }, + { + "epoch": 0.21, + "grad_norm": 20.44842529296875, + "learning_rate": 1.8612809461762055e-05, + "loss": 4.4409, + "step": 693 + }, + { + "epoch": 0.21, + "grad_norm": 23.54950523376465, + "learning_rate": 1.8610804851157665e-05, + "loss": 3.9646, + "step": 694 + }, + { + "epoch": 0.21, + "grad_norm": 13.78188419342041, + "learning_rate": 1.860880024055327e-05, + "loss": 4.5521, + "step": 695 + }, + { + "epoch": 0.21, + "grad_norm": 21.157424926757812, + "learning_rate": 1.8606795629948885e-05, + "loss": 3.6927, + "step": 696 + }, + { + "epoch": 0.21, + "grad_norm": 12.406957626342773, + "learning_rate": 1.8604791019344495e-05, + "loss": 3.2458, + "step": 697 + }, + { + "epoch": 0.21, + "grad_norm": 18.12921142578125, + "learning_rate": 1.86027864087401e-05, + "loss": 3.7537, + "step": 698 + }, + { + "epoch": 0.21, + "grad_norm": 21.271060943603516, + "learning_rate": 1.8600781798135715e-05, + "loss": 4.3055, + "step": 699 + }, + { + "epoch": 0.21, + "grad_norm": 13.396303176879883, + "learning_rate": 1.8598777187531322e-05, + "loss": 3.5275, + "step": 700 + }, + { + "epoch": 0.21, + "grad_norm": 11.93570613861084, + "learning_rate": 1.8596772576926935e-05, + "loss": 3.0972, + "step": 701 + }, + { + "epoch": 0.21, + "grad_norm": 14.72830581665039, + "learning_rate": 1.8594767966322545e-05, + "loss": 4.0018, + "step": 702 + }, + { + "epoch": 0.21, + "grad_norm": 23.694110870361328, + "learning_rate": 1.8592763355718152e-05, + "loss": 3.2218, + "step": 703 + }, + { + "epoch": 0.21, + "grad_norm": 17.545129776000977, + "learning_rate": 1.8590758745113766e-05, + "loss": 3.2768, + "step": 704 + }, + { + "epoch": 0.21, + "grad_norm": 16.36639976501465, + "learning_rate": 1.8588754134509372e-05, + "loss": 3.1514, + "step": 705 + }, + { + "epoch": 0.21, + "grad_norm": 11.386521339416504, + "learning_rate": 1.8586749523904982e-05, + "loss": 2.3357, + "step": 706 + }, + { + "epoch": 0.21, + "grad_norm": 10.972953796386719, + "learning_rate": 1.8584744913300592e-05, + "loss": 2.8289, + "step": 707 + }, + { + "epoch": 0.21, + "grad_norm": 16.927831649780273, + "learning_rate": 1.8582740302696202e-05, + "loss": 3.086, + "step": 708 + }, + { + "epoch": 0.21, + "grad_norm": 14.967674255371094, + "learning_rate": 1.8580735692091813e-05, + "loss": 3.6209, + "step": 709 + }, + { + "epoch": 0.21, + "grad_norm": 16.293773651123047, + "learning_rate": 1.8578731081487423e-05, + "loss": 4.2359, + "step": 710 + }, + { + "epoch": 0.21, + "grad_norm": 14.0774564743042, + "learning_rate": 1.8576726470883033e-05, + "loss": 3.7756, + "step": 711 + }, + { + "epoch": 0.21, + "grad_norm": 26.526830673217773, + "learning_rate": 1.8574721860278643e-05, + "loss": 3.1339, + "step": 712 + }, + { + "epoch": 0.21, + "grad_norm": 13.169925689697266, + "learning_rate": 1.8572717249674253e-05, + "loss": 2.6322, + "step": 713 + }, + { + "epoch": 0.21, + "grad_norm": 13.879220962524414, + "learning_rate": 1.857071263906986e-05, + "loss": 2.5253, + "step": 714 + }, + { + "epoch": 0.21, + "grad_norm": 15.33066177368164, + "learning_rate": 1.8568708028465473e-05, + "loss": 3.3604, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 16.04281997680664, + "learning_rate": 1.8566703417861083e-05, + "loss": 2.824, + "step": 716 + }, + { + "epoch": 0.22, + "grad_norm": 13.022286415100098, + "learning_rate": 1.856469880725669e-05, + "loss": 2.6692, + "step": 717 + }, + { + "epoch": 0.22, + "grad_norm": 14.34041976928711, + "learning_rate": 1.8562694196652303e-05, + "loss": 3.0903, + "step": 718 + }, + { + "epoch": 0.22, + "grad_norm": 17.68020248413086, + "learning_rate": 1.856068958604791e-05, + "loss": 3.4143, + "step": 719 + }, + { + "epoch": 0.22, + "grad_norm": 13.933920860290527, + "learning_rate": 1.855868497544352e-05, + "loss": 3.2975, + "step": 720 + }, + { + "epoch": 0.22, + "eval_loss": 0.7311321496963501, + "eval_runtime": 43.9321, + "eval_samples_per_second": 33.666, + "eval_steps_per_second": 33.666, + "step": 720 + }, + { + "epoch": 0.22, + "grad_norm": 14.72537612915039, + "learning_rate": 1.8556680364839134e-05, + "loss": 3.9612, + "step": 721 + }, + { + "epoch": 0.22, + "grad_norm": 26.851642608642578, + "learning_rate": 1.855467575423474e-05, + "loss": 3.4687, + "step": 722 + }, + { + "epoch": 0.22, + "grad_norm": 16.372344970703125, + "learning_rate": 1.855267114363035e-05, + "loss": 3.7605, + "step": 723 + }, + { + "epoch": 0.22, + "grad_norm": 17.88177490234375, + "learning_rate": 1.855066653302596e-05, + "loss": 3.4436, + "step": 724 + }, + { + "epoch": 0.22, + "grad_norm": 16.162473678588867, + "learning_rate": 1.854866192242157e-05, + "loss": 3.4066, + "step": 725 + }, + { + "epoch": 0.22, + "grad_norm": 12.335894584655762, + "learning_rate": 1.854665731181718e-05, + "loss": 2.9006, + "step": 726 + }, + { + "epoch": 0.22, + "grad_norm": 19.66415786743164, + "learning_rate": 1.854465270121279e-05, + "loss": 3.5752, + "step": 727 + }, + { + "epoch": 0.22, + "grad_norm": 11.808516502380371, + "learning_rate": 1.85426480906084e-05, + "loss": 1.8741, + "step": 728 + }, + { + "epoch": 0.22, + "grad_norm": 15.641440391540527, + "learning_rate": 1.854064348000401e-05, + "loss": 3.3391, + "step": 729 + }, + { + "epoch": 0.22, + "grad_norm": 13.590608596801758, + "learning_rate": 1.853863886939962e-05, + "loss": 2.9265, + "step": 730 + }, + { + "epoch": 0.22, + "grad_norm": 18.237276077270508, + "learning_rate": 1.853663425879523e-05, + "loss": 3.6717, + "step": 731 + }, + { + "epoch": 0.22, + "grad_norm": 41.96309280395508, + "learning_rate": 1.853462964819084e-05, + "loss": 2.857, + "step": 732 + }, + { + "epoch": 0.22, + "grad_norm": 15.107841491699219, + "learning_rate": 1.853262503758645e-05, + "loss": 3.9187, + "step": 733 + }, + { + "epoch": 0.22, + "grad_norm": 15.574593544006348, + "learning_rate": 1.853062042698206e-05, + "loss": 3.1606, + "step": 734 + }, + { + "epoch": 0.22, + "grad_norm": 21.812761306762695, + "learning_rate": 1.852861581637767e-05, + "loss": 2.8953, + "step": 735 + }, + { + "epoch": 0.22, + "grad_norm": 21.730745315551758, + "learning_rate": 1.8526611205773278e-05, + "loss": 3.3289, + "step": 736 + }, + { + "epoch": 0.22, + "grad_norm": 33.00816345214844, + "learning_rate": 1.852460659516889e-05, + "loss": 2.98, + "step": 737 + }, + { + "epoch": 0.22, + "grad_norm": 16.756553649902344, + "learning_rate": 1.8522601984564498e-05, + "loss": 3.0438, + "step": 738 + }, + { + "epoch": 0.22, + "grad_norm": 24.259780883789062, + "learning_rate": 1.852059737396011e-05, + "loss": 3.9168, + "step": 739 + }, + { + "epoch": 0.22, + "grad_norm": 10.961197853088379, + "learning_rate": 1.8518592763355722e-05, + "loss": 2.521, + "step": 740 + }, + { + "epoch": 0.22, + "grad_norm": 11.985860824584961, + "learning_rate": 1.851658815275133e-05, + "loss": 3.6082, + "step": 741 + }, + { + "epoch": 0.22, + "grad_norm": 13.038666725158691, + "learning_rate": 1.851458354214694e-05, + "loss": 2.8915, + "step": 742 + }, + { + "epoch": 0.22, + "grad_norm": 15.792445182800293, + "learning_rate": 1.851257893154255e-05, + "loss": 3.0136, + "step": 743 + }, + { + "epoch": 0.22, + "grad_norm": 11.698639869689941, + "learning_rate": 1.851057432093816e-05, + "loss": 2.9538, + "step": 744 + }, + { + "epoch": 0.22, + "grad_norm": 14.640687942504883, + "learning_rate": 1.850856971033377e-05, + "loss": 2.1114, + "step": 745 + }, + { + "epoch": 0.22, + "grad_norm": 21.3159122467041, + "learning_rate": 1.850656509972938e-05, + "loss": 3.1929, + "step": 746 + }, + { + "epoch": 0.22, + "grad_norm": 15.516459465026855, + "learning_rate": 1.850456048912499e-05, + "loss": 3.4558, + "step": 747 + }, + { + "epoch": 0.22, + "grad_norm": 12.31661605834961, + "learning_rate": 1.85025558785206e-05, + "loss": 3.5311, + "step": 748 + }, + { + "epoch": 0.23, + "grad_norm": 17.94566535949707, + "learning_rate": 1.850055126791621e-05, + "loss": 3.6713, + "step": 749 + }, + { + "epoch": 0.23, + "grad_norm": 13.146416664123535, + "learning_rate": 1.849854665731182e-05, + "loss": 2.0929, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 15.036728858947754, + "learning_rate": 1.849654204670743e-05, + "loss": 3.2656, + "step": 751 + }, + { + "epoch": 0.23, + "grad_norm": 12.377967834472656, + "learning_rate": 1.849453743610304e-05, + "loss": 2.811, + "step": 752 + }, + { + "epoch": 0.23, + "grad_norm": 48.185482025146484, + "learning_rate": 1.849253282549865e-05, + "loss": 4.3531, + "step": 753 + }, + { + "epoch": 0.23, + "grad_norm": 16.886388778686523, + "learning_rate": 1.849052821489426e-05, + "loss": 3.3402, + "step": 754 + }, + { + "epoch": 0.23, + "grad_norm": 10.66983699798584, + "learning_rate": 1.8488523604289866e-05, + "loss": 2.5462, + "step": 755 + }, + { + "epoch": 0.23, + "grad_norm": 13.7207670211792, + "learning_rate": 1.848651899368548e-05, + "loss": 4.2536, + "step": 756 + }, + { + "epoch": 0.23, + "grad_norm": 11.786395072937012, + "learning_rate": 1.8484514383081086e-05, + "loss": 3.3007, + "step": 757 + }, + { + "epoch": 0.23, + "grad_norm": 19.73274803161621, + "learning_rate": 1.8482509772476697e-05, + "loss": 3.2301, + "step": 758 + }, + { + "epoch": 0.23, + "grad_norm": 15.213275909423828, + "learning_rate": 1.848050516187231e-05, + "loss": 3.0059, + "step": 759 + }, + { + "epoch": 0.23, + "grad_norm": 19.37066078186035, + "learning_rate": 1.8478500551267917e-05, + "loss": 2.8112, + "step": 760 + }, + { + "epoch": 0.23, + "grad_norm": 48.94891357421875, + "learning_rate": 1.8476495940663527e-05, + "loss": 4.0552, + "step": 761 + }, + { + "epoch": 0.23, + "grad_norm": 11.666791915893555, + "learning_rate": 1.8474491330059137e-05, + "loss": 3.0329, + "step": 762 + }, + { + "epoch": 0.23, + "grad_norm": 14.530978202819824, + "learning_rate": 1.8472486719454747e-05, + "loss": 2.9601, + "step": 763 + }, + { + "epoch": 0.23, + "grad_norm": 15.476301193237305, + "learning_rate": 1.8470482108850357e-05, + "loss": 3.8685, + "step": 764 + }, + { + "epoch": 0.23, + "grad_norm": 34.36149597167969, + "learning_rate": 1.8468477498245967e-05, + "loss": 3.6309, + "step": 765 + }, + { + "epoch": 0.23, + "grad_norm": 13.753215789794922, + "learning_rate": 1.8466472887641577e-05, + "loss": 2.5185, + "step": 766 + }, + { + "epoch": 0.23, + "grad_norm": 14.692520141601562, + "learning_rate": 1.8464468277037187e-05, + "loss": 3.4938, + "step": 767 + }, + { + "epoch": 0.23, + "grad_norm": 19.021665573120117, + "learning_rate": 1.8462463666432797e-05, + "loss": 2.4541, + "step": 768 + }, + { + "epoch": 0.23, + "grad_norm": 15.532613754272461, + "learning_rate": 1.8460459055828407e-05, + "loss": 3.5129, + "step": 769 + }, + { + "epoch": 0.23, + "grad_norm": 19.325273513793945, + "learning_rate": 1.8458454445224018e-05, + "loss": 3.8114, + "step": 770 + }, + { + "epoch": 0.23, + "grad_norm": 19.3370361328125, + "learning_rate": 1.8456449834619628e-05, + "loss": 3.7453, + "step": 771 + }, + { + "epoch": 0.23, + "grad_norm": 17.998004913330078, + "learning_rate": 1.8454445224015238e-05, + "loss": 4.1352, + "step": 772 + }, + { + "epoch": 0.23, + "grad_norm": 14.474387168884277, + "learning_rate": 1.8452440613410848e-05, + "loss": 3.0304, + "step": 773 + }, + { + "epoch": 0.23, + "grad_norm": 12.42952823638916, + "learning_rate": 1.8450436002806454e-05, + "loss": 2.8432, + "step": 774 + }, + { + "epoch": 0.23, + "grad_norm": 15.06114387512207, + "learning_rate": 1.8448431392202068e-05, + "loss": 2.9204, + "step": 775 + }, + { + "epoch": 0.23, + "grad_norm": 19.3477840423584, + "learning_rate": 1.8446426781597678e-05, + "loss": 2.6387, + "step": 776 + }, + { + "epoch": 0.23, + "grad_norm": 19.3367862701416, + "learning_rate": 1.8444422170993285e-05, + "loss": 3.7189, + "step": 777 + }, + { + "epoch": 0.23, + "grad_norm": 18.679487228393555, + "learning_rate": 1.8442417560388898e-05, + "loss": 2.7699, + "step": 778 + }, + { + "epoch": 0.23, + "grad_norm": 17.268980026245117, + "learning_rate": 1.8440412949784505e-05, + "loss": 3.1304, + "step": 779 + }, + { + "epoch": 0.23, + "grad_norm": 12.782285690307617, + "learning_rate": 1.8438408339180115e-05, + "loss": 2.1415, + "step": 780 + }, + { + "epoch": 0.23, + "grad_norm": 13.401495933532715, + "learning_rate": 1.8436403728575725e-05, + "loss": 2.8394, + "step": 781 + }, + { + "epoch": 0.24, + "grad_norm": 17.430240631103516, + "learning_rate": 1.8434399117971335e-05, + "loss": 3.3336, + "step": 782 + }, + { + "epoch": 0.24, + "grad_norm": 50.752410888671875, + "learning_rate": 1.8432394507366945e-05, + "loss": 3.1897, + "step": 783 + }, + { + "epoch": 0.24, + "grad_norm": 14.428933143615723, + "learning_rate": 1.8430389896762555e-05, + "loss": 2.8947, + "step": 784 + }, + { + "epoch": 0.24, + "grad_norm": 60.45037078857422, + "learning_rate": 1.8428385286158165e-05, + "loss": 2.7407, + "step": 785 + }, + { + "epoch": 0.24, + "grad_norm": 15.153972625732422, + "learning_rate": 1.8426380675553776e-05, + "loss": 3.9118, + "step": 786 + }, + { + "epoch": 0.24, + "grad_norm": 13.527739524841309, + "learning_rate": 1.8424376064949386e-05, + "loss": 2.8687, + "step": 787 + }, + { + "epoch": 0.24, + "grad_norm": 13.59441089630127, + "learning_rate": 1.8422371454344996e-05, + "loss": 3.4082, + "step": 788 + }, + { + "epoch": 0.24, + "grad_norm": 18.013368606567383, + "learning_rate": 1.8420366843740606e-05, + "loss": 3.1523, + "step": 789 + }, + { + "epoch": 0.24, + "grad_norm": 23.031423568725586, + "learning_rate": 1.8418362233136216e-05, + "loss": 3.8803, + "step": 790 + }, + { + "epoch": 0.24, + "grad_norm": 12.914963722229004, + "learning_rate": 1.8416357622531823e-05, + "loss": 3.0994, + "step": 791 + }, + { + "epoch": 0.24, + "grad_norm": 25.980188369750977, + "learning_rate": 1.8414353011927436e-05, + "loss": 1.9718, + "step": 792 + }, + { + "epoch": 0.24, + "grad_norm": 14.725564002990723, + "learning_rate": 1.8412348401323043e-05, + "loss": 2.472, + "step": 793 + }, + { + "epoch": 0.24, + "grad_norm": 13.297785758972168, + "learning_rate": 1.8410343790718653e-05, + "loss": 4.0097, + "step": 794 + }, + { + "epoch": 0.24, + "grad_norm": 15.004578590393066, + "learning_rate": 1.8408339180114266e-05, + "loss": 2.8653, + "step": 795 + }, + { + "epoch": 0.24, + "grad_norm": 19.27330780029297, + "learning_rate": 1.8406334569509873e-05, + "loss": 3.1684, + "step": 796 + }, + { + "epoch": 0.24, + "grad_norm": 25.846576690673828, + "learning_rate": 1.8404329958905483e-05, + "loss": 2.6346, + "step": 797 + }, + { + "epoch": 0.24, + "grad_norm": 9.999025344848633, + "learning_rate": 1.8402325348301093e-05, + "loss": 1.9912, + "step": 798 + }, + { + "epoch": 0.24, + "grad_norm": 11.801385879516602, + "learning_rate": 1.8400320737696703e-05, + "loss": 3.3957, + "step": 799 + }, + { + "epoch": 0.24, + "grad_norm": 12.033352851867676, + "learning_rate": 1.8398316127092313e-05, + "loss": 2.62, + "step": 800 + }, + { + "epoch": 0.24, + "grad_norm": 12.087538719177246, + "learning_rate": 1.8396311516487923e-05, + "loss": 3.2451, + "step": 801 + }, + { + "epoch": 0.24, + "grad_norm": 9.926271438598633, + "learning_rate": 1.8394306905883533e-05, + "loss": 2.5499, + "step": 802 + }, + { + "epoch": 0.24, + "grad_norm": 15.592676162719727, + "learning_rate": 1.8392302295279144e-05, + "loss": 3.7471, + "step": 803 + }, + { + "epoch": 0.24, + "grad_norm": 16.60494613647461, + "learning_rate": 1.8390297684674754e-05, + "loss": 3.218, + "step": 804 + }, + { + "epoch": 0.24, + "grad_norm": 20.184545516967773, + "learning_rate": 1.8388293074070364e-05, + "loss": 2.9737, + "step": 805 + }, + { + "epoch": 0.24, + "grad_norm": 12.5513334274292, + "learning_rate": 1.8386288463465974e-05, + "loss": 2.3381, + "step": 806 + }, + { + "epoch": 0.24, + "grad_norm": 19.2335262298584, + "learning_rate": 1.8384283852861584e-05, + "loss": 3.5287, + "step": 807 + }, + { + "epoch": 0.24, + "grad_norm": 16.872840881347656, + "learning_rate": 1.8382279242257194e-05, + "loss": 3.1892, + "step": 808 + }, + { + "epoch": 0.24, + "grad_norm": 22.987356185913086, + "learning_rate": 1.8380274631652804e-05, + "loss": 2.534, + "step": 809 + }, + { + "epoch": 0.24, + "grad_norm": 18.01645851135254, + "learning_rate": 1.837827002104841e-05, + "loss": 3.5278, + "step": 810 + }, + { + "epoch": 0.24, + "grad_norm": 23.722457885742188, + "learning_rate": 1.8376265410444024e-05, + "loss": 3.9305, + "step": 811 + }, + { + "epoch": 0.24, + "grad_norm": 11.13327693939209, + "learning_rate": 1.837426079983963e-05, + "loss": 3.9406, + "step": 812 + }, + { + "epoch": 0.24, + "grad_norm": 18.1392765045166, + "learning_rate": 1.837225618923524e-05, + "loss": 2.3532, + "step": 813 + }, + { + "epoch": 0.24, + "grad_norm": 17.197660446166992, + "learning_rate": 1.8370251578630854e-05, + "loss": 3.8571, + "step": 814 + }, + { + "epoch": 0.25, + "grad_norm": 16.233530044555664, + "learning_rate": 1.836824696802646e-05, + "loss": 2.8763, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 104.82502746582031, + "learning_rate": 1.836624235742207e-05, + "loss": 3.3302, + "step": 816 + }, + { + "epoch": 0.25, + "grad_norm": 14.149030685424805, + "learning_rate": 1.836423774681768e-05, + "loss": 3.2956, + "step": 817 + }, + { + "epoch": 0.25, + "grad_norm": 13.704061508178711, + "learning_rate": 1.836223313621329e-05, + "loss": 2.7446, + "step": 818 + }, + { + "epoch": 0.25, + "grad_norm": 14.284625053405762, + "learning_rate": 1.83602285256089e-05, + "loss": 2.9887, + "step": 819 + }, + { + "epoch": 0.25, + "grad_norm": 20.733549118041992, + "learning_rate": 1.835822391500451e-05, + "loss": 3.3273, + "step": 820 + }, + { + "epoch": 0.25, + "grad_norm": 27.305700302124023, + "learning_rate": 1.835621930440012e-05, + "loss": 3.061, + "step": 821 + }, + { + "epoch": 0.25, + "grad_norm": 11.420647621154785, + "learning_rate": 1.8354214693795732e-05, + "loss": 3.1825, + "step": 822 + }, + { + "epoch": 0.25, + "grad_norm": 13.486950874328613, + "learning_rate": 1.8352210083191342e-05, + "loss": 3.1205, + "step": 823 + }, + { + "epoch": 0.25, + "grad_norm": 21.656282424926758, + "learning_rate": 1.8350205472586952e-05, + "loss": 2.7608, + "step": 824 + }, + { + "epoch": 0.25, + "grad_norm": 31.164608001708984, + "learning_rate": 1.8348200861982562e-05, + "loss": 3.3097, + "step": 825 + }, + { + "epoch": 0.25, + "grad_norm": 36.02324676513672, + "learning_rate": 1.8346196251378172e-05, + "loss": 3.2008, + "step": 826 + }, + { + "epoch": 0.25, + "grad_norm": 15.944084167480469, + "learning_rate": 1.8344191640773782e-05, + "loss": 2.9221, + "step": 827 + }, + { + "epoch": 0.25, + "grad_norm": 14.965399742126465, + "learning_rate": 1.8342187030169392e-05, + "loss": 3.1973, + "step": 828 + }, + { + "epoch": 0.25, + "grad_norm": 20.134925842285156, + "learning_rate": 1.8340182419565e-05, + "loss": 3.521, + "step": 829 + }, + { + "epoch": 0.25, + "grad_norm": 17.916906356811523, + "learning_rate": 1.8338177808960612e-05, + "loss": 3.8329, + "step": 830 + }, + { + "epoch": 0.25, + "grad_norm": 22.273197174072266, + "learning_rate": 1.8336173198356223e-05, + "loss": 3.0719, + "step": 831 + }, + { + "epoch": 0.25, + "grad_norm": 18.33119773864746, + "learning_rate": 1.833416858775183e-05, + "loss": 3.1312, + "step": 832 + }, + { + "epoch": 0.25, + "grad_norm": 17.575429916381836, + "learning_rate": 1.8332163977147443e-05, + "loss": 2.7529, + "step": 833 + }, + { + "epoch": 0.25, + "grad_norm": 16.5457820892334, + "learning_rate": 1.833015936654305e-05, + "loss": 2.3905, + "step": 834 + }, + { + "epoch": 0.25, + "grad_norm": 19.157955169677734, + "learning_rate": 1.832815475593866e-05, + "loss": 3.6365, + "step": 835 + }, + { + "epoch": 0.25, + "grad_norm": 16.239450454711914, + "learning_rate": 1.832615014533427e-05, + "loss": 2.864, + "step": 836 + }, + { + "epoch": 0.25, + "grad_norm": 24.656652450561523, + "learning_rate": 1.832414553472988e-05, + "loss": 3.4563, + "step": 837 + }, + { + "epoch": 0.25, + "grad_norm": 21.09084129333496, + "learning_rate": 1.832214092412549e-05, + "loss": 3.3509, + "step": 838 + }, + { + "epoch": 0.25, + "grad_norm": 14.556492805480957, + "learning_rate": 1.83201363135211e-05, + "loss": 3.0753, + "step": 839 + }, + { + "epoch": 0.25, + "grad_norm": 19.413835525512695, + "learning_rate": 1.831813170291671e-05, + "loss": 2.9689, + "step": 840 + }, + { + "epoch": 0.25, + "eval_loss": 0.5996547937393188, + "eval_runtime": 43.6208, + "eval_samples_per_second": 33.906, + "eval_steps_per_second": 33.906, + "step": 840 + }, + { + "epoch": 0.25, + "grad_norm": 17.024858474731445, + "learning_rate": 1.831612709231232e-05, + "loss": 4.6989, + "step": 841 + }, + { + "epoch": 0.25, + "grad_norm": 12.914769172668457, + "learning_rate": 1.831412248170793e-05, + "loss": 3.6312, + "step": 842 + }, + { + "epoch": 0.25, + "grad_norm": 89.747802734375, + "learning_rate": 1.831211787110354e-05, + "loss": 2.7291, + "step": 843 + }, + { + "epoch": 0.25, + "grad_norm": 15.173033714294434, + "learning_rate": 1.831011326049915e-05, + "loss": 3.1805, + "step": 844 + }, + { + "epoch": 0.25, + "grad_norm": 40.06729507446289, + "learning_rate": 1.830810864989476e-05, + "loss": 4.5534, + "step": 845 + }, + { + "epoch": 0.25, + "grad_norm": 13.402242660522461, + "learning_rate": 1.830610403929037e-05, + "loss": 2.946, + "step": 846 + }, + { + "epoch": 0.25, + "grad_norm": 18.814979553222656, + "learning_rate": 1.830409942868598e-05, + "loss": 3.3728, + "step": 847 + }, + { + "epoch": 0.25, + "grad_norm": 12.975325584411621, + "learning_rate": 1.8302094818081587e-05, + "loss": 2.8497, + "step": 848 + }, + { + "epoch": 0.26, + "grad_norm": 9.28498363494873, + "learning_rate": 1.83000902074772e-05, + "loss": 2.4016, + "step": 849 + }, + { + "epoch": 0.26, + "grad_norm": 15.675614356994629, + "learning_rate": 1.829808559687281e-05, + "loss": 2.7848, + "step": 850 + }, + { + "epoch": 0.26, + "grad_norm": 13.038680076599121, + "learning_rate": 1.8296080986268417e-05, + "loss": 3.4155, + "step": 851 + }, + { + "epoch": 0.26, + "grad_norm": 11.621955871582031, + "learning_rate": 1.829407637566403e-05, + "loss": 2.8026, + "step": 852 + }, + { + "epoch": 0.26, + "grad_norm": 11.502103805541992, + "learning_rate": 1.8292071765059638e-05, + "loss": 2.2732, + "step": 853 + }, + { + "epoch": 0.26, + "grad_norm": 27.41511344909668, + "learning_rate": 1.8290067154455248e-05, + "loss": 4.0694, + "step": 854 + }, + { + "epoch": 0.26, + "grad_norm": 16.437061309814453, + "learning_rate": 1.8288062543850858e-05, + "loss": 3.0165, + "step": 855 + }, + { + "epoch": 0.26, + "grad_norm": 14.086379051208496, + "learning_rate": 1.8286057933246468e-05, + "loss": 2.6271, + "step": 856 + }, + { + "epoch": 0.26, + "grad_norm": 18.89293670654297, + "learning_rate": 1.8284053322642078e-05, + "loss": 3.2895, + "step": 857 + }, + { + "epoch": 0.26, + "grad_norm": 14.886481285095215, + "learning_rate": 1.8282048712037688e-05, + "loss": 3.3004, + "step": 858 + }, + { + "epoch": 0.26, + "grad_norm": 21.090091705322266, + "learning_rate": 1.8280044101433298e-05, + "loss": 3.0523, + "step": 859 + }, + { + "epoch": 0.26, + "grad_norm": 13.093547821044922, + "learning_rate": 1.8278039490828908e-05, + "loss": 2.3511, + "step": 860 + }, + { + "epoch": 0.26, + "grad_norm": 13.202871322631836, + "learning_rate": 1.8276034880224518e-05, + "loss": 2.7941, + "step": 861 + }, + { + "epoch": 0.26, + "grad_norm": 10.028319358825684, + "learning_rate": 1.827403026962013e-05, + "loss": 2.6855, + "step": 862 + }, + { + "epoch": 0.26, + "grad_norm": 14.725035667419434, + "learning_rate": 1.827202565901574e-05, + "loss": 3.3114, + "step": 863 + }, + { + "epoch": 0.26, + "grad_norm": 20.055673599243164, + "learning_rate": 1.827002104841135e-05, + "loss": 4.0679, + "step": 864 + }, + { + "epoch": 0.26, + "grad_norm": 15.065644264221191, + "learning_rate": 1.8268016437806955e-05, + "loss": 2.8739, + "step": 865 + }, + { + "epoch": 0.26, + "grad_norm": 11.230770111083984, + "learning_rate": 1.826601182720257e-05, + "loss": 3.2987, + "step": 866 + }, + { + "epoch": 0.26, + "grad_norm": 27.67386245727539, + "learning_rate": 1.8264007216598175e-05, + "loss": 3.0173, + "step": 867 + }, + { + "epoch": 0.26, + "grad_norm": 18.033613204956055, + "learning_rate": 1.8262002605993785e-05, + "loss": 3.4757, + "step": 868 + }, + { + "epoch": 0.26, + "grad_norm": 12.066390037536621, + "learning_rate": 1.82599979953894e-05, + "loss": 2.815, + "step": 869 + }, + { + "epoch": 0.26, + "grad_norm": 18.521556854248047, + "learning_rate": 1.8257993384785006e-05, + "loss": 4.0135, + "step": 870 + }, + { + "epoch": 0.26, + "grad_norm": 19.63759422302246, + "learning_rate": 1.825598877418062e-05, + "loss": 4.0507, + "step": 871 + }, + { + "epoch": 0.26, + "grad_norm": 34.88851547241211, + "learning_rate": 1.8253984163576226e-05, + "loss": 4.7139, + "step": 872 + }, + { + "epoch": 0.26, + "grad_norm": 15.407425880432129, + "learning_rate": 1.8251979552971836e-05, + "loss": 3.7188, + "step": 873 + }, + { + "epoch": 0.26, + "grad_norm": 12.915898323059082, + "learning_rate": 1.824997494236745e-05, + "loss": 2.5801, + "step": 874 + }, + { + "epoch": 0.26, + "grad_norm": 12.690547943115234, + "learning_rate": 1.8247970331763056e-05, + "loss": 2.3076, + "step": 875 + }, + { + "epoch": 0.26, + "grad_norm": 31.50592041015625, + "learning_rate": 1.8245965721158666e-05, + "loss": 2.8265, + "step": 876 + }, + { + "epoch": 0.26, + "grad_norm": 13.253562927246094, + "learning_rate": 1.8243961110554276e-05, + "loss": 2.612, + "step": 877 + }, + { + "epoch": 0.26, + "grad_norm": 21.80856704711914, + "learning_rate": 1.8241956499949886e-05, + "loss": 3.0365, + "step": 878 + }, + { + "epoch": 0.26, + "grad_norm": 17.941879272460938, + "learning_rate": 1.8239951889345496e-05, + "loss": 2.9886, + "step": 879 + }, + { + "epoch": 0.26, + "grad_norm": 14.684907913208008, + "learning_rate": 1.8237947278741106e-05, + "loss": 2.6941, + "step": 880 + }, + { + "epoch": 0.26, + "grad_norm": 19.08102798461914, + "learning_rate": 1.8235942668136717e-05, + "loss": 3.4722, + "step": 881 + }, + { + "epoch": 0.27, + "grad_norm": 14.615809440612793, + "learning_rate": 1.8233938057532327e-05, + "loss": 2.9938, + "step": 882 + }, + { + "epoch": 0.27, + "grad_norm": 22.291290283203125, + "learning_rate": 1.8231933446927937e-05, + "loss": 4.3829, + "step": 883 + }, + { + "epoch": 0.27, + "grad_norm": 16.102741241455078, + "learning_rate": 1.8229928836323543e-05, + "loss": 3.5552, + "step": 884 + }, + { + "epoch": 0.27, + "grad_norm": 20.495725631713867, + "learning_rate": 1.8227924225719157e-05, + "loss": 3.3252, + "step": 885 + }, + { + "epoch": 0.27, + "grad_norm": 25.180877685546875, + "learning_rate": 1.8225919615114764e-05, + "loss": 2.6339, + "step": 886 + }, + { + "epoch": 0.27, + "grad_norm": 14.054975509643555, + "learning_rate": 1.8223915004510374e-05, + "loss": 3.2697, + "step": 887 + }, + { + "epoch": 0.27, + "grad_norm": 14.858386993408203, + "learning_rate": 1.8221910393905987e-05, + "loss": 2.3764, + "step": 888 + }, + { + "epoch": 0.27, + "grad_norm": 14.020156860351562, + "learning_rate": 1.8219905783301594e-05, + "loss": 2.4207, + "step": 889 + }, + { + "epoch": 0.27, + "grad_norm": 20.844745635986328, + "learning_rate": 1.8217901172697204e-05, + "loss": 2.8965, + "step": 890 + }, + { + "epoch": 0.27, + "grad_norm": 12.82536792755127, + "learning_rate": 1.8215896562092814e-05, + "loss": 2.1605, + "step": 891 + }, + { + "epoch": 0.27, + "grad_norm": 11.74587345123291, + "learning_rate": 1.8213891951488424e-05, + "loss": 3.171, + "step": 892 + }, + { + "epoch": 0.27, + "grad_norm": 21.482084274291992, + "learning_rate": 1.8211887340884034e-05, + "loss": 3.142, + "step": 893 + }, + { + "epoch": 0.27, + "grad_norm": 28.953731536865234, + "learning_rate": 1.8209882730279644e-05, + "loss": 3.2382, + "step": 894 + }, + { + "epoch": 0.27, + "grad_norm": 17.727638244628906, + "learning_rate": 1.8207878119675254e-05, + "loss": 3.2516, + "step": 895 + }, + { + "epoch": 0.27, + "grad_norm": 24.647932052612305, + "learning_rate": 1.8205873509070864e-05, + "loss": 3.1515, + "step": 896 + }, + { + "epoch": 0.27, + "grad_norm": 15.397368431091309, + "learning_rate": 1.8203868898466475e-05, + "loss": 3.8315, + "step": 897 + }, + { + "epoch": 0.27, + "grad_norm": 10.54749584197998, + "learning_rate": 1.8201864287862085e-05, + "loss": 1.3177, + "step": 898 + }, + { + "epoch": 0.27, + "grad_norm": 22.014423370361328, + "learning_rate": 1.8199859677257695e-05, + "loss": 3.8405, + "step": 899 + }, + { + "epoch": 0.27, + "grad_norm": 18.356632232666016, + "learning_rate": 1.8197855066653305e-05, + "loss": 3.0761, + "step": 900 + }, + { + "epoch": 0.27, + "grad_norm": 19.23956298828125, + "learning_rate": 1.8195850456048915e-05, + "loss": 3.8153, + "step": 901 + }, + { + "epoch": 0.27, + "grad_norm": 21.569475173950195, + "learning_rate": 1.8193845845444525e-05, + "loss": 2.913, + "step": 902 + }, + { + "epoch": 0.27, + "grad_norm": 10.557537078857422, + "learning_rate": 1.819184123484013e-05, + "loss": 2.8867, + "step": 903 + }, + { + "epoch": 0.27, + "grad_norm": 14.843564987182617, + "learning_rate": 1.8189836624235745e-05, + "loss": 2.9857, + "step": 904 + }, + { + "epoch": 0.27, + "grad_norm": 10.520699501037598, + "learning_rate": 1.8187832013631355e-05, + "loss": 3.0075, + "step": 905 + }, + { + "epoch": 0.27, + "grad_norm": 18.192188262939453, + "learning_rate": 1.8185827403026962e-05, + "loss": 4.4008, + "step": 906 + }, + { + "epoch": 0.27, + "grad_norm": 13.926733016967773, + "learning_rate": 1.8183822792422575e-05, + "loss": 3.1211, + "step": 907 + }, + { + "epoch": 0.27, + "grad_norm": 17.12418556213379, + "learning_rate": 1.8181818181818182e-05, + "loss": 3.4901, + "step": 908 + }, + { + "epoch": 0.27, + "grad_norm": 16.681673049926758, + "learning_rate": 1.8179813571213792e-05, + "loss": 3.2524, + "step": 909 + }, + { + "epoch": 0.27, + "grad_norm": 18.500484466552734, + "learning_rate": 1.8177808960609402e-05, + "loss": 3.7737, + "step": 910 + }, + { + "epoch": 0.27, + "grad_norm": 13.746018409729004, + "learning_rate": 1.8175804350005012e-05, + "loss": 2.4568, + "step": 911 + }, + { + "epoch": 0.27, + "grad_norm": 12.869297981262207, + "learning_rate": 1.8173799739400622e-05, + "loss": 2.6976, + "step": 912 + }, + { + "epoch": 0.27, + "grad_norm": 17.630205154418945, + "learning_rate": 1.8171795128796232e-05, + "loss": 3.2424, + "step": 913 + }, + { + "epoch": 0.27, + "grad_norm": 11.685369491577148, + "learning_rate": 1.8169790518191843e-05, + "loss": 2.3216, + "step": 914 + }, + { + "epoch": 0.28, + "grad_norm": 23.479877471923828, + "learning_rate": 1.8167785907587453e-05, + "loss": 3.3934, + "step": 915 + }, + { + "epoch": 0.28, + "grad_norm": 11.527375221252441, + "learning_rate": 1.8165781296983063e-05, + "loss": 2.9907, + "step": 916 + }, + { + "epoch": 0.28, + "grad_norm": 11.39271354675293, + "learning_rate": 1.8163776686378673e-05, + "loss": 2.8909, + "step": 917 + }, + { + "epoch": 0.28, + "grad_norm": 18.55435562133789, + "learning_rate": 1.8161772075774283e-05, + "loss": 2.9944, + "step": 918 + }, + { + "epoch": 0.28, + "grad_norm": 13.561935424804688, + "learning_rate": 1.8159767465169893e-05, + "loss": 2.385, + "step": 919 + }, + { + "epoch": 0.28, + "grad_norm": 16.513639450073242, + "learning_rate": 1.8157762854565503e-05, + "loss": 3.0903, + "step": 920 + }, + { + "epoch": 0.28, + "grad_norm": 19.51789093017578, + "learning_rate": 1.8155758243961113e-05, + "loss": 4.0376, + "step": 921 + }, + { + "epoch": 0.28, + "grad_norm": 16.57154655456543, + "learning_rate": 1.815375363335672e-05, + "loss": 3.3258, + "step": 922 + }, + { + "epoch": 0.28, + "grad_norm": 12.257933616638184, + "learning_rate": 1.8151749022752333e-05, + "loss": 2.85, + "step": 923 + }, + { + "epoch": 0.28, + "grad_norm": 19.822603225708008, + "learning_rate": 1.8149744412147943e-05, + "loss": 2.7183, + "step": 924 + }, + { + "epoch": 0.28, + "grad_norm": 13.144048690795898, + "learning_rate": 1.814773980154355e-05, + "loss": 2.9005, + "step": 925 + }, + { + "epoch": 0.28, + "grad_norm": 13.302081108093262, + "learning_rate": 1.8145735190939164e-05, + "loss": 3.5627, + "step": 926 + }, + { + "epoch": 0.28, + "grad_norm": 18.423614501953125, + "learning_rate": 1.814373058033477e-05, + "loss": 3.7219, + "step": 927 + }, + { + "epoch": 0.28, + "grad_norm": 16.71343231201172, + "learning_rate": 1.814172596973038e-05, + "loss": 2.3099, + "step": 928 + }, + { + "epoch": 0.28, + "grad_norm": 12.817221641540527, + "learning_rate": 1.813972135912599e-05, + "loss": 3.1187, + "step": 929 + }, + { + "epoch": 0.28, + "grad_norm": 23.68539047241211, + "learning_rate": 1.81377167485216e-05, + "loss": 2.8866, + "step": 930 + }, + { + "epoch": 0.28, + "grad_norm": 19.9428768157959, + "learning_rate": 1.813571213791721e-05, + "loss": 3.3457, + "step": 931 + }, + { + "epoch": 0.28, + "grad_norm": 15.457541465759277, + "learning_rate": 1.813370752731282e-05, + "loss": 2.5672, + "step": 932 + }, + { + "epoch": 0.28, + "grad_norm": 14.215147972106934, + "learning_rate": 1.813170291670843e-05, + "loss": 3.2077, + "step": 933 + }, + { + "epoch": 0.28, + "grad_norm": 23.65352439880371, + "learning_rate": 1.812969830610404e-05, + "loss": 4.0376, + "step": 934 + }, + { + "epoch": 0.28, + "grad_norm": 15.811484336853027, + "learning_rate": 1.812769369549965e-05, + "loss": 3.4806, + "step": 935 + }, + { + "epoch": 0.28, + "grad_norm": 22.097625732421875, + "learning_rate": 1.812568908489526e-05, + "loss": 2.9302, + "step": 936 + }, + { + "epoch": 0.28, + "grad_norm": 21.592880249023438, + "learning_rate": 1.812368447429087e-05, + "loss": 3.5155, + "step": 937 + }, + { + "epoch": 0.28, + "grad_norm": 13.204874992370605, + "learning_rate": 1.812167986368648e-05, + "loss": 3.172, + "step": 938 + }, + { + "epoch": 0.28, + "grad_norm": 19.575117111206055, + "learning_rate": 1.811967525308209e-05, + "loss": 3.1032, + "step": 939 + }, + { + "epoch": 0.28, + "grad_norm": 19.439678192138672, + "learning_rate": 1.81176706424777e-05, + "loss": 3.1572, + "step": 940 + }, + { + "epoch": 0.28, + "grad_norm": 18.003063201904297, + "learning_rate": 1.8115666031873308e-05, + "loss": 3.6915, + "step": 941 + }, + { + "epoch": 0.28, + "grad_norm": 14.612069129943848, + "learning_rate": 1.811366142126892e-05, + "loss": 2.6144, + "step": 942 + }, + { + "epoch": 0.28, + "grad_norm": 26.311756134033203, + "learning_rate": 1.811165681066453e-05, + "loss": 4.1978, + "step": 943 + }, + { + "epoch": 0.28, + "grad_norm": 12.142837524414062, + "learning_rate": 1.810965220006014e-05, + "loss": 1.9791, + "step": 944 + }, + { + "epoch": 0.28, + "grad_norm": 16.259824752807617, + "learning_rate": 1.8107647589455752e-05, + "loss": 3.0366, + "step": 945 + }, + { + "epoch": 0.28, + "grad_norm": 15.532299041748047, + "learning_rate": 1.810564297885136e-05, + "loss": 2.9808, + "step": 946 + }, + { + "epoch": 0.28, + "grad_norm": 31.902145385742188, + "learning_rate": 1.810363836824697e-05, + "loss": 4.2394, + "step": 947 + }, + { + "epoch": 0.29, + "grad_norm": 20.337215423583984, + "learning_rate": 1.8101633757642582e-05, + "loss": 3.8077, + "step": 948 + }, + { + "epoch": 0.29, + "grad_norm": 15.51062297821045, + "learning_rate": 1.809962914703819e-05, + "loss": 1.9024, + "step": 949 + }, + { + "epoch": 0.29, + "grad_norm": 21.733741760253906, + "learning_rate": 1.80976245364338e-05, + "loss": 3.1563, + "step": 950 + }, + { + "epoch": 0.29, + "grad_norm": 27.472270965576172, + "learning_rate": 1.809561992582941e-05, + "loss": 3.7646, + "step": 951 + }, + { + "epoch": 0.29, + "grad_norm": 33.5841178894043, + "learning_rate": 1.809361531522502e-05, + "loss": 3.6314, + "step": 952 + }, + { + "epoch": 0.29, + "grad_norm": 15.488965034484863, + "learning_rate": 1.809161070462063e-05, + "loss": 3.8817, + "step": 953 + }, + { + "epoch": 0.29, + "grad_norm": 11.746240615844727, + "learning_rate": 1.808960609401624e-05, + "loss": 2.1518, + "step": 954 + }, + { + "epoch": 0.29, + "grad_norm": 9.70715618133545, + "learning_rate": 1.808760148341185e-05, + "loss": 2.5758, + "step": 955 + }, + { + "epoch": 0.29, + "grad_norm": 15.384037017822266, + "learning_rate": 1.808559687280746e-05, + "loss": 3.2143, + "step": 956 + }, + { + "epoch": 0.29, + "grad_norm": 18.4316349029541, + "learning_rate": 1.808359226220307e-05, + "loss": 4.8752, + "step": 957 + }, + { + "epoch": 0.29, + "grad_norm": 13.498906135559082, + "learning_rate": 1.8081587651598676e-05, + "loss": 3.2658, + "step": 958 + }, + { + "epoch": 0.29, + "grad_norm": 26.12436866760254, + "learning_rate": 1.807958304099429e-05, + "loss": 1.9117, + "step": 959 + }, + { + "epoch": 0.29, + "grad_norm": 14.987210273742676, + "learning_rate": 1.80775784303899e-05, + "loss": 2.1122, + "step": 960 + }, + { + "epoch": 0.29, + "eval_loss": 0.5726422667503357, + "eval_runtime": 43.6835, + "eval_samples_per_second": 33.857, + "eval_steps_per_second": 33.857, + "step": 960 + }, + { + "epoch": 0.29, + "grad_norm": 11.189974784851074, + "learning_rate": 1.8075573819785506e-05, + "loss": 2.7843, + "step": 961 + }, + { + "epoch": 0.29, + "grad_norm": 13.831470489501953, + "learning_rate": 1.807356920918112e-05, + "loss": 3.8859, + "step": 962 + }, + { + "epoch": 0.29, + "grad_norm": 14.756105422973633, + "learning_rate": 1.8071564598576727e-05, + "loss": 2.6933, + "step": 963 + }, + { + "epoch": 0.29, + "grad_norm": 13.506275177001953, + "learning_rate": 1.8069559987972337e-05, + "loss": 2.7927, + "step": 964 + }, + { + "epoch": 0.29, + "grad_norm": 27.999460220336914, + "learning_rate": 1.8067555377367947e-05, + "loss": 3.3839, + "step": 965 + }, + { + "epoch": 0.29, + "grad_norm": 13.807269096374512, + "learning_rate": 1.8065550766763557e-05, + "loss": 2.0444, + "step": 966 + }, + { + "epoch": 0.29, + "grad_norm": 15.027080535888672, + "learning_rate": 1.8063546156159167e-05, + "loss": 2.5851, + "step": 967 + }, + { + "epoch": 0.29, + "grad_norm": 46.652530670166016, + "learning_rate": 1.8061541545554777e-05, + "loss": 2.5573, + "step": 968 + }, + { + "epoch": 0.29, + "grad_norm": 17.231346130371094, + "learning_rate": 1.8059536934950387e-05, + "loss": 2.3659, + "step": 969 + }, + { + "epoch": 0.29, + "grad_norm": 17.74903678894043, + "learning_rate": 1.8057532324345997e-05, + "loss": 2.8746, + "step": 970 + }, + { + "epoch": 0.29, + "grad_norm": 9.483039855957031, + "learning_rate": 1.8055527713741607e-05, + "loss": 2.6829, + "step": 971 + }, + { + "epoch": 0.29, + "grad_norm": 10.787821769714355, + "learning_rate": 1.8053523103137217e-05, + "loss": 2.395, + "step": 972 + }, + { + "epoch": 0.29, + "grad_norm": 16.904396057128906, + "learning_rate": 1.8051518492532827e-05, + "loss": 2.8498, + "step": 973 + }, + { + "epoch": 0.29, + "grad_norm": 18.882564544677734, + "learning_rate": 1.8049513881928437e-05, + "loss": 3.3251, + "step": 974 + }, + { + "epoch": 0.29, + "grad_norm": 18.128250122070312, + "learning_rate": 1.8047509271324048e-05, + "loss": 3.1162, + "step": 975 + }, + { + "epoch": 0.29, + "grad_norm": 19.998552322387695, + "learning_rate": 1.8045504660719658e-05, + "loss": 3.183, + "step": 976 + }, + { + "epoch": 0.29, + "grad_norm": 19.723297119140625, + "learning_rate": 1.8043500050115264e-05, + "loss": 3.0132, + "step": 977 + }, + { + "epoch": 0.29, + "grad_norm": 18.414817810058594, + "learning_rate": 1.8041495439510878e-05, + "loss": 2.6902, + "step": 978 + }, + { + "epoch": 0.29, + "grad_norm": 15.55300521850586, + "learning_rate": 1.8039490828906488e-05, + "loss": 2.0389, + "step": 979 + }, + { + "epoch": 0.29, + "grad_norm": 19.29793357849121, + "learning_rate": 1.8037486218302095e-05, + "loss": 3.276, + "step": 980 + }, + { + "epoch": 0.29, + "grad_norm": 18.001596450805664, + "learning_rate": 1.8035481607697708e-05, + "loss": 2.2783, + "step": 981 + }, + { + "epoch": 0.3, + "grad_norm": 12.103147506713867, + "learning_rate": 1.8033476997093315e-05, + "loss": 3.8083, + "step": 982 + }, + { + "epoch": 0.3, + "grad_norm": 15.265912055969238, + "learning_rate": 1.8031472386488925e-05, + "loss": 2.1125, + "step": 983 + }, + { + "epoch": 0.3, + "grad_norm": 20.835962295532227, + "learning_rate": 1.8029467775884535e-05, + "loss": 2.128, + "step": 984 + }, + { + "epoch": 0.3, + "grad_norm": 27.07541847229004, + "learning_rate": 1.8027463165280145e-05, + "loss": 2.9855, + "step": 985 + }, + { + "epoch": 0.3, + "grad_norm": 17.84712791442871, + "learning_rate": 1.8025458554675755e-05, + "loss": 2.7988, + "step": 986 + }, + { + "epoch": 0.3, + "grad_norm": 15.661683082580566, + "learning_rate": 1.8023453944071365e-05, + "loss": 3.1825, + "step": 987 + }, + { + "epoch": 0.3, + "grad_norm": 21.294462203979492, + "learning_rate": 1.8021449333466975e-05, + "loss": 3.2597, + "step": 988 + }, + { + "epoch": 0.3, + "grad_norm": 14.863018035888672, + "learning_rate": 1.8019444722862585e-05, + "loss": 2.7602, + "step": 989 + }, + { + "epoch": 0.3, + "grad_norm": 14.493101119995117, + "learning_rate": 1.8017440112258195e-05, + "loss": 3.5392, + "step": 990 + }, + { + "epoch": 0.3, + "grad_norm": 16.697307586669922, + "learning_rate": 1.8015435501653806e-05, + "loss": 3.0317, + "step": 991 + }, + { + "epoch": 0.3, + "grad_norm": 21.106035232543945, + "learning_rate": 1.8013430891049416e-05, + "loss": 2.6286, + "step": 992 + }, + { + "epoch": 0.3, + "grad_norm": 14.834815979003906, + "learning_rate": 1.8011426280445026e-05, + "loss": 2.2224, + "step": 993 + }, + { + "epoch": 0.3, + "grad_norm": 16.66942596435547, + "learning_rate": 1.8009421669840636e-05, + "loss": 3.2398, + "step": 994 + }, + { + "epoch": 0.3, + "grad_norm": 20.955211639404297, + "learning_rate": 1.8007417059236246e-05, + "loss": 3.2694, + "step": 995 + }, + { + "epoch": 0.3, + "grad_norm": 16.78022575378418, + "learning_rate": 1.8005412448631853e-05, + "loss": 3.3401, + "step": 996 + }, + { + "epoch": 0.3, + "grad_norm": 10.238494873046875, + "learning_rate": 1.8003407838027466e-05, + "loss": 2.686, + "step": 997 + }, + { + "epoch": 0.3, + "grad_norm": 25.67633628845215, + "learning_rate": 1.8001403227423076e-05, + "loss": 4.3305, + "step": 998 + }, + { + "epoch": 0.3, + "grad_norm": 18.46412467956543, + "learning_rate": 1.7999398616818683e-05, + "loss": 4.0384, + "step": 999 + }, + { + "epoch": 0.3, + "grad_norm": 16.967388153076172, + "learning_rate": 1.7997394006214296e-05, + "loss": 2.752, + "step": 1000 + }, + { + "epoch": 0.3, + "grad_norm": 16.72841453552246, + "learning_rate": 1.7995389395609903e-05, + "loss": 1.9186, + "step": 1001 + }, + { + "epoch": 0.3, + "grad_norm": 17.65038299560547, + "learning_rate": 1.7993384785005513e-05, + "loss": 2.8156, + "step": 1002 + }, + { + "epoch": 0.3, + "grad_norm": 15.546381950378418, + "learning_rate": 1.7991380174401123e-05, + "loss": 2.9373, + "step": 1003 + }, + { + "epoch": 0.3, + "grad_norm": 13.01362133026123, + "learning_rate": 1.7989375563796733e-05, + "loss": 2.4807, + "step": 1004 + }, + { + "epoch": 0.3, + "grad_norm": 18.31947135925293, + "learning_rate": 1.7987370953192343e-05, + "loss": 2.6384, + "step": 1005 + }, + { + "epoch": 0.3, + "grad_norm": 19.710500717163086, + "learning_rate": 1.7985366342587953e-05, + "loss": 2.9314, + "step": 1006 + }, + { + "epoch": 0.3, + "grad_norm": 12.663653373718262, + "learning_rate": 1.7983361731983563e-05, + "loss": 1.9977, + "step": 1007 + }, + { + "epoch": 0.3, + "grad_norm": 23.557924270629883, + "learning_rate": 1.7981357121379174e-05, + "loss": 3.6326, + "step": 1008 + }, + { + "epoch": 0.3, + "grad_norm": 12.505498886108398, + "learning_rate": 1.7979352510774784e-05, + "loss": 2.3971, + "step": 1009 + }, + { + "epoch": 0.3, + "grad_norm": 21.709392547607422, + "learning_rate": 1.7977347900170394e-05, + "loss": 2.4877, + "step": 1010 + }, + { + "epoch": 0.3, + "grad_norm": 12.365106582641602, + "learning_rate": 1.7975343289566004e-05, + "loss": 3.1574, + "step": 1011 + }, + { + "epoch": 0.3, + "grad_norm": 18.052745819091797, + "learning_rate": 1.7973338678961614e-05, + "loss": 3.6156, + "step": 1012 + }, + { + "epoch": 0.3, + "grad_norm": 16.1375732421875, + "learning_rate": 1.7971334068357224e-05, + "loss": 3.0042, + "step": 1013 + }, + { + "epoch": 0.3, + "grad_norm": 15.52522087097168, + "learning_rate": 1.7969329457752834e-05, + "loss": 3.3669, + "step": 1014 + }, + { + "epoch": 0.31, + "grad_norm": 18.78600311279297, + "learning_rate": 1.796732484714844e-05, + "loss": 4.9568, + "step": 1015 + }, + { + "epoch": 0.31, + "grad_norm": 32.19879150390625, + "learning_rate": 1.7965320236544054e-05, + "loss": 3.4862, + "step": 1016 + }, + { + "epoch": 0.31, + "grad_norm": 22.817644119262695, + "learning_rate": 1.7963315625939664e-05, + "loss": 3.6447, + "step": 1017 + }, + { + "epoch": 0.31, + "grad_norm": 13.087103843688965, + "learning_rate": 1.796131101533527e-05, + "loss": 3.1701, + "step": 1018 + }, + { + "epoch": 0.31, + "grad_norm": 15.170973777770996, + "learning_rate": 1.7959306404730884e-05, + "loss": 2.907, + "step": 1019 + }, + { + "epoch": 0.31, + "grad_norm": 18.237186431884766, + "learning_rate": 1.795730179412649e-05, + "loss": 3.34, + "step": 1020 + }, + { + "epoch": 0.31, + "grad_norm": 21.131467819213867, + "learning_rate": 1.79552971835221e-05, + "loss": 3.0868, + "step": 1021 + }, + { + "epoch": 0.31, + "grad_norm": 21.808650970458984, + "learning_rate": 1.7953292572917715e-05, + "loss": 2.4194, + "step": 1022 + }, + { + "epoch": 0.31, + "grad_norm": 8.451123237609863, + "learning_rate": 1.795128796231332e-05, + "loss": 2.0371, + "step": 1023 + }, + { + "epoch": 0.31, + "grad_norm": 13.38468074798584, + "learning_rate": 1.794928335170893e-05, + "loss": 2.1737, + "step": 1024 + }, + { + "epoch": 0.31, + "grad_norm": 11.502458572387695, + "learning_rate": 1.794727874110454e-05, + "loss": 2.5593, + "step": 1025 + }, + { + "epoch": 0.31, + "grad_norm": 8.056184768676758, + "learning_rate": 1.794527413050015e-05, + "loss": 2.0176, + "step": 1026 + }, + { + "epoch": 0.31, + "grad_norm": 12.703288078308105, + "learning_rate": 1.7943269519895762e-05, + "loss": 3.1376, + "step": 1027 + }, + { + "epoch": 0.31, + "grad_norm": 18.933555603027344, + "learning_rate": 1.7941264909291372e-05, + "loss": 2.6452, + "step": 1028 + }, + { + "epoch": 0.31, + "grad_norm": 19.963056564331055, + "learning_rate": 1.7939260298686982e-05, + "loss": 3.3245, + "step": 1029 + }, + { + "epoch": 0.31, + "grad_norm": 11.356467247009277, + "learning_rate": 1.7937255688082592e-05, + "loss": 2.8729, + "step": 1030 + }, + { + "epoch": 0.31, + "grad_norm": 22.96855926513672, + "learning_rate": 1.7935251077478202e-05, + "loss": 3.4278, + "step": 1031 + }, + { + "epoch": 0.31, + "grad_norm": 9.410229682922363, + "learning_rate": 1.793324646687381e-05, + "loss": 2.4024, + "step": 1032 + }, + { + "epoch": 0.31, + "grad_norm": 15.325308799743652, + "learning_rate": 1.7931241856269422e-05, + "loss": 3.0026, + "step": 1033 + }, + { + "epoch": 0.31, + "grad_norm": 17.43353271484375, + "learning_rate": 1.7929237245665032e-05, + "loss": 3.4274, + "step": 1034 + }, + { + "epoch": 0.31, + "grad_norm": 18.91429328918457, + "learning_rate": 1.792723263506064e-05, + "loss": 2.8543, + "step": 1035 + }, + { + "epoch": 0.31, + "grad_norm": 13.564133644104004, + "learning_rate": 1.7925228024456253e-05, + "loss": 1.9484, + "step": 1036 + }, + { + "epoch": 0.31, + "grad_norm": 21.08160972595215, + "learning_rate": 1.792322341385186e-05, + "loss": 3.0899, + "step": 1037 + }, + { + "epoch": 0.31, + "grad_norm": 11.906721115112305, + "learning_rate": 1.792121880324747e-05, + "loss": 2.3356, + "step": 1038 + }, + { + "epoch": 0.31, + "grad_norm": 12.753665924072266, + "learning_rate": 1.791921419264308e-05, + "loss": 1.7421, + "step": 1039 + }, + { + "epoch": 0.31, + "grad_norm": 13.137770652770996, + "learning_rate": 1.791720958203869e-05, + "loss": 3.0557, + "step": 1040 + }, + { + "epoch": 0.31, + "grad_norm": 18.3424129486084, + "learning_rate": 1.7915204971434303e-05, + "loss": 2.9359, + "step": 1041 + }, + { + "epoch": 0.31, + "grad_norm": 14.686424255371094, + "learning_rate": 1.791320036082991e-05, + "loss": 2.947, + "step": 1042 + }, + { + "epoch": 0.31, + "grad_norm": 18.53430938720703, + "learning_rate": 1.791119575022552e-05, + "loss": 3.6603, + "step": 1043 + }, + { + "epoch": 0.31, + "grad_norm": 12.61064338684082, + "learning_rate": 1.790919113962113e-05, + "loss": 3.071, + "step": 1044 + }, + { + "epoch": 0.31, + "grad_norm": 16.695178985595703, + "learning_rate": 1.790718652901674e-05, + "loss": 3.6908, + "step": 1045 + }, + { + "epoch": 0.31, + "grad_norm": 21.377464294433594, + "learning_rate": 1.790518191841235e-05, + "loss": 2.0132, + "step": 1046 + }, + { + "epoch": 0.31, + "grad_norm": 12.307204246520996, + "learning_rate": 1.790317730780796e-05, + "loss": 3.3536, + "step": 1047 + }, + { + "epoch": 0.32, + "grad_norm": 18.136398315429688, + "learning_rate": 1.790117269720357e-05, + "loss": 3.3327, + "step": 1048 + }, + { + "epoch": 0.32, + "grad_norm": 14.812959671020508, + "learning_rate": 1.789916808659918e-05, + "loss": 2.203, + "step": 1049 + }, + { + "epoch": 0.32, + "grad_norm": 16.73038101196289, + "learning_rate": 1.789716347599479e-05, + "loss": 2.5606, + "step": 1050 + }, + { + "epoch": 0.32, + "grad_norm": 28.89952278137207, + "learning_rate": 1.7895158865390397e-05, + "loss": 3.7834, + "step": 1051 + }, + { + "epoch": 0.32, + "grad_norm": 16.880115509033203, + "learning_rate": 1.789315425478601e-05, + "loss": 2.5196, + "step": 1052 + }, + { + "epoch": 0.32, + "grad_norm": 13.364277839660645, + "learning_rate": 1.789114964418162e-05, + "loss": 2.9332, + "step": 1053 + }, + { + "epoch": 0.32, + "grad_norm": 34.66802978515625, + "learning_rate": 1.7889145033577227e-05, + "loss": 3.8191, + "step": 1054 + }, + { + "epoch": 0.32, + "grad_norm": 31.339599609375, + "learning_rate": 1.788714042297284e-05, + "loss": 2.9259, + "step": 1055 + }, + { + "epoch": 0.32, + "grad_norm": 20.67884063720703, + "learning_rate": 1.7885135812368447e-05, + "loss": 2.7581, + "step": 1056 + }, + { + "epoch": 0.32, + "grad_norm": 15.182465553283691, + "learning_rate": 1.7883131201764058e-05, + "loss": 2.8311, + "step": 1057 + }, + { + "epoch": 0.32, + "grad_norm": 18.39655303955078, + "learning_rate": 1.7881126591159668e-05, + "loss": 2.7088, + "step": 1058 + }, + { + "epoch": 0.32, + "grad_norm": 15.220178604125977, + "learning_rate": 1.7879121980555278e-05, + "loss": 3.1082, + "step": 1059 + }, + { + "epoch": 0.32, + "grad_norm": 13.494277954101562, + "learning_rate": 1.7877117369950888e-05, + "loss": 2.8682, + "step": 1060 + }, + { + "epoch": 0.32, + "grad_norm": 19.83290672302246, + "learning_rate": 1.7875112759346498e-05, + "loss": 3.3298, + "step": 1061 + }, + { + "epoch": 0.32, + "grad_norm": 22.487585067749023, + "learning_rate": 1.7873108148742108e-05, + "loss": 3.2538, + "step": 1062 + }, + { + "epoch": 0.32, + "grad_norm": 24.89381217956543, + "learning_rate": 1.7871103538137718e-05, + "loss": 2.5621, + "step": 1063 + }, + { + "epoch": 0.32, + "grad_norm": 17.16600227355957, + "learning_rate": 1.7869098927533328e-05, + "loss": 2.8225, + "step": 1064 + }, + { + "epoch": 0.32, + "grad_norm": 10.632941246032715, + "learning_rate": 1.7867094316928938e-05, + "loss": 3.0174, + "step": 1065 + }, + { + "epoch": 0.32, + "grad_norm": 20.951936721801758, + "learning_rate": 1.7865089706324548e-05, + "loss": 3.5367, + "step": 1066 + }, + { + "epoch": 0.32, + "grad_norm": 15.389318466186523, + "learning_rate": 1.786308509572016e-05, + "loss": 2.2259, + "step": 1067 + }, + { + "epoch": 0.32, + "grad_norm": 44.66872787475586, + "learning_rate": 1.786108048511577e-05, + "loss": 4.754, + "step": 1068 + }, + { + "epoch": 0.32, + "grad_norm": 12.777280807495117, + "learning_rate": 1.785907587451138e-05, + "loss": 3.2503, + "step": 1069 + }, + { + "epoch": 0.32, + "grad_norm": 130.45664978027344, + "learning_rate": 1.7857071263906985e-05, + "loss": 2.8328, + "step": 1070 + }, + { + "epoch": 0.32, + "grad_norm": 25.45984649658203, + "learning_rate": 1.78550666533026e-05, + "loss": 2.7516, + "step": 1071 + }, + { + "epoch": 0.32, + "grad_norm": 15.323507308959961, + "learning_rate": 1.785306204269821e-05, + "loss": 2.0025, + "step": 1072 + }, + { + "epoch": 0.32, + "grad_norm": 12.247516632080078, + "learning_rate": 1.7851057432093815e-05, + "loss": 2.7504, + "step": 1073 + }, + { + "epoch": 0.32, + "grad_norm": 33.093528747558594, + "learning_rate": 1.784905282148943e-05, + "loss": 3.8759, + "step": 1074 + }, + { + "epoch": 0.32, + "grad_norm": 19.78754234313965, + "learning_rate": 1.7847048210885036e-05, + "loss": 3.0278, + "step": 1075 + }, + { + "epoch": 0.32, + "grad_norm": 15.436639785766602, + "learning_rate": 1.7845043600280646e-05, + "loss": 3.5511, + "step": 1076 + }, + { + "epoch": 0.32, + "grad_norm": 12.693981170654297, + "learning_rate": 1.784303898967626e-05, + "loss": 3.4346, + "step": 1077 + }, + { + "epoch": 0.32, + "grad_norm": 17.650697708129883, + "learning_rate": 1.7841034379071866e-05, + "loss": 2.3388, + "step": 1078 + }, + { + "epoch": 0.32, + "grad_norm": 18.479936599731445, + "learning_rate": 1.7839029768467476e-05, + "loss": 3.4169, + "step": 1079 + }, + { + "epoch": 0.32, + "grad_norm": 14.039420127868652, + "learning_rate": 1.7837025157863086e-05, + "loss": 3.0089, + "step": 1080 + }, + { + "epoch": 0.32, + "eval_loss": 0.5645097494125366, + "eval_runtime": 43.9047, + "eval_samples_per_second": 33.687, + "eval_steps_per_second": 33.687, + "step": 1080 + }, + { + "epoch": 0.33, + "grad_norm": 18.384023666381836, + "learning_rate": 1.7835020547258696e-05, + "loss": 5.0499, + "step": 1081 + }, + { + "epoch": 0.33, + "grad_norm": 14.556102752685547, + "learning_rate": 1.7833015936654306e-05, + "loss": 2.7198, + "step": 1082 + }, + { + "epoch": 0.33, + "grad_norm": 16.748477935791016, + "learning_rate": 1.7831011326049916e-05, + "loss": 2.7809, + "step": 1083 + }, + { + "epoch": 0.33, + "grad_norm": 16.843488693237305, + "learning_rate": 1.7829006715445526e-05, + "loss": 3.1988, + "step": 1084 + }, + { + "epoch": 0.33, + "grad_norm": 60.20880889892578, + "learning_rate": 1.7827002104841137e-05, + "loss": 4.0739, + "step": 1085 + }, + { + "epoch": 0.33, + "grad_norm": 22.219043731689453, + "learning_rate": 1.7824997494236747e-05, + "loss": 3.1217, + "step": 1086 + }, + { + "epoch": 0.33, + "grad_norm": 15.742715835571289, + "learning_rate": 1.7822992883632357e-05, + "loss": 2.1444, + "step": 1087 + }, + { + "epoch": 0.33, + "grad_norm": 13.159707069396973, + "learning_rate": 1.7820988273027967e-05, + "loss": 3.7656, + "step": 1088 + }, + { + "epoch": 0.33, + "grad_norm": 18.79963493347168, + "learning_rate": 1.7818983662423573e-05, + "loss": 3.2334, + "step": 1089 + }, + { + "epoch": 0.33, + "grad_norm": 12.51858139038086, + "learning_rate": 1.7816979051819187e-05, + "loss": 1.6161, + "step": 1090 + }, + { + "epoch": 0.33, + "grad_norm": 18.549959182739258, + "learning_rate": 1.7814974441214797e-05, + "loss": 2.2857, + "step": 1091 + }, + { + "epoch": 0.33, + "grad_norm": 17.99864959716797, + "learning_rate": 1.7812969830610404e-05, + "loss": 2.2779, + "step": 1092 + }, + { + "epoch": 0.33, + "grad_norm": 16.206727981567383, + "learning_rate": 1.7810965220006017e-05, + "loss": 2.1039, + "step": 1093 + }, + { + "epoch": 0.33, + "grad_norm": 14.30031681060791, + "learning_rate": 1.7808960609401624e-05, + "loss": 3.3124, + "step": 1094 + }, + { + "epoch": 0.33, + "grad_norm": 48.405094146728516, + "learning_rate": 1.7806955998797234e-05, + "loss": 3.8335, + "step": 1095 + }, + { + "epoch": 0.33, + "grad_norm": 17.39105224609375, + "learning_rate": 1.7804951388192847e-05, + "loss": 2.8733, + "step": 1096 + }, + { + "epoch": 0.33, + "grad_norm": 23.60736656188965, + "learning_rate": 1.7802946777588454e-05, + "loss": 2.9195, + "step": 1097 + }, + { + "epoch": 0.33, + "grad_norm": 18.69573402404785, + "learning_rate": 1.7800942166984064e-05, + "loss": 3.4923, + "step": 1098 + }, + { + "epoch": 0.33, + "grad_norm": 31.806943893432617, + "learning_rate": 1.7798937556379674e-05, + "loss": 2.9115, + "step": 1099 + }, + { + "epoch": 0.33, + "grad_norm": 13.861745834350586, + "learning_rate": 1.7796932945775284e-05, + "loss": 1.3565, + "step": 1100 + }, + { + "epoch": 0.33, + "grad_norm": 23.198631286621094, + "learning_rate": 1.7794928335170894e-05, + "loss": 3.0973, + "step": 1101 + }, + { + "epoch": 0.33, + "grad_norm": 16.94276237487793, + "learning_rate": 1.7792923724566505e-05, + "loss": 3.5986, + "step": 1102 + }, + { + "epoch": 0.33, + "grad_norm": 9.507773399353027, + "learning_rate": 1.7790919113962115e-05, + "loss": 1.7677, + "step": 1103 + }, + { + "epoch": 0.33, + "grad_norm": 61.569091796875, + "learning_rate": 1.7788914503357725e-05, + "loss": 2.5007, + "step": 1104 + }, + { + "epoch": 0.33, + "grad_norm": 16.81398582458496, + "learning_rate": 1.7786909892753335e-05, + "loss": 3.5078, + "step": 1105 + }, + { + "epoch": 0.33, + "grad_norm": 15.168441772460938, + "learning_rate": 1.778490528214894e-05, + "loss": 2.3151, + "step": 1106 + }, + { + "epoch": 0.33, + "grad_norm": 16.73635482788086, + "learning_rate": 1.7782900671544555e-05, + "loss": 3.0094, + "step": 1107 + }, + { + "epoch": 0.33, + "grad_norm": 16.253849029541016, + "learning_rate": 1.7780896060940165e-05, + "loss": 2.7682, + "step": 1108 + }, + { + "epoch": 0.33, + "grad_norm": 14.130172729492188, + "learning_rate": 1.7778891450335775e-05, + "loss": 3.0624, + "step": 1109 + }, + { + "epoch": 0.33, + "grad_norm": 17.684720993041992, + "learning_rate": 1.7776886839731385e-05, + "loss": 3.3596, + "step": 1110 + }, + { + "epoch": 0.33, + "grad_norm": 29.59230613708496, + "learning_rate": 1.7774882229126992e-05, + "loss": 3.1807, + "step": 1111 + }, + { + "epoch": 0.33, + "grad_norm": 11.93481159210205, + "learning_rate": 1.7772877618522605e-05, + "loss": 2.8107, + "step": 1112 + }, + { + "epoch": 0.33, + "grad_norm": 16.962127685546875, + "learning_rate": 1.7770873007918212e-05, + "loss": 2.9108, + "step": 1113 + }, + { + "epoch": 0.33, + "grad_norm": 13.559270858764648, + "learning_rate": 1.7768868397313822e-05, + "loss": 3.5231, + "step": 1114 + }, + { + "epoch": 0.34, + "grad_norm": 12.885640144348145, + "learning_rate": 1.7766863786709436e-05, + "loss": 2.7105, + "step": 1115 + }, + { + "epoch": 0.34, + "grad_norm": 29.005674362182617, + "learning_rate": 1.7764859176105042e-05, + "loss": 3.413, + "step": 1116 + }, + { + "epoch": 0.34, + "grad_norm": 13.02987289428711, + "learning_rate": 1.7762854565500652e-05, + "loss": 1.8334, + "step": 1117 + }, + { + "epoch": 0.34, + "grad_norm": 14.34976863861084, + "learning_rate": 1.7760849954896263e-05, + "loss": 1.5514, + "step": 1118 + }, + { + "epoch": 0.34, + "grad_norm": 22.86460304260254, + "learning_rate": 1.7758845344291873e-05, + "loss": 3.4641, + "step": 1119 + }, + { + "epoch": 0.34, + "grad_norm": 28.6113224029541, + "learning_rate": 1.7756840733687483e-05, + "loss": 3.761, + "step": 1120 + }, + { + "epoch": 0.34, + "grad_norm": 16.072839736938477, + "learning_rate": 1.7754836123083093e-05, + "loss": 1.5352, + "step": 1121 + }, + { + "epoch": 0.34, + "grad_norm": 18.935298919677734, + "learning_rate": 1.7752831512478703e-05, + "loss": 2.8795, + "step": 1122 + }, + { + "epoch": 0.34, + "grad_norm": 22.4312686920166, + "learning_rate": 1.7750826901874313e-05, + "loss": 4.1038, + "step": 1123 + }, + { + "epoch": 0.34, + "grad_norm": 13.36023998260498, + "learning_rate": 1.7748822291269923e-05, + "loss": 2.9175, + "step": 1124 + }, + { + "epoch": 0.34, + "grad_norm": 10.283577919006348, + "learning_rate": 1.774681768066553e-05, + "loss": 1.6504, + "step": 1125 + }, + { + "epoch": 0.34, + "grad_norm": 16.4921817779541, + "learning_rate": 1.7744813070061143e-05, + "loss": 3.4366, + "step": 1126 + }, + { + "epoch": 0.34, + "grad_norm": 24.69314193725586, + "learning_rate": 1.7742808459456753e-05, + "loss": 3.5184, + "step": 1127 + }, + { + "epoch": 0.34, + "grad_norm": 15.775534629821777, + "learning_rate": 1.774080384885236e-05, + "loss": 3.0247, + "step": 1128 + }, + { + "epoch": 0.34, + "grad_norm": 35.886539459228516, + "learning_rate": 1.7738799238247973e-05, + "loss": 3.0789, + "step": 1129 + }, + { + "epoch": 0.34, + "grad_norm": 13.481932640075684, + "learning_rate": 1.773679462764358e-05, + "loss": 2.6866, + "step": 1130 + }, + { + "epoch": 0.34, + "grad_norm": 13.571722030639648, + "learning_rate": 1.773479001703919e-05, + "loss": 3.6435, + "step": 1131 + }, + { + "epoch": 0.34, + "grad_norm": 12.800562858581543, + "learning_rate": 1.77327854064348e-05, + "loss": 2.6728, + "step": 1132 + }, + { + "epoch": 0.34, + "grad_norm": 28.864028930664062, + "learning_rate": 1.773078079583041e-05, + "loss": 1.8873, + "step": 1133 + }, + { + "epoch": 0.34, + "grad_norm": 23.177940368652344, + "learning_rate": 1.772877618522602e-05, + "loss": 3.7982, + "step": 1134 + }, + { + "epoch": 0.34, + "grad_norm": 18.07777214050293, + "learning_rate": 1.772677157462163e-05, + "loss": 3.1977, + "step": 1135 + }, + { + "epoch": 0.34, + "grad_norm": 25.78776741027832, + "learning_rate": 1.772476696401724e-05, + "loss": 2.4591, + "step": 1136 + }, + { + "epoch": 0.34, + "grad_norm": 14.545743942260742, + "learning_rate": 1.772276235341285e-05, + "loss": 2.5043, + "step": 1137 + }, + { + "epoch": 0.34, + "grad_norm": 12.708026885986328, + "learning_rate": 1.772075774280846e-05, + "loss": 3.234, + "step": 1138 + }, + { + "epoch": 0.34, + "grad_norm": 15.227660179138184, + "learning_rate": 1.771875313220407e-05, + "loss": 3.9899, + "step": 1139 + }, + { + "epoch": 0.34, + "grad_norm": 23.074007034301758, + "learning_rate": 1.771674852159968e-05, + "loss": 1.94, + "step": 1140 + }, + { + "epoch": 0.34, + "grad_norm": 20.678417205810547, + "learning_rate": 1.771474391099529e-05, + "loss": 3.0762, + "step": 1141 + }, + { + "epoch": 0.34, + "grad_norm": 10.947648048400879, + "learning_rate": 1.77127393003909e-05, + "loss": 3.0063, + "step": 1142 + }, + { + "epoch": 0.34, + "grad_norm": 12.576761245727539, + "learning_rate": 1.771073468978651e-05, + "loss": 2.5958, + "step": 1143 + }, + { + "epoch": 0.34, + "grad_norm": 25.190330505371094, + "learning_rate": 1.7708730079182118e-05, + "loss": 2.9465, + "step": 1144 + }, + { + "epoch": 0.34, + "grad_norm": 14.625896453857422, + "learning_rate": 1.770672546857773e-05, + "loss": 2.337, + "step": 1145 + }, + { + "epoch": 0.34, + "grad_norm": 13.233895301818848, + "learning_rate": 1.770472085797334e-05, + "loss": 2.7883, + "step": 1146 + }, + { + "epoch": 0.34, + "grad_norm": 11.731785774230957, + "learning_rate": 1.7702716247368948e-05, + "loss": 2.4522, + "step": 1147 + }, + { + "epoch": 0.35, + "grad_norm": 13.378613471984863, + "learning_rate": 1.770071163676456e-05, + "loss": 2.6688, + "step": 1148 + }, + { + "epoch": 0.35, + "grad_norm": 34.357383728027344, + "learning_rate": 1.769870702616017e-05, + "loss": 3.0263, + "step": 1149 + }, + { + "epoch": 0.35, + "grad_norm": 13.445833206176758, + "learning_rate": 1.769670241555578e-05, + "loss": 1.7473, + "step": 1150 + }, + { + "epoch": 0.35, + "grad_norm": 13.28929328918457, + "learning_rate": 1.7694697804951392e-05, + "loss": 2.3902, + "step": 1151 + }, + { + "epoch": 0.35, + "grad_norm": 19.005510330200195, + "learning_rate": 1.7692693194347e-05, + "loss": 2.506, + "step": 1152 + }, + { + "epoch": 0.35, + "grad_norm": 18.774620056152344, + "learning_rate": 1.769068858374261e-05, + "loss": 2.8765, + "step": 1153 + }, + { + "epoch": 0.35, + "grad_norm": 21.18247413635254, + "learning_rate": 1.768868397313822e-05, + "loss": 2.4275, + "step": 1154 + }, + { + "epoch": 0.35, + "grad_norm": 22.34467124938965, + "learning_rate": 1.768667936253383e-05, + "loss": 2.6174, + "step": 1155 + }, + { + "epoch": 0.35, + "grad_norm": 28.30898094177246, + "learning_rate": 1.768467475192944e-05, + "loss": 3.1029, + "step": 1156 + }, + { + "epoch": 0.35, + "grad_norm": 12.033406257629395, + "learning_rate": 1.768267014132505e-05, + "loss": 3.4031, + "step": 1157 + }, + { + "epoch": 0.35, + "grad_norm": 16.54393768310547, + "learning_rate": 1.768066553072066e-05, + "loss": 2.6641, + "step": 1158 + }, + { + "epoch": 0.35, + "grad_norm": 29.7037410736084, + "learning_rate": 1.767866092011627e-05, + "loss": 2.8023, + "step": 1159 + }, + { + "epoch": 0.35, + "grad_norm": 17.149572372436523, + "learning_rate": 1.767665630951188e-05, + "loss": 3.3883, + "step": 1160 + }, + { + "epoch": 0.35, + "grad_norm": 32.76160430908203, + "learning_rate": 1.767465169890749e-05, + "loss": 3.1329, + "step": 1161 + }, + { + "epoch": 0.35, + "grad_norm": 13.081724166870117, + "learning_rate": 1.76726470883031e-05, + "loss": 3.1289, + "step": 1162 + }, + { + "epoch": 0.35, + "grad_norm": 29.665246963500977, + "learning_rate": 1.767064247769871e-05, + "loss": 3.6106, + "step": 1163 + }, + { + "epoch": 0.35, + "grad_norm": 19.989797592163086, + "learning_rate": 1.766863786709432e-05, + "loss": 3.4249, + "step": 1164 + }, + { + "epoch": 0.35, + "grad_norm": 11.273635864257812, + "learning_rate": 1.766663325648993e-05, + "loss": 3.0172, + "step": 1165 + }, + { + "epoch": 0.35, + "grad_norm": 25.350624084472656, + "learning_rate": 1.7664628645885536e-05, + "loss": 2.8628, + "step": 1166 + }, + { + "epoch": 0.35, + "grad_norm": 14.034561157226562, + "learning_rate": 1.766262403528115e-05, + "loss": 2.7608, + "step": 1167 + }, + { + "epoch": 0.35, + "grad_norm": 20.618732452392578, + "learning_rate": 1.7660619424676757e-05, + "loss": 2.6732, + "step": 1168 + }, + { + "epoch": 0.35, + "grad_norm": 18.557788848876953, + "learning_rate": 1.7658614814072367e-05, + "loss": 4.2311, + "step": 1169 + }, + { + "epoch": 0.35, + "grad_norm": 22.748462677001953, + "learning_rate": 1.765661020346798e-05, + "loss": 3.0337, + "step": 1170 + }, + { + "epoch": 0.35, + "grad_norm": 15.054831504821777, + "learning_rate": 1.7654605592863587e-05, + "loss": 2.5534, + "step": 1171 + }, + { + "epoch": 0.35, + "grad_norm": 12.077056884765625, + "learning_rate": 1.7652600982259197e-05, + "loss": 1.9834, + "step": 1172 + }, + { + "epoch": 0.35, + "grad_norm": 17.17857551574707, + "learning_rate": 1.7650596371654807e-05, + "loss": 3.1654, + "step": 1173 + }, + { + "epoch": 0.35, + "grad_norm": 13.104513168334961, + "learning_rate": 1.7648591761050417e-05, + "loss": 2.999, + "step": 1174 + }, + { + "epoch": 0.35, + "grad_norm": 14.851997375488281, + "learning_rate": 1.7646587150446027e-05, + "loss": 3.3295, + "step": 1175 + }, + { + "epoch": 0.35, + "grad_norm": 12.090683937072754, + "learning_rate": 1.7644582539841637e-05, + "loss": 2.4796, + "step": 1176 + }, + { + "epoch": 0.35, + "grad_norm": 13.724775314331055, + "learning_rate": 1.7642577929237247e-05, + "loss": 2.9697, + "step": 1177 + }, + { + "epoch": 0.35, + "grad_norm": 26.366907119750977, + "learning_rate": 1.7640573318632857e-05, + "loss": 3.2335, + "step": 1178 + }, + { + "epoch": 0.35, + "grad_norm": 10.779017448425293, + "learning_rate": 1.7638568708028467e-05, + "loss": 2.4018, + "step": 1179 + }, + { + "epoch": 0.35, + "grad_norm": 22.231727600097656, + "learning_rate": 1.7636564097424078e-05, + "loss": 3.2377, + "step": 1180 + }, + { + "epoch": 0.36, + "grad_norm": 18.77667236328125, + "learning_rate": 1.7634559486819688e-05, + "loss": 3.0199, + "step": 1181 + }, + { + "epoch": 0.36, + "grad_norm": 16.207422256469727, + "learning_rate": 1.7632554876215298e-05, + "loss": 2.7055, + "step": 1182 + }, + { + "epoch": 0.36, + "grad_norm": 16.809715270996094, + "learning_rate": 1.7630550265610908e-05, + "loss": 2.9821, + "step": 1183 + }, + { + "epoch": 0.36, + "grad_norm": 16.3651065826416, + "learning_rate": 1.7628545655006518e-05, + "loss": 2.1699, + "step": 1184 + }, + { + "epoch": 0.36, + "grad_norm": 11.468558311462402, + "learning_rate": 1.7626541044402125e-05, + "loss": 2.6325, + "step": 1185 + }, + { + "epoch": 0.36, + "grad_norm": 16.051881790161133, + "learning_rate": 1.7624536433797738e-05, + "loss": 2.3443, + "step": 1186 + }, + { + "epoch": 0.36, + "grad_norm": 12.057586669921875, + "learning_rate": 1.7622531823193345e-05, + "loss": 2.7623, + "step": 1187 + }, + { + "epoch": 0.36, + "grad_norm": 17.24924659729004, + "learning_rate": 1.7620527212588955e-05, + "loss": 3.3553, + "step": 1188 + }, + { + "epoch": 0.36, + "grad_norm": 12.06871223449707, + "learning_rate": 1.761852260198457e-05, + "loss": 1.9598, + "step": 1189 + }, + { + "epoch": 0.36, + "grad_norm": 20.307418823242188, + "learning_rate": 1.7616517991380175e-05, + "loss": 3.1637, + "step": 1190 + }, + { + "epoch": 0.36, + "grad_norm": 11.176687240600586, + "learning_rate": 1.7614513380775785e-05, + "loss": 2.5982, + "step": 1191 + }, + { + "epoch": 0.36, + "grad_norm": 14.596364974975586, + "learning_rate": 1.7612508770171395e-05, + "loss": 2.5782, + "step": 1192 + }, + { + "epoch": 0.36, + "grad_norm": 16.369272232055664, + "learning_rate": 1.7610504159567005e-05, + "loss": 2.7876, + "step": 1193 + }, + { + "epoch": 0.36, + "grad_norm": 16.84731101989746, + "learning_rate": 1.7608499548962615e-05, + "loss": 2.9056, + "step": 1194 + }, + { + "epoch": 0.36, + "grad_norm": 51.60824203491211, + "learning_rate": 1.7606494938358225e-05, + "loss": 2.3254, + "step": 1195 + }, + { + "epoch": 0.36, + "grad_norm": 18.47427749633789, + "learning_rate": 1.7604490327753836e-05, + "loss": 2.9355, + "step": 1196 + }, + { + "epoch": 0.36, + "grad_norm": 15.123434066772461, + "learning_rate": 1.7602485717149446e-05, + "loss": 2.588, + "step": 1197 + }, + { + "epoch": 0.36, + "grad_norm": 19.959136962890625, + "learning_rate": 1.7600481106545056e-05, + "loss": 2.5238, + "step": 1198 + }, + { + "epoch": 0.36, + "grad_norm": 19.62584114074707, + "learning_rate": 1.7598476495940662e-05, + "loss": 3.2818, + "step": 1199 + }, + { + "epoch": 0.36, + "grad_norm": 19.437580108642578, + "learning_rate": 1.7596471885336276e-05, + "loss": 2.8019, + "step": 1200 + }, + { + "epoch": 0.36, + "eval_loss": 0.5124673843383789, + "eval_runtime": 43.4829, + "eval_samples_per_second": 34.013, + "eval_steps_per_second": 34.013, + "step": 1200 + }, + { + "epoch": 0.36, + "grad_norm": 41.26815414428711, + "learning_rate": 1.7594467274731886e-05, + "loss": 3.0088, + "step": 1201 + }, + { + "epoch": 0.36, + "grad_norm": 23.42255973815918, + "learning_rate": 1.7592462664127493e-05, + "loss": 3.0811, + "step": 1202 + }, + { + "epoch": 0.36, + "grad_norm": 24.135406494140625, + "learning_rate": 1.7590458053523106e-05, + "loss": 3.4351, + "step": 1203 + }, + { + "epoch": 0.36, + "grad_norm": 11.820728302001953, + "learning_rate": 1.7588453442918713e-05, + "loss": 2.2131, + "step": 1204 + }, + { + "epoch": 0.36, + "grad_norm": 12.589012145996094, + "learning_rate": 1.7586448832314323e-05, + "loss": 2.5815, + "step": 1205 + }, + { + "epoch": 0.36, + "grad_norm": 14.668851852416992, + "learning_rate": 1.7584444221709936e-05, + "loss": 2.9765, + "step": 1206 + }, + { + "epoch": 0.36, + "grad_norm": 13.122230529785156, + "learning_rate": 1.7582439611105543e-05, + "loss": 2.3476, + "step": 1207 + }, + { + "epoch": 0.36, + "grad_norm": 17.157947540283203, + "learning_rate": 1.7580435000501153e-05, + "loss": 2.8408, + "step": 1208 + }, + { + "epoch": 0.36, + "grad_norm": 21.18256950378418, + "learning_rate": 1.7578430389896763e-05, + "loss": 2.5619, + "step": 1209 + }, + { + "epoch": 0.36, + "grad_norm": 19.1740779876709, + "learning_rate": 1.7576425779292373e-05, + "loss": 4.0597, + "step": 1210 + }, + { + "epoch": 0.36, + "grad_norm": 20.59099769592285, + "learning_rate": 1.7574421168687983e-05, + "loss": 1.8192, + "step": 1211 + }, + { + "epoch": 0.36, + "grad_norm": 31.291309356689453, + "learning_rate": 1.7572416558083593e-05, + "loss": 3.6861, + "step": 1212 + }, + { + "epoch": 0.36, + "grad_norm": 11.655035018920898, + "learning_rate": 1.7570411947479204e-05, + "loss": 2.3163, + "step": 1213 + }, + { + "epoch": 0.37, + "grad_norm": 27.356048583984375, + "learning_rate": 1.7568407336874814e-05, + "loss": 3.9249, + "step": 1214 + }, + { + "epoch": 0.37, + "grad_norm": 22.89837646484375, + "learning_rate": 1.7566402726270424e-05, + "loss": 2.6824, + "step": 1215 + }, + { + "epoch": 0.37, + "grad_norm": 16.46586799621582, + "learning_rate": 1.7564398115666034e-05, + "loss": 2.5388, + "step": 1216 + }, + { + "epoch": 0.37, + "grad_norm": 18.528961181640625, + "learning_rate": 1.7562393505061644e-05, + "loss": 2.435, + "step": 1217 + }, + { + "epoch": 0.37, + "grad_norm": 10.361041069030762, + "learning_rate": 1.756038889445725e-05, + "loss": 2.5633, + "step": 1218 + }, + { + "epoch": 0.37, + "grad_norm": 17.265287399291992, + "learning_rate": 1.7558384283852864e-05, + "loss": 2.3523, + "step": 1219 + }, + { + "epoch": 0.37, + "grad_norm": 18.783292770385742, + "learning_rate": 1.7556379673248474e-05, + "loss": 2.3425, + "step": 1220 + }, + { + "epoch": 0.37, + "grad_norm": 21.62091827392578, + "learning_rate": 1.755437506264408e-05, + "loss": 2.7889, + "step": 1221 + }, + { + "epoch": 0.37, + "grad_norm": 19.435802459716797, + "learning_rate": 1.7552370452039694e-05, + "loss": 3.4363, + "step": 1222 + }, + { + "epoch": 0.37, + "grad_norm": 13.962774276733398, + "learning_rate": 1.75503658414353e-05, + "loss": 1.6731, + "step": 1223 + }, + { + "epoch": 0.37, + "grad_norm": 10.637103080749512, + "learning_rate": 1.754836123083091e-05, + "loss": 1.9421, + "step": 1224 + }, + { + "epoch": 0.37, + "grad_norm": 12.985700607299805, + "learning_rate": 1.7546356620226525e-05, + "loss": 2.2144, + "step": 1225 + }, + { + "epoch": 0.37, + "grad_norm": 16.512508392333984, + "learning_rate": 1.754435200962213e-05, + "loss": 2.4662, + "step": 1226 + }, + { + "epoch": 0.37, + "grad_norm": 20.497188568115234, + "learning_rate": 1.754234739901774e-05, + "loss": 2.0858, + "step": 1227 + }, + { + "epoch": 0.37, + "grad_norm": 32.291175842285156, + "learning_rate": 1.754034278841335e-05, + "loss": 3.3415, + "step": 1228 + }, + { + "epoch": 0.37, + "grad_norm": 16.312284469604492, + "learning_rate": 1.753833817780896e-05, + "loss": 2.2323, + "step": 1229 + }, + { + "epoch": 0.37, + "grad_norm": 23.16538429260254, + "learning_rate": 1.753633356720457e-05, + "loss": 3.7011, + "step": 1230 + }, + { + "epoch": 0.37, + "grad_norm": 9.652095794677734, + "learning_rate": 1.7534328956600182e-05, + "loss": 2.5907, + "step": 1231 + }, + { + "epoch": 0.37, + "grad_norm": 16.361186981201172, + "learning_rate": 1.7532324345995792e-05, + "loss": 3.1489, + "step": 1232 + }, + { + "epoch": 0.37, + "grad_norm": 24.299665451049805, + "learning_rate": 1.7530319735391402e-05, + "loss": 2.751, + "step": 1233 + }, + { + "epoch": 0.37, + "grad_norm": 11.663421630859375, + "learning_rate": 1.7528315124787012e-05, + "loss": 2.2756, + "step": 1234 + }, + { + "epoch": 0.37, + "grad_norm": 16.520639419555664, + "learning_rate": 1.7526310514182622e-05, + "loss": 1.8753, + "step": 1235 + }, + { + "epoch": 0.37, + "grad_norm": 30.01593780517578, + "learning_rate": 1.7524305903578232e-05, + "loss": 1.1438, + "step": 1236 + }, + { + "epoch": 0.37, + "grad_norm": 28.729244232177734, + "learning_rate": 1.7522301292973842e-05, + "loss": 3.4131, + "step": 1237 + }, + { + "epoch": 0.37, + "grad_norm": 18.10856819152832, + "learning_rate": 1.7520296682369452e-05, + "loss": 2.8804, + "step": 1238 + }, + { + "epoch": 0.37, + "grad_norm": 18.52988052368164, + "learning_rate": 1.7518292071765062e-05, + "loss": 2.57, + "step": 1239 + }, + { + "epoch": 0.37, + "grad_norm": 18.071670532226562, + "learning_rate": 1.751628746116067e-05, + "loss": 2.5888, + "step": 1240 + }, + { + "epoch": 0.37, + "grad_norm": 9.437912940979004, + "learning_rate": 1.7514282850556283e-05, + "loss": 2.1698, + "step": 1241 + }, + { + "epoch": 0.37, + "grad_norm": 11.90031623840332, + "learning_rate": 1.751227823995189e-05, + "loss": 2.5627, + "step": 1242 + }, + { + "epoch": 0.37, + "grad_norm": 12.437149047851562, + "learning_rate": 1.75102736293475e-05, + "loss": 2.2992, + "step": 1243 + }, + { + "epoch": 0.37, + "grad_norm": 27.860820770263672, + "learning_rate": 1.7508269018743113e-05, + "loss": 3.5747, + "step": 1244 + }, + { + "epoch": 0.37, + "grad_norm": 26.30794334411621, + "learning_rate": 1.750626440813872e-05, + "loss": 3.8743, + "step": 1245 + }, + { + "epoch": 0.37, + "grad_norm": 17.703901290893555, + "learning_rate": 1.750425979753433e-05, + "loss": 2.338, + "step": 1246 + }, + { + "epoch": 0.37, + "grad_norm": 10.545022010803223, + "learning_rate": 1.750225518692994e-05, + "loss": 1.9899, + "step": 1247 + }, + { + "epoch": 0.38, + "grad_norm": 24.279394149780273, + "learning_rate": 1.750025057632555e-05, + "loss": 2.9463, + "step": 1248 + }, + { + "epoch": 0.38, + "grad_norm": 21.58678436279297, + "learning_rate": 1.749824596572116e-05, + "loss": 3.4896, + "step": 1249 + }, + { + "epoch": 0.38, + "grad_norm": 13.904131889343262, + "learning_rate": 1.749624135511677e-05, + "loss": 3.1997, + "step": 1250 + }, + { + "epoch": 0.38, + "grad_norm": 10.544210433959961, + "learning_rate": 1.749423674451238e-05, + "loss": 1.9443, + "step": 1251 + }, + { + "epoch": 0.38, + "grad_norm": 16.909954071044922, + "learning_rate": 1.749223213390799e-05, + "loss": 3.1345, + "step": 1252 + }, + { + "epoch": 0.38, + "grad_norm": 18.661376953125, + "learning_rate": 1.74902275233036e-05, + "loss": 3.7376, + "step": 1253 + }, + { + "epoch": 0.38, + "grad_norm": 16.603628158569336, + "learning_rate": 1.748822291269921e-05, + "loss": 2.108, + "step": 1254 + }, + { + "epoch": 0.38, + "grad_norm": 12.670201301574707, + "learning_rate": 1.748621830209482e-05, + "loss": 3.4042, + "step": 1255 + }, + { + "epoch": 0.38, + "grad_norm": 12.029052734375, + "learning_rate": 1.748421369149043e-05, + "loss": 2.9556, + "step": 1256 + }, + { + "epoch": 0.38, + "grad_norm": 28.61595916748047, + "learning_rate": 1.748220908088604e-05, + "loss": 2.0854, + "step": 1257 + }, + { + "epoch": 0.38, + "grad_norm": 22.586305618286133, + "learning_rate": 1.748020447028165e-05, + "loss": 2.461, + "step": 1258 + }, + { + "epoch": 0.38, + "grad_norm": 14.331897735595703, + "learning_rate": 1.7478199859677257e-05, + "loss": 3.5532, + "step": 1259 + }, + { + "epoch": 0.38, + "grad_norm": 18.208744049072266, + "learning_rate": 1.747619524907287e-05, + "loss": 2.59, + "step": 1260 + }, + { + "epoch": 0.38, + "grad_norm": 26.50733757019043, + "learning_rate": 1.7474190638468477e-05, + "loss": 3.567, + "step": 1261 + }, + { + "epoch": 0.38, + "grad_norm": 17.948020935058594, + "learning_rate": 1.7472186027864088e-05, + "loss": 2.5004, + "step": 1262 + }, + { + "epoch": 0.38, + "grad_norm": 11.927952766418457, + "learning_rate": 1.74701814172597e-05, + "loss": 2.1265, + "step": 1263 + }, + { + "epoch": 0.38, + "grad_norm": 26.811809539794922, + "learning_rate": 1.7468176806655308e-05, + "loss": 2.7967, + "step": 1264 + }, + { + "epoch": 0.38, + "grad_norm": 82.8468246459961, + "learning_rate": 1.7466172196050918e-05, + "loss": 2.652, + "step": 1265 + }, + { + "epoch": 0.38, + "grad_norm": 13.454938888549805, + "learning_rate": 1.7464167585446528e-05, + "loss": 2.282, + "step": 1266 + }, + { + "epoch": 0.38, + "grad_norm": 20.55266571044922, + "learning_rate": 1.7462162974842138e-05, + "loss": 2.3296, + "step": 1267 + }, + { + "epoch": 0.38, + "grad_norm": 10.279245376586914, + "learning_rate": 1.7460158364237748e-05, + "loss": 1.3453, + "step": 1268 + }, + { + "epoch": 0.38, + "grad_norm": 11.651695251464844, + "learning_rate": 1.7458153753633358e-05, + "loss": 2.5631, + "step": 1269 + }, + { + "epoch": 0.38, + "grad_norm": 12.197574615478516, + "learning_rate": 1.7456149143028968e-05, + "loss": 3.0425, + "step": 1270 + }, + { + "epoch": 0.38, + "grad_norm": 24.782493591308594, + "learning_rate": 1.7454144532424578e-05, + "loss": 3.1392, + "step": 1271 + }, + { + "epoch": 0.38, + "grad_norm": 10.844775199890137, + "learning_rate": 1.745213992182019e-05, + "loss": 2.9512, + "step": 1272 + }, + { + "epoch": 0.38, + "grad_norm": 13.511686325073242, + "learning_rate": 1.7450135311215795e-05, + "loss": 2.8739, + "step": 1273 + }, + { + "epoch": 0.38, + "grad_norm": 21.19719886779785, + "learning_rate": 1.744813070061141e-05, + "loss": 2.8494, + "step": 1274 + }, + { + "epoch": 0.38, + "grad_norm": 17.959545135498047, + "learning_rate": 1.744612609000702e-05, + "loss": 2.8389, + "step": 1275 + }, + { + "epoch": 0.38, + "grad_norm": 17.719860076904297, + "learning_rate": 1.7444121479402625e-05, + "loss": 2.7844, + "step": 1276 + }, + { + "epoch": 0.38, + "grad_norm": 15.047807693481445, + "learning_rate": 1.744211686879824e-05, + "loss": 2.5462, + "step": 1277 + }, + { + "epoch": 0.38, + "grad_norm": 18.358489990234375, + "learning_rate": 1.7440112258193846e-05, + "loss": 2.8936, + "step": 1278 + }, + { + "epoch": 0.38, + "grad_norm": 20.895231246948242, + "learning_rate": 1.743810764758946e-05, + "loss": 3.1691, + "step": 1279 + }, + { + "epoch": 0.38, + "grad_norm": 10.4979248046875, + "learning_rate": 1.743610303698507e-05, + "loss": 1.5841, + "step": 1280 + }, + { + "epoch": 0.39, + "grad_norm": 14.42455768585205, + "learning_rate": 1.7434098426380676e-05, + "loss": 2.7656, + "step": 1281 + }, + { + "epoch": 0.39, + "grad_norm": 11.873161315917969, + "learning_rate": 1.743209381577629e-05, + "loss": 2.9719, + "step": 1282 + }, + { + "epoch": 0.39, + "grad_norm": 23.232919692993164, + "learning_rate": 1.7430089205171896e-05, + "loss": 2.5282, + "step": 1283 + }, + { + "epoch": 0.39, + "grad_norm": 15.954151153564453, + "learning_rate": 1.7428084594567506e-05, + "loss": 3.3602, + "step": 1284 + }, + { + "epoch": 0.39, + "grad_norm": 15.332113265991211, + "learning_rate": 1.7426079983963116e-05, + "loss": 3.1367, + "step": 1285 + }, + { + "epoch": 0.39, + "grad_norm": 17.88947296142578, + "learning_rate": 1.7424075373358726e-05, + "loss": 2.8707, + "step": 1286 + }, + { + "epoch": 0.39, + "grad_norm": 14.455220222473145, + "learning_rate": 1.7422070762754336e-05, + "loss": 2.7743, + "step": 1287 + }, + { + "epoch": 0.39, + "grad_norm": 12.738212585449219, + "learning_rate": 1.7420066152149946e-05, + "loss": 2.4288, + "step": 1288 + }, + { + "epoch": 0.39, + "grad_norm": 17.96286392211914, + "learning_rate": 1.7418061541545556e-05, + "loss": 2.1277, + "step": 1289 + }, + { + "epoch": 0.39, + "grad_norm": 10.881392478942871, + "learning_rate": 1.7416056930941167e-05, + "loss": 3.3985, + "step": 1290 + }, + { + "epoch": 0.39, + "grad_norm": 9.816652297973633, + "learning_rate": 1.7414052320336777e-05, + "loss": 1.9979, + "step": 1291 + }, + { + "epoch": 0.39, + "grad_norm": 14.846339225769043, + "learning_rate": 1.7412047709732387e-05, + "loss": 3.2407, + "step": 1292 + }, + { + "epoch": 0.39, + "grad_norm": 24.287704467773438, + "learning_rate": 1.7410043099127997e-05, + "loss": 3.1519, + "step": 1293 + }, + { + "epoch": 0.39, + "grad_norm": 18.814102172851562, + "learning_rate": 1.7408038488523607e-05, + "loss": 3.9634, + "step": 1294 + }, + { + "epoch": 0.39, + "grad_norm": 21.42952537536621, + "learning_rate": 1.7406033877919214e-05, + "loss": 3.3032, + "step": 1295 + }, + { + "epoch": 0.39, + "grad_norm": 14.19246768951416, + "learning_rate": 1.7404029267314827e-05, + "loss": 2.3004, + "step": 1296 + }, + { + "epoch": 0.39, + "grad_norm": 32.35289001464844, + "learning_rate": 1.7402024656710434e-05, + "loss": 4.9394, + "step": 1297 + }, + { + "epoch": 0.39, + "grad_norm": 14.341626167297363, + "learning_rate": 1.7400020046106044e-05, + "loss": 2.9232, + "step": 1298 + }, + { + "epoch": 0.39, + "grad_norm": 13.265176773071289, + "learning_rate": 1.7398015435501657e-05, + "loss": 3.7512, + "step": 1299 + }, + { + "epoch": 0.39, + "grad_norm": 15.265084266662598, + "learning_rate": 1.7396010824897264e-05, + "loss": 3.0586, + "step": 1300 + }, + { + "epoch": 0.39, + "grad_norm": 19.14535903930664, + "learning_rate": 1.7394006214292874e-05, + "loss": 3.9271, + "step": 1301 + }, + { + "epoch": 0.39, + "grad_norm": 19.937034606933594, + "learning_rate": 1.7392001603688484e-05, + "loss": 1.9253, + "step": 1302 + }, + { + "epoch": 0.39, + "grad_norm": 15.879293441772461, + "learning_rate": 1.7389996993084094e-05, + "loss": 2.5909, + "step": 1303 + }, + { + "epoch": 0.39, + "grad_norm": 22.646526336669922, + "learning_rate": 1.7387992382479704e-05, + "loss": 2.1147, + "step": 1304 + }, + { + "epoch": 0.39, + "grad_norm": 19.122434616088867, + "learning_rate": 1.7385987771875314e-05, + "loss": 2.6576, + "step": 1305 + }, + { + "epoch": 0.39, + "grad_norm": 16.39354705810547, + "learning_rate": 1.7383983161270924e-05, + "loss": 2.3048, + "step": 1306 + }, + { + "epoch": 0.39, + "grad_norm": 21.781461715698242, + "learning_rate": 1.7381978550666535e-05, + "loss": 2.5542, + "step": 1307 + }, + { + "epoch": 0.39, + "grad_norm": 13.316234588623047, + "learning_rate": 1.7379973940062145e-05, + "loss": 2.669, + "step": 1308 + }, + { + "epoch": 0.39, + "grad_norm": 11.472354888916016, + "learning_rate": 1.7377969329457755e-05, + "loss": 2.7657, + "step": 1309 + }, + { + "epoch": 0.39, + "grad_norm": 25.965810775756836, + "learning_rate": 1.7375964718853365e-05, + "loss": 2.8167, + "step": 1310 + }, + { + "epoch": 0.39, + "grad_norm": 17.545780181884766, + "learning_rate": 1.7373960108248975e-05, + "loss": 3.5917, + "step": 1311 + }, + { + "epoch": 0.39, + "grad_norm": 22.960935592651367, + "learning_rate": 1.7371955497644585e-05, + "loss": 3.4562, + "step": 1312 + }, + { + "epoch": 0.39, + "grad_norm": 11.906625747680664, + "learning_rate": 1.7369950887040195e-05, + "loss": 3.293, + "step": 1313 + }, + { + "epoch": 0.4, + "grad_norm": 14.520302772521973, + "learning_rate": 1.7367946276435802e-05, + "loss": 2.368, + "step": 1314 + }, + { + "epoch": 0.4, + "grad_norm": 9.339329719543457, + "learning_rate": 1.7365941665831415e-05, + "loss": 2.1992, + "step": 1315 + }, + { + "epoch": 0.4, + "grad_norm": 19.66999626159668, + "learning_rate": 1.7363937055227022e-05, + "loss": 3.1317, + "step": 1316 + }, + { + "epoch": 0.4, + "grad_norm": 19.995849609375, + "learning_rate": 1.7361932444622632e-05, + "loss": 2.606, + "step": 1317 + }, + { + "epoch": 0.4, + "grad_norm": 16.427093505859375, + "learning_rate": 1.7359927834018245e-05, + "loss": 1.7637, + "step": 1318 + }, + { + "epoch": 0.4, + "grad_norm": 14.198533058166504, + "learning_rate": 1.7357923223413852e-05, + "loss": 2.904, + "step": 1319 + }, + { + "epoch": 0.4, + "grad_norm": 12.009779930114746, + "learning_rate": 1.7355918612809462e-05, + "loss": 2.2005, + "step": 1320 + }, + { + "epoch": 0.4, + "eval_loss": 0.39250195026397705, + "eval_runtime": 43.6691, + "eval_samples_per_second": 33.868, + "eval_steps_per_second": 33.868, + "step": 1320 + }, + { + "epoch": 0.4, + "grad_norm": 11.49736499786377, + "learning_rate": 1.7353914002205072e-05, + "loss": 2.9098, + "step": 1321 + }, + { + "epoch": 0.4, + "grad_norm": 12.25181770324707, + "learning_rate": 1.7351909391600682e-05, + "loss": 2.83, + "step": 1322 + }, + { + "epoch": 0.4, + "grad_norm": 17.131059646606445, + "learning_rate": 1.7349904780996293e-05, + "loss": 2.6729, + "step": 1323 + }, + { + "epoch": 0.4, + "grad_norm": 16.65376853942871, + "learning_rate": 1.7347900170391903e-05, + "loss": 2.9689, + "step": 1324 + }, + { + "epoch": 0.4, + "grad_norm": 13.16373062133789, + "learning_rate": 1.7345895559787513e-05, + "loss": 2.5196, + "step": 1325 + }, + { + "epoch": 0.4, + "grad_norm": 15.137256622314453, + "learning_rate": 1.7343890949183123e-05, + "loss": 3.2273, + "step": 1326 + }, + { + "epoch": 0.4, + "grad_norm": 12.256842613220215, + "learning_rate": 1.7341886338578733e-05, + "loss": 3.0272, + "step": 1327 + }, + { + "epoch": 0.4, + "grad_norm": 46.50150680541992, + "learning_rate": 1.7339881727974343e-05, + "loss": 3.4704, + "step": 1328 + }, + { + "epoch": 0.4, + "grad_norm": 23.8785400390625, + "learning_rate": 1.7337877117369953e-05, + "loss": 3.1262, + "step": 1329 + }, + { + "epoch": 0.4, + "grad_norm": 17.71267318725586, + "learning_rate": 1.7335872506765563e-05, + "loss": 2.1541, + "step": 1330 + }, + { + "epoch": 0.4, + "grad_norm": 22.225250244140625, + "learning_rate": 1.7333867896161173e-05, + "loss": 1.5919, + "step": 1331 + }, + { + "epoch": 0.4, + "grad_norm": 16.465585708618164, + "learning_rate": 1.7331863285556783e-05, + "loss": 2.3168, + "step": 1332 + }, + { + "epoch": 0.4, + "grad_norm": 24.607051849365234, + "learning_rate": 1.732985867495239e-05, + "loss": 3.1243, + "step": 1333 + }, + { + "epoch": 0.4, + "grad_norm": 19.95473861694336, + "learning_rate": 1.7327854064348003e-05, + "loss": 2.5079, + "step": 1334 + }, + { + "epoch": 0.4, + "grad_norm": 19.784517288208008, + "learning_rate": 1.7325849453743614e-05, + "loss": 3.3156, + "step": 1335 + }, + { + "epoch": 0.4, + "grad_norm": 13.695775032043457, + "learning_rate": 1.732384484313922e-05, + "loss": 3.1165, + "step": 1336 + }, + { + "epoch": 0.4, + "grad_norm": 11.951885223388672, + "learning_rate": 1.7321840232534834e-05, + "loss": 2.9321, + "step": 1337 + }, + { + "epoch": 0.4, + "grad_norm": 13.507436752319336, + "learning_rate": 1.731983562193044e-05, + "loss": 2.4538, + "step": 1338 + }, + { + "epoch": 0.4, + "grad_norm": 11.957839965820312, + "learning_rate": 1.731783101132605e-05, + "loss": 3.1245, + "step": 1339 + }, + { + "epoch": 0.4, + "grad_norm": 14.829548835754395, + "learning_rate": 1.731582640072166e-05, + "loss": 2.3632, + "step": 1340 + }, + { + "epoch": 0.4, + "grad_norm": 14.587291717529297, + "learning_rate": 1.731382179011727e-05, + "loss": 2.8577, + "step": 1341 + }, + { + "epoch": 0.4, + "grad_norm": 13.556631088256836, + "learning_rate": 1.731181717951288e-05, + "loss": 2.0475, + "step": 1342 + }, + { + "epoch": 0.4, + "grad_norm": 13.010260581970215, + "learning_rate": 1.730981256890849e-05, + "loss": 2.6603, + "step": 1343 + }, + { + "epoch": 0.4, + "grad_norm": 11.09408187866211, + "learning_rate": 1.73078079583041e-05, + "loss": 2.5755, + "step": 1344 + }, + { + "epoch": 0.4, + "grad_norm": 13.849974632263184, + "learning_rate": 1.730580334769971e-05, + "loss": 2.8472, + "step": 1345 + }, + { + "epoch": 0.4, + "grad_norm": 21.211544036865234, + "learning_rate": 1.730379873709532e-05, + "loss": 2.7465, + "step": 1346 + }, + { + "epoch": 0.4, + "grad_norm": 15.509568214416504, + "learning_rate": 1.730179412649093e-05, + "loss": 3.4552, + "step": 1347 + }, + { + "epoch": 0.41, + "grad_norm": 13.904120445251465, + "learning_rate": 1.729978951588654e-05, + "loss": 2.9963, + "step": 1348 + }, + { + "epoch": 0.41, + "grad_norm": 17.5559024810791, + "learning_rate": 1.729778490528215e-05, + "loss": 2.4842, + "step": 1349 + }, + { + "epoch": 0.41, + "grad_norm": 15.476587295532227, + "learning_rate": 1.729578029467776e-05, + "loss": 2.5928, + "step": 1350 + }, + { + "epoch": 0.41, + "grad_norm": 15.32288932800293, + "learning_rate": 1.729377568407337e-05, + "loss": 3.4517, + "step": 1351 + }, + { + "epoch": 0.41, + "grad_norm": 15.49992561340332, + "learning_rate": 1.7291771073468978e-05, + "loss": 3.042, + "step": 1352 + }, + { + "epoch": 0.41, + "grad_norm": 13.955615043640137, + "learning_rate": 1.728976646286459e-05, + "loss": 2.5861, + "step": 1353 + }, + { + "epoch": 0.41, + "grad_norm": 12.550683975219727, + "learning_rate": 1.7287761852260202e-05, + "loss": 3.6951, + "step": 1354 + }, + { + "epoch": 0.41, + "grad_norm": 22.899044036865234, + "learning_rate": 1.728575724165581e-05, + "loss": 2.9966, + "step": 1355 + }, + { + "epoch": 0.41, + "grad_norm": 12.468682289123535, + "learning_rate": 1.7283752631051422e-05, + "loss": 2.1763, + "step": 1356 + }, + { + "epoch": 0.41, + "grad_norm": 22.829370498657227, + "learning_rate": 1.728174802044703e-05, + "loss": 3.2563, + "step": 1357 + }, + { + "epoch": 0.41, + "grad_norm": 27.643447875976562, + "learning_rate": 1.727974340984264e-05, + "loss": 2.7474, + "step": 1358 + }, + { + "epoch": 0.41, + "grad_norm": 17.19843101501465, + "learning_rate": 1.727773879923825e-05, + "loss": 2.9442, + "step": 1359 + }, + { + "epoch": 0.41, + "grad_norm": 13.734451293945312, + "learning_rate": 1.727573418863386e-05, + "loss": 2.8518, + "step": 1360 + }, + { + "epoch": 0.41, + "grad_norm": 11.024998664855957, + "learning_rate": 1.727372957802947e-05, + "loss": 2.2833, + "step": 1361 + }, + { + "epoch": 0.41, + "grad_norm": 19.57155990600586, + "learning_rate": 1.727172496742508e-05, + "loss": 2.9668, + "step": 1362 + }, + { + "epoch": 0.41, + "grad_norm": 24.14830207824707, + "learning_rate": 1.726972035682069e-05, + "loss": 2.2259, + "step": 1363 + }, + { + "epoch": 0.41, + "grad_norm": 13.887985229492188, + "learning_rate": 1.72677157462163e-05, + "loss": 2.4764, + "step": 1364 + }, + { + "epoch": 0.41, + "grad_norm": 11.10314655303955, + "learning_rate": 1.726571113561191e-05, + "loss": 2.1367, + "step": 1365 + }, + { + "epoch": 0.41, + "grad_norm": 11.575247764587402, + "learning_rate": 1.726370652500752e-05, + "loss": 2.396, + "step": 1366 + }, + { + "epoch": 0.41, + "grad_norm": 17.140932083129883, + "learning_rate": 1.726170191440313e-05, + "loss": 3.0233, + "step": 1367 + }, + { + "epoch": 0.41, + "grad_norm": 18.503704071044922, + "learning_rate": 1.725969730379874e-05, + "loss": 3.2255, + "step": 1368 + }, + { + "epoch": 0.41, + "grad_norm": 14.695094108581543, + "learning_rate": 1.7257692693194346e-05, + "loss": 3.1332, + "step": 1369 + }, + { + "epoch": 0.41, + "grad_norm": 12.079935073852539, + "learning_rate": 1.725568808258996e-05, + "loss": 2.4825, + "step": 1370 + }, + { + "epoch": 0.41, + "grad_norm": 23.080753326416016, + "learning_rate": 1.7253683471985566e-05, + "loss": 2.7521, + "step": 1371 + }, + { + "epoch": 0.41, + "grad_norm": 13.837876319885254, + "learning_rate": 1.7251678861381176e-05, + "loss": 2.5238, + "step": 1372 + }, + { + "epoch": 0.41, + "grad_norm": 19.977251052856445, + "learning_rate": 1.724967425077679e-05, + "loss": 3.2955, + "step": 1373 + }, + { + "epoch": 0.41, + "grad_norm": 13.569607734680176, + "learning_rate": 1.7247669640172397e-05, + "loss": 2.7767, + "step": 1374 + }, + { + "epoch": 0.41, + "grad_norm": 14.404850959777832, + "learning_rate": 1.7245665029568007e-05, + "loss": 2.6134, + "step": 1375 + }, + { + "epoch": 0.41, + "grad_norm": 16.575212478637695, + "learning_rate": 1.7243660418963617e-05, + "loss": 1.7531, + "step": 1376 + }, + { + "epoch": 0.41, + "grad_norm": 21.60881233215332, + "learning_rate": 1.7241655808359227e-05, + "loss": 2.9619, + "step": 1377 + }, + { + "epoch": 0.41, + "grad_norm": 17.051956176757812, + "learning_rate": 1.7239651197754837e-05, + "loss": 3.0801, + "step": 1378 + }, + { + "epoch": 0.41, + "grad_norm": 37.58320999145508, + "learning_rate": 1.7237646587150447e-05, + "loss": 2.3241, + "step": 1379 + }, + { + "epoch": 0.41, + "grad_norm": 12.96102237701416, + "learning_rate": 1.7235641976546057e-05, + "loss": 2.2837, + "step": 1380 + }, + { + "epoch": 0.42, + "grad_norm": 34.093780517578125, + "learning_rate": 1.7233637365941667e-05, + "loss": 2.8085, + "step": 1381 + }, + { + "epoch": 0.42, + "grad_norm": 17.705703735351562, + "learning_rate": 1.7231632755337277e-05, + "loss": 3.4047, + "step": 1382 + }, + { + "epoch": 0.42, + "grad_norm": 16.184062957763672, + "learning_rate": 1.7229628144732887e-05, + "loss": 3.4847, + "step": 1383 + }, + { + "epoch": 0.42, + "grad_norm": 12.74220085144043, + "learning_rate": 1.7227623534128497e-05, + "loss": 1.8471, + "step": 1384 + }, + { + "epoch": 0.42, + "grad_norm": 26.621505737304688, + "learning_rate": 1.7225618923524108e-05, + "loss": 3.4508, + "step": 1385 + }, + { + "epoch": 0.42, + "grad_norm": 23.22762107849121, + "learning_rate": 1.7223614312919718e-05, + "loss": 2.8567, + "step": 1386 + }, + { + "epoch": 0.42, + "grad_norm": 13.235843658447266, + "learning_rate": 1.7221609702315328e-05, + "loss": 2.368, + "step": 1387 + }, + { + "epoch": 0.42, + "grad_norm": 10.793066024780273, + "learning_rate": 1.7219605091710934e-05, + "loss": 1.9442, + "step": 1388 + }, + { + "epoch": 0.42, + "grad_norm": 18.45147705078125, + "learning_rate": 1.7217600481106548e-05, + "loss": 3.3398, + "step": 1389 + }, + { + "epoch": 0.42, + "grad_norm": 14.921577453613281, + "learning_rate": 1.7215595870502155e-05, + "loss": 2.1594, + "step": 1390 + }, + { + "epoch": 0.42, + "grad_norm": 14.961502075195312, + "learning_rate": 1.7213591259897765e-05, + "loss": 3.3974, + "step": 1391 + }, + { + "epoch": 0.42, + "grad_norm": 19.07111167907715, + "learning_rate": 1.7211586649293378e-05, + "loss": 3.1362, + "step": 1392 + }, + { + "epoch": 0.42, + "grad_norm": 10.949179649353027, + "learning_rate": 1.7209582038688985e-05, + "loss": 2.0993, + "step": 1393 + }, + { + "epoch": 0.42, + "grad_norm": 12.562166213989258, + "learning_rate": 1.7207577428084595e-05, + "loss": 2.0554, + "step": 1394 + }, + { + "epoch": 0.42, + "grad_norm": 11.35824966430664, + "learning_rate": 1.7205572817480205e-05, + "loss": 2.1912, + "step": 1395 + }, + { + "epoch": 0.42, + "grad_norm": 16.764873504638672, + "learning_rate": 1.7203568206875815e-05, + "loss": 2.5602, + "step": 1396 + }, + { + "epoch": 0.42, + "grad_norm": 14.467211723327637, + "learning_rate": 1.7201563596271425e-05, + "loss": 2.6329, + "step": 1397 + }, + { + "epoch": 0.42, + "grad_norm": 11.931891441345215, + "learning_rate": 1.7199558985667035e-05, + "loss": 2.2821, + "step": 1398 + }, + { + "epoch": 0.42, + "grad_norm": 22.51726722717285, + "learning_rate": 1.7197554375062645e-05, + "loss": 2.7927, + "step": 1399 + }, + { + "epoch": 0.42, + "grad_norm": 10.18954086303711, + "learning_rate": 1.7195549764458255e-05, + "loss": 1.9007, + "step": 1400 + }, + { + "epoch": 0.42, + "grad_norm": 15.76696491241455, + "learning_rate": 1.7193545153853866e-05, + "loss": 2.0431, + "step": 1401 + }, + { + "epoch": 0.42, + "grad_norm": 11.40339183807373, + "learning_rate": 1.7191540543249476e-05, + "loss": 2.5742, + "step": 1402 + }, + { + "epoch": 0.42, + "grad_norm": 12.021610260009766, + "learning_rate": 1.7189535932645086e-05, + "loss": 3.6196, + "step": 1403 + }, + { + "epoch": 0.42, + "grad_norm": 19.796781539916992, + "learning_rate": 1.7187531322040696e-05, + "loss": 2.8469, + "step": 1404 + }, + { + "epoch": 0.42, + "grad_norm": 14.170371055603027, + "learning_rate": 1.7185526711436306e-05, + "loss": 2.3962, + "step": 1405 + }, + { + "epoch": 0.42, + "grad_norm": 11.088910102844238, + "learning_rate": 1.7183522100831916e-05, + "loss": 2.6827, + "step": 1406 + }, + { + "epoch": 0.42, + "grad_norm": 136.58908081054688, + "learning_rate": 1.7181517490227523e-05, + "loss": 2.3091, + "step": 1407 + }, + { + "epoch": 0.42, + "grad_norm": 12.442429542541504, + "learning_rate": 1.7179512879623136e-05, + "loss": 2.4494, + "step": 1408 + }, + { + "epoch": 0.42, + "grad_norm": 14.868009567260742, + "learning_rate": 1.7177508269018746e-05, + "loss": 2.5188, + "step": 1409 + }, + { + "epoch": 0.42, + "grad_norm": 12.164691925048828, + "learning_rate": 1.7175503658414353e-05, + "loss": 2.378, + "step": 1410 + }, + { + "epoch": 0.42, + "grad_norm": 15.50973129272461, + "learning_rate": 1.7173499047809966e-05, + "loss": 4.2229, + "step": 1411 + }, + { + "epoch": 0.42, + "grad_norm": 16.49388885498047, + "learning_rate": 1.7171494437205573e-05, + "loss": 2.5579, + "step": 1412 + }, + { + "epoch": 0.42, + "grad_norm": 14.063419342041016, + "learning_rate": 1.7169489826601183e-05, + "loss": 3.0148, + "step": 1413 + }, + { + "epoch": 0.43, + "grad_norm": 37.99775314331055, + "learning_rate": 1.7167485215996793e-05, + "loss": 2.9612, + "step": 1414 + }, + { + "epoch": 0.43, + "grad_norm": 18.0521183013916, + "learning_rate": 1.7165480605392403e-05, + "loss": 2.871, + "step": 1415 + }, + { + "epoch": 0.43, + "grad_norm": 17.04564094543457, + "learning_rate": 1.7163475994788013e-05, + "loss": 2.6688, + "step": 1416 + }, + { + "epoch": 0.43, + "grad_norm": 13.493517875671387, + "learning_rate": 1.7161471384183624e-05, + "loss": 2.1012, + "step": 1417 + }, + { + "epoch": 0.43, + "grad_norm": 13.597625732421875, + "learning_rate": 1.7159466773579234e-05, + "loss": 3.3529, + "step": 1418 + }, + { + "epoch": 0.43, + "grad_norm": 17.114131927490234, + "learning_rate": 1.7157462162974844e-05, + "loss": 3.2373, + "step": 1419 + }, + { + "epoch": 0.43, + "grad_norm": 18.538379669189453, + "learning_rate": 1.7155457552370454e-05, + "loss": 2.4973, + "step": 1420 + }, + { + "epoch": 0.43, + "grad_norm": 23.284835815429688, + "learning_rate": 1.7153452941766064e-05, + "loss": 3.6339, + "step": 1421 + }, + { + "epoch": 0.43, + "grad_norm": 19.382251739501953, + "learning_rate": 1.7151448331161674e-05, + "loss": 3.1989, + "step": 1422 + }, + { + "epoch": 0.43, + "grad_norm": 17.688274383544922, + "learning_rate": 1.7149443720557284e-05, + "loss": 2.1218, + "step": 1423 + }, + { + "epoch": 0.43, + "grad_norm": 24.966703414916992, + "learning_rate": 1.7147439109952894e-05, + "loss": 2.9954, + "step": 1424 + }, + { + "epoch": 0.43, + "grad_norm": 31.188905715942383, + "learning_rate": 1.7145434499348504e-05, + "loss": 2.9035, + "step": 1425 + }, + { + "epoch": 0.43, + "grad_norm": 13.262977600097656, + "learning_rate": 1.714342988874411e-05, + "loss": 1.3303, + "step": 1426 + }, + { + "epoch": 0.43, + "grad_norm": 13.84818172454834, + "learning_rate": 1.7141425278139724e-05, + "loss": 2.3282, + "step": 1427 + }, + { + "epoch": 0.43, + "grad_norm": 55.73940658569336, + "learning_rate": 1.7139420667535334e-05, + "loss": 3.5851, + "step": 1428 + }, + { + "epoch": 0.43, + "grad_norm": 11.50109577178955, + "learning_rate": 1.713741605693094e-05, + "loss": 2.6434, + "step": 1429 + }, + { + "epoch": 0.43, + "grad_norm": 17.894203186035156, + "learning_rate": 1.7135411446326555e-05, + "loss": 2.5986, + "step": 1430 + }, + { + "epoch": 0.43, + "grad_norm": 33.426055908203125, + "learning_rate": 1.713340683572216e-05, + "loss": 2.7803, + "step": 1431 + }, + { + "epoch": 0.43, + "grad_norm": 10.518821716308594, + "learning_rate": 1.713140222511777e-05, + "loss": 2.1774, + "step": 1432 + }, + { + "epoch": 0.43, + "grad_norm": 13.691694259643555, + "learning_rate": 1.712939761451338e-05, + "loss": 2.4883, + "step": 1433 + }, + { + "epoch": 0.43, + "grad_norm": 19.365102767944336, + "learning_rate": 1.712739300390899e-05, + "loss": 2.6138, + "step": 1434 + }, + { + "epoch": 0.43, + "grad_norm": 31.23158836364746, + "learning_rate": 1.71253883933046e-05, + "loss": 2.905, + "step": 1435 + }, + { + "epoch": 0.43, + "grad_norm": 23.67169761657715, + "learning_rate": 1.7123383782700212e-05, + "loss": 3.4515, + "step": 1436 + }, + { + "epoch": 0.43, + "grad_norm": 11.991448402404785, + "learning_rate": 1.7121379172095822e-05, + "loss": 2.562, + "step": 1437 + }, + { + "epoch": 0.43, + "grad_norm": 34.79066467285156, + "learning_rate": 1.7119374561491432e-05, + "loss": 3.1665, + "step": 1438 + }, + { + "epoch": 0.43, + "grad_norm": 24.111236572265625, + "learning_rate": 1.7117369950887042e-05, + "loss": 3.2707, + "step": 1439 + }, + { + "epoch": 0.43, + "grad_norm": 36.61920928955078, + "learning_rate": 1.7115365340282652e-05, + "loss": 2.3113, + "step": 1440 + }, + { + "epoch": 0.43, + "eval_loss": 0.4355357885360718, + "eval_runtime": 43.4342, + "eval_samples_per_second": 34.051, + "eval_steps_per_second": 34.051, + "step": 1440 + }, + { + "epoch": 0.43, + "grad_norm": 25.45790672302246, + "learning_rate": 1.7113360729678262e-05, + "loss": 2.8492, + "step": 1441 + }, + { + "epoch": 0.43, + "grad_norm": 8.575899124145508, + "learning_rate": 1.7111356119073872e-05, + "loss": 2.934, + "step": 1442 + }, + { + "epoch": 0.43, + "grad_norm": 14.703217506408691, + "learning_rate": 1.710935150846948e-05, + "loss": 2.8837, + "step": 1443 + }, + { + "epoch": 0.43, + "grad_norm": 45.12884521484375, + "learning_rate": 1.7107346897865092e-05, + "loss": 4.0134, + "step": 1444 + }, + { + "epoch": 0.43, + "grad_norm": 17.04209327697754, + "learning_rate": 1.71053422872607e-05, + "loss": 2.9208, + "step": 1445 + }, + { + "epoch": 0.43, + "grad_norm": 11.436484336853027, + "learning_rate": 1.710333767665631e-05, + "loss": 2.5288, + "step": 1446 + }, + { + "epoch": 0.44, + "grad_norm": 11.852707862854004, + "learning_rate": 1.7101333066051923e-05, + "loss": 2.4064, + "step": 1447 + }, + { + "epoch": 0.44, + "grad_norm": 18.86704444885254, + "learning_rate": 1.709932845544753e-05, + "loss": 2.924, + "step": 1448 + }, + { + "epoch": 0.44, + "grad_norm": 26.129491806030273, + "learning_rate": 1.7097323844843143e-05, + "loss": 2.4179, + "step": 1449 + }, + { + "epoch": 0.44, + "grad_norm": 13.224143028259277, + "learning_rate": 1.709531923423875e-05, + "loss": 3.0095, + "step": 1450 + }, + { + "epoch": 0.44, + "grad_norm": 12.908326148986816, + "learning_rate": 1.709331462363436e-05, + "loss": 2.6045, + "step": 1451 + }, + { + "epoch": 0.44, + "grad_norm": 17.967126846313477, + "learning_rate": 1.7091310013029973e-05, + "loss": 1.9252, + "step": 1452 + }, + { + "epoch": 0.44, + "grad_norm": 15.611589431762695, + "learning_rate": 1.708930540242558e-05, + "loss": 3.118, + "step": 1453 + }, + { + "epoch": 0.44, + "grad_norm": 19.359302520751953, + "learning_rate": 1.708730079182119e-05, + "loss": 2.8996, + "step": 1454 + }, + { + "epoch": 0.44, + "grad_norm": 15.188103675842285, + "learning_rate": 1.70852961812168e-05, + "loss": 3.3204, + "step": 1455 + }, + { + "epoch": 0.44, + "grad_norm": 24.560422897338867, + "learning_rate": 1.708329157061241e-05, + "loss": 2.7835, + "step": 1456 + }, + { + "epoch": 0.44, + "grad_norm": 13.218994140625, + "learning_rate": 1.708128696000802e-05, + "loss": 2.3175, + "step": 1457 + }, + { + "epoch": 0.44, + "grad_norm": 13.680419921875, + "learning_rate": 1.707928234940363e-05, + "loss": 2.2045, + "step": 1458 + }, + { + "epoch": 0.44, + "grad_norm": 15.42464828491211, + "learning_rate": 1.707727773879924e-05, + "loss": 2.7285, + "step": 1459 + }, + { + "epoch": 0.44, + "grad_norm": 25.709156036376953, + "learning_rate": 1.707527312819485e-05, + "loss": 3.6841, + "step": 1460 + }, + { + "epoch": 0.44, + "grad_norm": 15.961677551269531, + "learning_rate": 1.707326851759046e-05, + "loss": 3.4276, + "step": 1461 + }, + { + "epoch": 0.44, + "grad_norm": 15.101017951965332, + "learning_rate": 1.7071263906986067e-05, + "loss": 2.935, + "step": 1462 + }, + { + "epoch": 0.44, + "grad_norm": 15.92037582397461, + "learning_rate": 1.706925929638168e-05, + "loss": 2.3067, + "step": 1463 + }, + { + "epoch": 0.44, + "grad_norm": 17.06378173828125, + "learning_rate": 1.7067254685777287e-05, + "loss": 2.2178, + "step": 1464 + }, + { + "epoch": 0.44, + "grad_norm": 15.061570167541504, + "learning_rate": 1.7065250075172897e-05, + "loss": 2.2647, + "step": 1465 + }, + { + "epoch": 0.44, + "grad_norm": 11.610605239868164, + "learning_rate": 1.706324546456851e-05, + "loss": 2.2437, + "step": 1466 + }, + { + "epoch": 0.44, + "grad_norm": 9.191241264343262, + "learning_rate": 1.7061240853964118e-05, + "loss": 2.011, + "step": 1467 + }, + { + "epoch": 0.44, + "grad_norm": 29.908700942993164, + "learning_rate": 1.7059236243359728e-05, + "loss": 3.4803, + "step": 1468 + }, + { + "epoch": 0.44, + "grad_norm": 31.88568115234375, + "learning_rate": 1.7057231632755338e-05, + "loss": 1.611, + "step": 1469 + }, + { + "epoch": 0.44, + "grad_norm": 18.888980865478516, + "learning_rate": 1.7055227022150948e-05, + "loss": 3.0609, + "step": 1470 + }, + { + "epoch": 0.44, + "grad_norm": 15.416746139526367, + "learning_rate": 1.7053222411546558e-05, + "loss": 2.4115, + "step": 1471 + }, + { + "epoch": 0.44, + "grad_norm": 16.257118225097656, + "learning_rate": 1.7051217800942168e-05, + "loss": 3.6617, + "step": 1472 + }, + { + "epoch": 0.44, + "grad_norm": 14.70289134979248, + "learning_rate": 1.7049213190337778e-05, + "loss": 2.0395, + "step": 1473 + }, + { + "epoch": 0.44, + "grad_norm": 14.966999053955078, + "learning_rate": 1.7047208579733388e-05, + "loss": 2.7337, + "step": 1474 + }, + { + "epoch": 0.44, + "grad_norm": 19.118864059448242, + "learning_rate": 1.7045203969128998e-05, + "loss": 2.5427, + "step": 1475 + }, + { + "epoch": 0.44, + "grad_norm": 21.44363021850586, + "learning_rate": 1.704319935852461e-05, + "loss": 3.0899, + "step": 1476 + }, + { + "epoch": 0.44, + "grad_norm": 16.222488403320312, + "learning_rate": 1.704119474792022e-05, + "loss": 2.3837, + "step": 1477 + }, + { + "epoch": 0.44, + "grad_norm": 13.13603401184082, + "learning_rate": 1.703919013731583e-05, + "loss": 2.4675, + "step": 1478 + }, + { + "epoch": 0.44, + "grad_norm": 20.3343563079834, + "learning_rate": 1.703718552671144e-05, + "loss": 2.9424, + "step": 1479 + }, + { + "epoch": 0.44, + "grad_norm": 19.545503616333008, + "learning_rate": 1.703518091610705e-05, + "loss": 2.9953, + "step": 1480 + }, + { + "epoch": 0.45, + "grad_norm": 18.053640365600586, + "learning_rate": 1.7033176305502655e-05, + "loss": 2.11, + "step": 1481 + }, + { + "epoch": 0.45, + "grad_norm": 18.4859561920166, + "learning_rate": 1.703117169489827e-05, + "loss": 3.0607, + "step": 1482 + }, + { + "epoch": 0.45, + "grad_norm": 13.820666313171387, + "learning_rate": 1.702916708429388e-05, + "loss": 2.8445, + "step": 1483 + }, + { + "epoch": 0.45, + "grad_norm": 15.42257022857666, + "learning_rate": 1.7027162473689486e-05, + "loss": 2.8965, + "step": 1484 + }, + { + "epoch": 0.45, + "grad_norm": 27.479591369628906, + "learning_rate": 1.70251578630851e-05, + "loss": 2.9632, + "step": 1485 + }, + { + "epoch": 0.45, + "grad_norm": 21.065073013305664, + "learning_rate": 1.7023153252480706e-05, + "loss": 2.7442, + "step": 1486 + }, + { + "epoch": 0.45, + "grad_norm": 7.838624477386475, + "learning_rate": 1.7021148641876316e-05, + "loss": 2.0983, + "step": 1487 + }, + { + "epoch": 0.45, + "grad_norm": 17.929758071899414, + "learning_rate": 1.7019144031271926e-05, + "loss": 2.3848, + "step": 1488 + }, + { + "epoch": 0.45, + "grad_norm": 10.071520805358887, + "learning_rate": 1.7017139420667536e-05, + "loss": 1.5338, + "step": 1489 + }, + { + "epoch": 0.45, + "grad_norm": 15.892718315124512, + "learning_rate": 1.7015134810063146e-05, + "loss": 2.5901, + "step": 1490 + }, + { + "epoch": 0.45, + "grad_norm": 13.212607383728027, + "learning_rate": 1.7013130199458756e-05, + "loss": 3.0064, + "step": 1491 + }, + { + "epoch": 0.45, + "grad_norm": 21.533849716186523, + "learning_rate": 1.7011125588854366e-05, + "loss": 2.9632, + "step": 1492 + }, + { + "epoch": 0.45, + "grad_norm": 18.14595603942871, + "learning_rate": 1.7009120978249976e-05, + "loss": 1.7862, + "step": 1493 + }, + { + "epoch": 0.45, + "grad_norm": 12.642379760742188, + "learning_rate": 1.7007116367645586e-05, + "loss": 2.7063, + "step": 1494 + }, + { + "epoch": 0.45, + "grad_norm": 18.708799362182617, + "learning_rate": 1.7005111757041197e-05, + "loss": 2.7772, + "step": 1495 + }, + { + "epoch": 0.45, + "grad_norm": 29.240921020507812, + "learning_rate": 1.7003107146436807e-05, + "loss": 1.7695, + "step": 1496 + }, + { + "epoch": 0.45, + "grad_norm": 18.173198699951172, + "learning_rate": 1.7001102535832417e-05, + "loss": 2.528, + "step": 1497 + }, + { + "epoch": 0.45, + "grad_norm": 10.611312866210938, + "learning_rate": 1.6999097925228027e-05, + "loss": 2.3301, + "step": 1498 + }, + { + "epoch": 0.45, + "grad_norm": 17.193771362304688, + "learning_rate": 1.6997093314623637e-05, + "loss": 1.8752, + "step": 1499 + }, + { + "epoch": 0.45, + "grad_norm": 17.25593376159668, + "learning_rate": 1.6995088704019244e-05, + "loss": 3.6294, + "step": 1500 + }, + { + "epoch": 0.45, + "grad_norm": 18.063823699951172, + "learning_rate": 1.6993084093414857e-05, + "loss": 3.0548, + "step": 1501 + }, + { + "epoch": 0.45, + "grad_norm": 22.38393783569336, + "learning_rate": 1.6991079482810467e-05, + "loss": 2.6282, + "step": 1502 + }, + { + "epoch": 0.45, + "grad_norm": 11.480286598205566, + "learning_rate": 1.6989074872206074e-05, + "loss": 1.8335, + "step": 1503 + }, + { + "epoch": 0.45, + "grad_norm": 26.893537521362305, + "learning_rate": 1.6987070261601687e-05, + "loss": 2.7152, + "step": 1504 + }, + { + "epoch": 0.45, + "grad_norm": 11.621118545532227, + "learning_rate": 1.6985065650997294e-05, + "loss": 2.3662, + "step": 1505 + }, + { + "epoch": 0.45, + "grad_norm": 15.238231658935547, + "learning_rate": 1.6983061040392904e-05, + "loss": 2.4623, + "step": 1506 + }, + { + "epoch": 0.45, + "grad_norm": 20.844465255737305, + "learning_rate": 1.6981056429788514e-05, + "loss": 2.5909, + "step": 1507 + }, + { + "epoch": 0.45, + "grad_norm": 11.749519348144531, + "learning_rate": 1.6979051819184124e-05, + "loss": 1.8395, + "step": 1508 + }, + { + "epoch": 0.45, + "grad_norm": 15.4244384765625, + "learning_rate": 1.6977047208579734e-05, + "loss": 3.7336, + "step": 1509 + }, + { + "epoch": 0.45, + "grad_norm": 11.57870864868164, + "learning_rate": 1.6975042597975344e-05, + "loss": 2.3881, + "step": 1510 + }, + { + "epoch": 0.45, + "grad_norm": 21.28474235534668, + "learning_rate": 1.6973037987370954e-05, + "loss": 2.8733, + "step": 1511 + }, + { + "epoch": 0.45, + "grad_norm": 13.479619026184082, + "learning_rate": 1.6971033376766565e-05, + "loss": 1.693, + "step": 1512 + }, + { + "epoch": 0.45, + "grad_norm": 17.223281860351562, + "learning_rate": 1.6969028766162175e-05, + "loss": 2.2971, + "step": 1513 + }, + { + "epoch": 0.46, + "grad_norm": 31.056154251098633, + "learning_rate": 1.6967024155557785e-05, + "loss": 2.5655, + "step": 1514 + }, + { + "epoch": 0.46, + "grad_norm": 60.43313980102539, + "learning_rate": 1.6965019544953395e-05, + "loss": 3.5182, + "step": 1515 + }, + { + "epoch": 0.46, + "grad_norm": 15.120966911315918, + "learning_rate": 1.6963014934349005e-05, + "loss": 2.7201, + "step": 1516 + }, + { + "epoch": 0.46, + "grad_norm": 19.221208572387695, + "learning_rate": 1.6961010323744615e-05, + "loss": 3.3837, + "step": 1517 + }, + { + "epoch": 0.46, + "grad_norm": 13.70907974243164, + "learning_rate": 1.6959005713140225e-05, + "loss": 1.8185, + "step": 1518 + }, + { + "epoch": 0.46, + "grad_norm": 25.266483306884766, + "learning_rate": 1.6957001102535832e-05, + "loss": 3.0797, + "step": 1519 + }, + { + "epoch": 0.46, + "grad_norm": 16.34473419189453, + "learning_rate": 1.6954996491931445e-05, + "loss": 2.689, + "step": 1520 + }, + { + "epoch": 0.46, + "grad_norm": 21.672119140625, + "learning_rate": 1.6952991881327055e-05, + "loss": 2.4117, + "step": 1521 + }, + { + "epoch": 0.46, + "grad_norm": 12.567007064819336, + "learning_rate": 1.6950987270722662e-05, + "loss": 2.5169, + "step": 1522 + }, + { + "epoch": 0.46, + "grad_norm": 54.59273147583008, + "learning_rate": 1.6948982660118275e-05, + "loss": 3.4834, + "step": 1523 + }, + { + "epoch": 0.46, + "grad_norm": 18.03861427307129, + "learning_rate": 1.6946978049513882e-05, + "loss": 2.824, + "step": 1524 + }, + { + "epoch": 0.46, + "grad_norm": 40.20027542114258, + "learning_rate": 1.6944973438909492e-05, + "loss": 2.5567, + "step": 1525 + }, + { + "epoch": 0.46, + "grad_norm": 12.565523147583008, + "learning_rate": 1.6942968828305106e-05, + "loss": 3.1967, + "step": 1526 + }, + { + "epoch": 0.46, + "grad_norm": 12.132319450378418, + "learning_rate": 1.6940964217700712e-05, + "loss": 2.5713, + "step": 1527 + }, + { + "epoch": 0.46, + "grad_norm": 28.399211883544922, + "learning_rate": 1.6938959607096323e-05, + "loss": 2.7361, + "step": 1528 + }, + { + "epoch": 0.46, + "grad_norm": 20.132675170898438, + "learning_rate": 1.6936954996491933e-05, + "loss": 2.6361, + "step": 1529 + }, + { + "epoch": 0.46, + "grad_norm": 19.281675338745117, + "learning_rate": 1.6934950385887543e-05, + "loss": 2.8946, + "step": 1530 + }, + { + "epoch": 0.46, + "grad_norm": 11.40804672241211, + "learning_rate": 1.6932945775283153e-05, + "loss": 2.4466, + "step": 1531 + }, + { + "epoch": 0.46, + "grad_norm": 19.21536636352539, + "learning_rate": 1.6930941164678763e-05, + "loss": 2.9093, + "step": 1532 + }, + { + "epoch": 0.46, + "grad_norm": 13.203646659851074, + "learning_rate": 1.6928936554074373e-05, + "loss": 2.3911, + "step": 1533 + }, + { + "epoch": 0.46, + "grad_norm": 11.949692726135254, + "learning_rate": 1.6926931943469983e-05, + "loss": 2.8228, + "step": 1534 + }, + { + "epoch": 0.46, + "grad_norm": 14.0568208694458, + "learning_rate": 1.6924927332865593e-05, + "loss": 2.7099, + "step": 1535 + }, + { + "epoch": 0.46, + "grad_norm": 55.63922882080078, + "learning_rate": 1.69229227222612e-05, + "loss": 2.8945, + "step": 1536 + }, + { + "epoch": 0.46, + "grad_norm": 9.444130897521973, + "learning_rate": 1.6920918111656813e-05, + "loss": 1.714, + "step": 1537 + }, + { + "epoch": 0.46, + "grad_norm": 12.79077434539795, + "learning_rate": 1.6918913501052423e-05, + "loss": 3.4578, + "step": 1538 + }, + { + "epoch": 0.46, + "grad_norm": 12.505064010620117, + "learning_rate": 1.691690889044803e-05, + "loss": 1.7876, + "step": 1539 + }, + { + "epoch": 0.46, + "grad_norm": 19.741012573242188, + "learning_rate": 1.6914904279843644e-05, + "loss": 2.2345, + "step": 1540 + }, + { + "epoch": 0.46, + "grad_norm": 26.9569149017334, + "learning_rate": 1.691289966923925e-05, + "loss": 2.3699, + "step": 1541 + }, + { + "epoch": 0.46, + "grad_norm": 21.60605812072754, + "learning_rate": 1.691089505863486e-05, + "loss": 3.1069, + "step": 1542 + }, + { + "epoch": 0.46, + "grad_norm": 14.085561752319336, + "learning_rate": 1.690889044803047e-05, + "loss": 2.5667, + "step": 1543 + }, + { + "epoch": 0.46, + "grad_norm": 19.00534439086914, + "learning_rate": 1.690688583742608e-05, + "loss": 2.816, + "step": 1544 + }, + { + "epoch": 0.46, + "grad_norm": 14.58003044128418, + "learning_rate": 1.690488122682169e-05, + "loss": 3.1388, + "step": 1545 + }, + { + "epoch": 0.46, + "grad_norm": 23.24366569519043, + "learning_rate": 1.69028766162173e-05, + "loss": 1.9925, + "step": 1546 + }, + { + "epoch": 0.47, + "grad_norm": 18.700740814208984, + "learning_rate": 1.690087200561291e-05, + "loss": 1.9935, + "step": 1547 + }, + { + "epoch": 0.47, + "grad_norm": 20.79172134399414, + "learning_rate": 1.689886739500852e-05, + "loss": 3.1285, + "step": 1548 + }, + { + "epoch": 0.47, + "grad_norm": 22.56122589111328, + "learning_rate": 1.689686278440413e-05, + "loss": 3.1273, + "step": 1549 + }, + { + "epoch": 0.47, + "grad_norm": 12.774810791015625, + "learning_rate": 1.689485817379974e-05, + "loss": 2.8262, + "step": 1550 + }, + { + "epoch": 0.47, + "grad_norm": 16.529850006103516, + "learning_rate": 1.689285356319535e-05, + "loss": 2.3473, + "step": 1551 + }, + { + "epoch": 0.47, + "grad_norm": 20.23556137084961, + "learning_rate": 1.689084895259096e-05, + "loss": 1.8937, + "step": 1552 + }, + { + "epoch": 0.47, + "grad_norm": 23.965484619140625, + "learning_rate": 1.688884434198657e-05, + "loss": 2.5148, + "step": 1553 + }, + { + "epoch": 0.47, + "grad_norm": 11.394997596740723, + "learning_rate": 1.688683973138218e-05, + "loss": 2.7078, + "step": 1554 + }, + { + "epoch": 0.47, + "grad_norm": 14.20688247680664, + "learning_rate": 1.6884835120777788e-05, + "loss": 1.9273, + "step": 1555 + }, + { + "epoch": 0.47, + "grad_norm": 8.693448066711426, + "learning_rate": 1.68828305101734e-05, + "loss": 1.7773, + "step": 1556 + }, + { + "epoch": 0.47, + "grad_norm": 14.1410493850708, + "learning_rate": 1.688082589956901e-05, + "loss": 2.5604, + "step": 1557 + }, + { + "epoch": 0.47, + "grad_norm": 29.748794555664062, + "learning_rate": 1.6878821288964618e-05, + "loss": 2.5162, + "step": 1558 + }, + { + "epoch": 0.47, + "grad_norm": 10.367297172546387, + "learning_rate": 1.6876816678360232e-05, + "loss": 3.0466, + "step": 1559 + }, + { + "epoch": 0.47, + "grad_norm": 16.686874389648438, + "learning_rate": 1.687481206775584e-05, + "loss": 3.304, + "step": 1560 + }, + { + "epoch": 0.47, + "eval_loss": 0.3973664343357086, + "eval_runtime": 43.4098, + "eval_samples_per_second": 34.071, + "eval_steps_per_second": 34.071, + "step": 1560 + }, + { + "epoch": 0.47, + "grad_norm": 14.685885429382324, + "learning_rate": 1.687280745715145e-05, + "loss": 2.3338, + "step": 1561 + }, + { + "epoch": 0.47, + "grad_norm": 20.013853073120117, + "learning_rate": 1.687080284654706e-05, + "loss": 2.3406, + "step": 1562 + }, + { + "epoch": 0.47, + "grad_norm": 14.868475914001465, + "learning_rate": 1.686879823594267e-05, + "loss": 1.8712, + "step": 1563 + }, + { + "epoch": 0.47, + "grad_norm": 20.89446449279785, + "learning_rate": 1.686679362533828e-05, + "loss": 2.7493, + "step": 1564 + }, + { + "epoch": 0.47, + "grad_norm": 34.56494903564453, + "learning_rate": 1.686478901473389e-05, + "loss": 2.8977, + "step": 1565 + }, + { + "epoch": 0.47, + "grad_norm": 14.068147659301758, + "learning_rate": 1.68627844041295e-05, + "loss": 2.5127, + "step": 1566 + }, + { + "epoch": 0.47, + "grad_norm": 18.625877380371094, + "learning_rate": 1.686077979352511e-05, + "loss": 2.8985, + "step": 1567 + }, + { + "epoch": 0.47, + "grad_norm": 11.287114143371582, + "learning_rate": 1.685877518292072e-05, + "loss": 1.6863, + "step": 1568 + }, + { + "epoch": 0.47, + "grad_norm": 19.42438316345215, + "learning_rate": 1.685677057231633e-05, + "loss": 2.5056, + "step": 1569 + }, + { + "epoch": 0.47, + "grad_norm": 13.997237205505371, + "learning_rate": 1.685476596171194e-05, + "loss": 2.6668, + "step": 1570 + }, + { + "epoch": 0.47, + "grad_norm": 10.12904167175293, + "learning_rate": 1.685276135110755e-05, + "loss": 2.0919, + "step": 1571 + }, + { + "epoch": 0.47, + "grad_norm": 23.484573364257812, + "learning_rate": 1.685075674050316e-05, + "loss": 2.9543, + "step": 1572 + }, + { + "epoch": 0.47, + "grad_norm": 21.227386474609375, + "learning_rate": 1.684875212989877e-05, + "loss": 2.0336, + "step": 1573 + }, + { + "epoch": 0.47, + "grad_norm": 30.29522132873535, + "learning_rate": 1.6846747519294376e-05, + "loss": 2.6453, + "step": 1574 + }, + { + "epoch": 0.47, + "grad_norm": 25.412912368774414, + "learning_rate": 1.684474290868999e-05, + "loss": 3.1794, + "step": 1575 + }, + { + "epoch": 0.47, + "grad_norm": 26.188493728637695, + "learning_rate": 1.68427382980856e-05, + "loss": 2.9667, + "step": 1576 + }, + { + "epoch": 0.47, + "grad_norm": 15.816750526428223, + "learning_rate": 1.6840733687481206e-05, + "loss": 2.0286, + "step": 1577 + }, + { + "epoch": 0.47, + "grad_norm": 17.47620964050293, + "learning_rate": 1.683872907687682e-05, + "loss": 3.0095, + "step": 1578 + }, + { + "epoch": 0.47, + "grad_norm": 14.732656478881836, + "learning_rate": 1.6836724466272427e-05, + "loss": 3.2241, + "step": 1579 + }, + { + "epoch": 0.48, + "grad_norm": 15.800749778747559, + "learning_rate": 1.6834719855668037e-05, + "loss": 2.4581, + "step": 1580 + }, + { + "epoch": 0.48, + "grad_norm": 14.257530212402344, + "learning_rate": 1.683271524506365e-05, + "loss": 2.6594, + "step": 1581 + }, + { + "epoch": 0.48, + "grad_norm": 18.350427627563477, + "learning_rate": 1.6830710634459257e-05, + "loss": 2.8644, + "step": 1582 + }, + { + "epoch": 0.48, + "grad_norm": 17.27747344970703, + "learning_rate": 1.6828706023854867e-05, + "loss": 2.8629, + "step": 1583 + }, + { + "epoch": 0.48, + "grad_norm": 13.865232467651367, + "learning_rate": 1.6826701413250477e-05, + "loss": 2.115, + "step": 1584 + }, + { + "epoch": 0.48, + "grad_norm": 26.224536895751953, + "learning_rate": 1.6824696802646087e-05, + "loss": 3.4931, + "step": 1585 + }, + { + "epoch": 0.48, + "grad_norm": 10.468141555786133, + "learning_rate": 1.6822692192041697e-05, + "loss": 1.9319, + "step": 1586 + }, + { + "epoch": 0.48, + "grad_norm": 10.993379592895508, + "learning_rate": 1.6820687581437307e-05, + "loss": 2.8885, + "step": 1587 + }, + { + "epoch": 0.48, + "grad_norm": 18.79608726501465, + "learning_rate": 1.6818682970832917e-05, + "loss": 3.4616, + "step": 1588 + }, + { + "epoch": 0.48, + "grad_norm": 23.80633544921875, + "learning_rate": 1.6816678360228528e-05, + "loss": 3.5015, + "step": 1589 + }, + { + "epoch": 0.48, + "grad_norm": 13.126861572265625, + "learning_rate": 1.6814673749624138e-05, + "loss": 2.2426, + "step": 1590 + }, + { + "epoch": 0.48, + "grad_norm": 18.61846923828125, + "learning_rate": 1.6812669139019748e-05, + "loss": 2.8252, + "step": 1591 + }, + { + "epoch": 0.48, + "grad_norm": 9.052312850952148, + "learning_rate": 1.6810664528415358e-05, + "loss": 2.1735, + "step": 1592 + }, + { + "epoch": 0.48, + "grad_norm": 9.982820510864258, + "learning_rate": 1.6808659917810964e-05, + "loss": 1.8825, + "step": 1593 + }, + { + "epoch": 0.48, + "grad_norm": 18.984390258789062, + "learning_rate": 1.6806655307206578e-05, + "loss": 3.0772, + "step": 1594 + }, + { + "epoch": 0.48, + "grad_norm": 19.849212646484375, + "learning_rate": 1.6804650696602188e-05, + "loss": 2.8981, + "step": 1595 + }, + { + "epoch": 0.48, + "grad_norm": 15.359475135803223, + "learning_rate": 1.6802646085997795e-05, + "loss": 2.7745, + "step": 1596 + }, + { + "epoch": 0.48, + "grad_norm": 13.44256591796875, + "learning_rate": 1.6800641475393408e-05, + "loss": 2.4322, + "step": 1597 + }, + { + "epoch": 0.48, + "grad_norm": 19.221418380737305, + "learning_rate": 1.6798636864789015e-05, + "loss": 1.8762, + "step": 1598 + }, + { + "epoch": 0.48, + "grad_norm": 20.303268432617188, + "learning_rate": 1.6796632254184625e-05, + "loss": 1.7788, + "step": 1599 + }, + { + "epoch": 0.48, + "grad_norm": 19.06624412536621, + "learning_rate": 1.679462764358024e-05, + "loss": 3.7968, + "step": 1600 + }, + { + "epoch": 0.48, + "grad_norm": 10.663858413696289, + "learning_rate": 1.6792623032975845e-05, + "loss": 1.1838, + "step": 1601 + }, + { + "epoch": 0.48, + "grad_norm": 16.027814865112305, + "learning_rate": 1.6790618422371455e-05, + "loss": 2.7883, + "step": 1602 + }, + { + "epoch": 0.48, + "grad_norm": 15.941658020019531, + "learning_rate": 1.6788613811767065e-05, + "loss": 2.9455, + "step": 1603 + }, + { + "epoch": 0.48, + "grad_norm": 16.636554718017578, + "learning_rate": 1.6786609201162675e-05, + "loss": 2.8941, + "step": 1604 + }, + { + "epoch": 0.48, + "grad_norm": 11.56485366821289, + "learning_rate": 1.6784604590558285e-05, + "loss": 2.961, + "step": 1605 + }, + { + "epoch": 0.48, + "grad_norm": 13.037918090820312, + "learning_rate": 1.6782599979953896e-05, + "loss": 2.292, + "step": 1606 + }, + { + "epoch": 0.48, + "grad_norm": 16.472829818725586, + "learning_rate": 1.6780595369349506e-05, + "loss": 3.5172, + "step": 1607 + }, + { + "epoch": 0.48, + "grad_norm": 21.085201263427734, + "learning_rate": 1.6778590758745116e-05, + "loss": 2.8983, + "step": 1608 + }, + { + "epoch": 0.48, + "grad_norm": 43.585323333740234, + "learning_rate": 1.6776586148140726e-05, + "loss": 2.7387, + "step": 1609 + }, + { + "epoch": 0.48, + "grad_norm": 12.80986213684082, + "learning_rate": 1.6774581537536332e-05, + "loss": 2.1192, + "step": 1610 + }, + { + "epoch": 0.48, + "grad_norm": 24.946006774902344, + "learning_rate": 1.6772576926931946e-05, + "loss": 3.2683, + "step": 1611 + }, + { + "epoch": 0.48, + "grad_norm": 10.049246788024902, + "learning_rate": 1.6770572316327556e-05, + "loss": 2.5447, + "step": 1612 + }, + { + "epoch": 0.48, + "grad_norm": 13.763422012329102, + "learning_rate": 1.6768567705723163e-05, + "loss": 2.1387, + "step": 1613 + }, + { + "epoch": 0.49, + "grad_norm": 18.731260299682617, + "learning_rate": 1.6766563095118776e-05, + "loss": 1.9977, + "step": 1614 + }, + { + "epoch": 0.49, + "grad_norm": 11.16000747680664, + "learning_rate": 1.6764558484514383e-05, + "loss": 1.9304, + "step": 1615 + }, + { + "epoch": 0.49, + "grad_norm": 15.650193214416504, + "learning_rate": 1.6762553873909993e-05, + "loss": 3.0744, + "step": 1616 + }, + { + "epoch": 0.49, + "grad_norm": 19.635541915893555, + "learning_rate": 1.6760549263305603e-05, + "loss": 2.1045, + "step": 1617 + }, + { + "epoch": 0.49, + "grad_norm": 17.894126892089844, + "learning_rate": 1.6758544652701213e-05, + "loss": 2.6789, + "step": 1618 + }, + { + "epoch": 0.49, + "grad_norm": 20.0896053314209, + "learning_rate": 1.6756540042096823e-05, + "loss": 2.6357, + "step": 1619 + }, + { + "epoch": 0.49, + "grad_norm": 18.415935516357422, + "learning_rate": 1.6754535431492433e-05, + "loss": 2.9713, + "step": 1620 + }, + { + "epoch": 0.49, + "grad_norm": 22.75372314453125, + "learning_rate": 1.6752530820888043e-05, + "loss": 3.3434, + "step": 1621 + }, + { + "epoch": 0.49, + "grad_norm": 20.71872901916504, + "learning_rate": 1.6750526210283654e-05, + "loss": 2.466, + "step": 1622 + }, + { + "epoch": 0.49, + "grad_norm": 15.37236499786377, + "learning_rate": 1.6748521599679264e-05, + "loss": 3.1419, + "step": 1623 + }, + { + "epoch": 0.49, + "grad_norm": 16.39763641357422, + "learning_rate": 1.6746516989074874e-05, + "loss": 2.6307, + "step": 1624 + }, + { + "epoch": 0.49, + "grad_norm": 12.221854209899902, + "learning_rate": 1.6744512378470484e-05, + "loss": 3.121, + "step": 1625 + }, + { + "epoch": 0.49, + "grad_norm": 12.343154907226562, + "learning_rate": 1.6742507767866094e-05, + "loss": 2.0893, + "step": 1626 + }, + { + "epoch": 0.49, + "grad_norm": 15.398106575012207, + "learning_rate": 1.6740503157261704e-05, + "loss": 2.126, + "step": 1627 + }, + { + "epoch": 0.49, + "grad_norm": 16.43120574951172, + "learning_rate": 1.6738498546657314e-05, + "loss": 3.3734, + "step": 1628 + }, + { + "epoch": 0.49, + "grad_norm": 11.764989852905273, + "learning_rate": 1.673649393605292e-05, + "loss": 1.5352, + "step": 1629 + }, + { + "epoch": 0.49, + "grad_norm": 24.872587203979492, + "learning_rate": 1.6734489325448534e-05, + "loss": 2.9974, + "step": 1630 + }, + { + "epoch": 0.49, + "grad_norm": 16.981983184814453, + "learning_rate": 1.6732484714844144e-05, + "loss": 2.1175, + "step": 1631 + }, + { + "epoch": 0.49, + "grad_norm": 20.261165618896484, + "learning_rate": 1.673048010423975e-05, + "loss": 3.5279, + "step": 1632 + }, + { + "epoch": 0.49, + "grad_norm": 26.34723663330078, + "learning_rate": 1.6728475493635364e-05, + "loss": 3.3491, + "step": 1633 + }, + { + "epoch": 0.49, + "grad_norm": 18.659168243408203, + "learning_rate": 1.672647088303097e-05, + "loss": 2.9663, + "step": 1634 + }, + { + "epoch": 0.49, + "grad_norm": 11.186785697937012, + "learning_rate": 1.672446627242658e-05, + "loss": 2.1397, + "step": 1635 + }, + { + "epoch": 0.49, + "grad_norm": 9.327524185180664, + "learning_rate": 1.672246166182219e-05, + "loss": 1.744, + "step": 1636 + }, + { + "epoch": 0.49, + "grad_norm": 17.69675064086914, + "learning_rate": 1.67204570512178e-05, + "loss": 2.9865, + "step": 1637 + }, + { + "epoch": 0.49, + "grad_norm": 21.239151000976562, + "learning_rate": 1.671845244061341e-05, + "loss": 2.8212, + "step": 1638 + }, + { + "epoch": 0.49, + "grad_norm": 41.4120979309082, + "learning_rate": 1.671644783000902e-05, + "loss": 2.7785, + "step": 1639 + }, + { + "epoch": 0.49, + "grad_norm": 11.790454864501953, + "learning_rate": 1.671444321940463e-05, + "loss": 2.2797, + "step": 1640 + }, + { + "epoch": 0.49, + "grad_norm": 17.779918670654297, + "learning_rate": 1.6712438608800242e-05, + "loss": 3.0112, + "step": 1641 + }, + { + "epoch": 0.49, + "grad_norm": 16.839773178100586, + "learning_rate": 1.6710433998195852e-05, + "loss": 2.4351, + "step": 1642 + }, + { + "epoch": 0.49, + "grad_norm": 15.281671524047852, + "learning_rate": 1.6708429387591462e-05, + "loss": 1.9823, + "step": 1643 + }, + { + "epoch": 0.49, + "grad_norm": 19.457735061645508, + "learning_rate": 1.6706424776987072e-05, + "loss": 3.6049, + "step": 1644 + }, + { + "epoch": 0.49, + "grad_norm": 16.624021530151367, + "learning_rate": 1.6704420166382682e-05, + "loss": 3.3101, + "step": 1645 + }, + { + "epoch": 0.49, + "grad_norm": 14.08040714263916, + "learning_rate": 1.6702415555778292e-05, + "loss": 2.4127, + "step": 1646 + }, + { + "epoch": 0.5, + "grad_norm": 23.419265747070312, + "learning_rate": 1.6700410945173902e-05, + "loss": 3.5622, + "step": 1647 + }, + { + "epoch": 0.5, + "grad_norm": 12.655960083007812, + "learning_rate": 1.669840633456951e-05, + "loss": 1.7092, + "step": 1648 + }, + { + "epoch": 0.5, + "grad_norm": 10.199254989624023, + "learning_rate": 1.6696401723965122e-05, + "loss": 1.9251, + "step": 1649 + }, + { + "epoch": 0.5, + "grad_norm": 30.36039161682129, + "learning_rate": 1.6694397113360732e-05, + "loss": 1.9794, + "step": 1650 + }, + { + "epoch": 0.5, + "grad_norm": 20.610151290893555, + "learning_rate": 1.669239250275634e-05, + "loss": 2.0625, + "step": 1651 + }, + { + "epoch": 0.5, + "grad_norm": 11.706425666809082, + "learning_rate": 1.6690387892151953e-05, + "loss": 1.8749, + "step": 1652 + }, + { + "epoch": 0.5, + "grad_norm": 9.867399215698242, + "learning_rate": 1.668838328154756e-05, + "loss": 2.4526, + "step": 1653 + }, + { + "epoch": 0.5, + "grad_norm": 21.617414474487305, + "learning_rate": 1.668637867094317e-05, + "loss": 4.5465, + "step": 1654 + }, + { + "epoch": 0.5, + "grad_norm": 21.315481185913086, + "learning_rate": 1.6684374060338783e-05, + "loss": 2.3748, + "step": 1655 + }, + { + "epoch": 0.5, + "grad_norm": 19.910005569458008, + "learning_rate": 1.668236944973439e-05, + "loss": 2.4007, + "step": 1656 + }, + { + "epoch": 0.5, + "grad_norm": 11.719648361206055, + "learning_rate": 1.668036483913e-05, + "loss": 2.5453, + "step": 1657 + }, + { + "epoch": 0.5, + "grad_norm": 14.366843223571777, + "learning_rate": 1.667836022852561e-05, + "loss": 2.7823, + "step": 1658 + }, + { + "epoch": 0.5, + "grad_norm": 20.43113136291504, + "learning_rate": 1.667635561792122e-05, + "loss": 3.0117, + "step": 1659 + }, + { + "epoch": 0.5, + "grad_norm": 25.725727081298828, + "learning_rate": 1.667435100731683e-05, + "loss": 1.9852, + "step": 1660 + }, + { + "epoch": 0.5, + "grad_norm": 12.890719413757324, + "learning_rate": 1.667234639671244e-05, + "loss": 2.8881, + "step": 1661 + }, + { + "epoch": 0.5, + "grad_norm": 15.555781364440918, + "learning_rate": 1.667034178610805e-05, + "loss": 3.1708, + "step": 1662 + }, + { + "epoch": 0.5, + "grad_norm": 13.397451400756836, + "learning_rate": 1.666833717550366e-05, + "loss": 2.0334, + "step": 1663 + }, + { + "epoch": 0.5, + "grad_norm": 12.852067947387695, + "learning_rate": 1.666633256489927e-05, + "loss": 3.193, + "step": 1664 + }, + { + "epoch": 0.5, + "grad_norm": 17.795087814331055, + "learning_rate": 1.666432795429488e-05, + "loss": 2.1889, + "step": 1665 + }, + { + "epoch": 0.5, + "grad_norm": 10.84477710723877, + "learning_rate": 1.666232334369049e-05, + "loss": 2.1249, + "step": 1666 + }, + { + "epoch": 0.5, + "grad_norm": 13.947750091552734, + "learning_rate": 1.66603187330861e-05, + "loss": 2.1903, + "step": 1667 + }, + { + "epoch": 0.5, + "grad_norm": 15.641617774963379, + "learning_rate": 1.665831412248171e-05, + "loss": 1.4892, + "step": 1668 + }, + { + "epoch": 0.5, + "grad_norm": 17.841720581054688, + "learning_rate": 1.665630951187732e-05, + "loss": 2.2156, + "step": 1669 + }, + { + "epoch": 0.5, + "grad_norm": 21.81488800048828, + "learning_rate": 1.6654304901272927e-05, + "loss": 2.7059, + "step": 1670 + }, + { + "epoch": 0.5, + "grad_norm": 10.093061447143555, + "learning_rate": 1.665230029066854e-05, + "loss": 2.087, + "step": 1671 + }, + { + "epoch": 0.5, + "grad_norm": 12.38084602355957, + "learning_rate": 1.6650295680064148e-05, + "loss": 2.6622, + "step": 1672 + }, + { + "epoch": 0.5, + "grad_norm": 13.34916877746582, + "learning_rate": 1.6648291069459758e-05, + "loss": 3.5172, + "step": 1673 + }, + { + "epoch": 0.5, + "grad_norm": 14.111736297607422, + "learning_rate": 1.664628645885537e-05, + "loss": 2.5334, + "step": 1674 + }, + { + "epoch": 0.5, + "grad_norm": 19.57823944091797, + "learning_rate": 1.6644281848250978e-05, + "loss": 2.8923, + "step": 1675 + }, + { + "epoch": 0.5, + "grad_norm": 20.907072067260742, + "learning_rate": 1.6642277237646588e-05, + "loss": 2.7302, + "step": 1676 + }, + { + "epoch": 0.5, + "grad_norm": 12.42391300201416, + "learning_rate": 1.6640272627042198e-05, + "loss": 2.4441, + "step": 1677 + }, + { + "epoch": 0.5, + "grad_norm": 39.446815490722656, + "learning_rate": 1.6638268016437808e-05, + "loss": 3.3596, + "step": 1678 + }, + { + "epoch": 0.5, + "grad_norm": 10.514243125915527, + "learning_rate": 1.6636263405833418e-05, + "loss": 1.9339, + "step": 1679 + }, + { + "epoch": 0.51, + "grad_norm": 22.982282638549805, + "learning_rate": 1.6634258795229028e-05, + "loss": 2.6203, + "step": 1680 + }, + { + "epoch": 0.51, + "eval_loss": 0.421953409910202, + "eval_runtime": 43.6652, + "eval_samples_per_second": 33.871, + "eval_steps_per_second": 33.871, + "step": 1680 + }, + { + "epoch": 0.51, + "grad_norm": 31.43738555908203, + "learning_rate": 1.663225418462464e-05, + "loss": 2.5882, + "step": 1681 + }, + { + "epoch": 0.51, + "grad_norm": 10.706387519836426, + "learning_rate": 1.663024957402025e-05, + "loss": 2.1867, + "step": 1682 + }, + { + "epoch": 0.51, + "grad_norm": 15.15072250366211, + "learning_rate": 1.662824496341586e-05, + "loss": 2.9213, + "step": 1683 + }, + { + "epoch": 0.51, + "grad_norm": 9.624457359313965, + "learning_rate": 1.6626240352811465e-05, + "loss": 1.8701, + "step": 1684 + }, + { + "epoch": 0.51, + "grad_norm": 9.49343490600586, + "learning_rate": 1.662423574220708e-05, + "loss": 1.612, + "step": 1685 + }, + { + "epoch": 0.51, + "grad_norm": 28.716997146606445, + "learning_rate": 1.662223113160269e-05, + "loss": 3.3154, + "step": 1686 + }, + { + "epoch": 0.51, + "grad_norm": 9.385660171508789, + "learning_rate": 1.66202265209983e-05, + "loss": 2.5589, + "step": 1687 + }, + { + "epoch": 0.51, + "grad_norm": 14.978555679321289, + "learning_rate": 1.661822191039391e-05, + "loss": 2.1595, + "step": 1688 + }, + { + "epoch": 0.51, + "grad_norm": 18.335002899169922, + "learning_rate": 1.6616217299789516e-05, + "loss": 2.4189, + "step": 1689 + }, + { + "epoch": 0.51, + "grad_norm": 16.582286834716797, + "learning_rate": 1.661421268918513e-05, + "loss": 3.3232, + "step": 1690 + }, + { + "epoch": 0.51, + "grad_norm": 13.659370422363281, + "learning_rate": 1.6612208078580736e-05, + "loss": 2.2705, + "step": 1691 + }, + { + "epoch": 0.51, + "grad_norm": 13.425541877746582, + "learning_rate": 1.6610203467976346e-05, + "loss": 2.4391, + "step": 1692 + }, + { + "epoch": 0.51, + "grad_norm": 22.200546264648438, + "learning_rate": 1.660819885737196e-05, + "loss": 2.4271, + "step": 1693 + }, + { + "epoch": 0.51, + "grad_norm": 14.445297241210938, + "learning_rate": 1.6606194246767566e-05, + "loss": 2.3692, + "step": 1694 + }, + { + "epoch": 0.51, + "grad_norm": 14.760397911071777, + "learning_rate": 1.6604189636163176e-05, + "loss": 2.3689, + "step": 1695 + }, + { + "epoch": 0.51, + "grad_norm": 10.618377685546875, + "learning_rate": 1.6602185025558786e-05, + "loss": 2.1075, + "step": 1696 + }, + { + "epoch": 0.51, + "grad_norm": 17.689512252807617, + "learning_rate": 1.6600180414954396e-05, + "loss": 3.0902, + "step": 1697 + }, + { + "epoch": 0.51, + "grad_norm": 8.896835327148438, + "learning_rate": 1.6598175804350006e-05, + "loss": 2.3301, + "step": 1698 + }, + { + "epoch": 0.51, + "grad_norm": 9.717402458190918, + "learning_rate": 1.6596171193745616e-05, + "loss": 1.9009, + "step": 1699 + }, + { + "epoch": 0.51, + "grad_norm": 16.815967559814453, + "learning_rate": 1.6594166583141227e-05, + "loss": 3.0338, + "step": 1700 + }, + { + "epoch": 0.51, + "grad_norm": 33.1158447265625, + "learning_rate": 1.6592161972536837e-05, + "loss": 3.0146, + "step": 1701 + }, + { + "epoch": 0.51, + "grad_norm": 12.088573455810547, + "learning_rate": 1.6590157361932447e-05, + "loss": 3.0398, + "step": 1702 + }, + { + "epoch": 0.51, + "grad_norm": 24.571584701538086, + "learning_rate": 1.6588152751328053e-05, + "loss": 3.3241, + "step": 1703 + }, + { + "epoch": 0.51, + "grad_norm": 23.808401107788086, + "learning_rate": 1.6586148140723667e-05, + "loss": 3.3574, + "step": 1704 + }, + { + "epoch": 0.51, + "grad_norm": 20.748384475708008, + "learning_rate": 1.6584143530119277e-05, + "loss": 3.0009, + "step": 1705 + }, + { + "epoch": 0.51, + "grad_norm": 24.722551345825195, + "learning_rate": 1.6582138919514884e-05, + "loss": 2.8073, + "step": 1706 + }, + { + "epoch": 0.51, + "grad_norm": 16.103742599487305, + "learning_rate": 1.6580134308910497e-05, + "loss": 3.2797, + "step": 1707 + }, + { + "epoch": 0.51, + "grad_norm": 17.112245559692383, + "learning_rate": 1.6578129698306104e-05, + "loss": 2.8844, + "step": 1708 + }, + { + "epoch": 0.51, + "grad_norm": 12.628554344177246, + "learning_rate": 1.6576125087701714e-05, + "loss": 2.5837, + "step": 1709 + }, + { + "epoch": 0.51, + "grad_norm": 15.136996269226074, + "learning_rate": 1.6574120477097327e-05, + "loss": 2.8925, + "step": 1710 + }, + { + "epoch": 0.51, + "grad_norm": 15.211194038391113, + "learning_rate": 1.6572115866492934e-05, + "loss": 2.2617, + "step": 1711 + }, + { + "epoch": 0.51, + "grad_norm": 13.138132095336914, + "learning_rate": 1.6570111255888544e-05, + "loss": 2.1384, + "step": 1712 + }, + { + "epoch": 0.52, + "grad_norm": 15.303442001342773, + "learning_rate": 1.6568106645284154e-05, + "loss": 3.7071, + "step": 1713 + }, + { + "epoch": 0.52, + "grad_norm": 16.974143981933594, + "learning_rate": 1.6566102034679764e-05, + "loss": 2.0731, + "step": 1714 + }, + { + "epoch": 0.52, + "grad_norm": 22.324352264404297, + "learning_rate": 1.6564097424075374e-05, + "loss": 2.5875, + "step": 1715 + }, + { + "epoch": 0.52, + "grad_norm": 17.300701141357422, + "learning_rate": 1.6562092813470984e-05, + "loss": 2.9271, + "step": 1716 + }, + { + "epoch": 0.52, + "grad_norm": 20.104694366455078, + "learning_rate": 1.6560088202866595e-05, + "loss": 3.3379, + "step": 1717 + }, + { + "epoch": 0.52, + "grad_norm": 11.9177827835083, + "learning_rate": 1.6558083592262205e-05, + "loss": 2.2438, + "step": 1718 + }, + { + "epoch": 0.52, + "grad_norm": 12.473027229309082, + "learning_rate": 1.6556078981657815e-05, + "loss": 2.8623, + "step": 1719 + }, + { + "epoch": 0.52, + "grad_norm": 16.957218170166016, + "learning_rate": 1.6554074371053425e-05, + "loss": 2.7224, + "step": 1720 + }, + { + "epoch": 0.52, + "grad_norm": 9.168384552001953, + "learning_rate": 1.6552069760449035e-05, + "loss": 1.4801, + "step": 1721 + }, + { + "epoch": 0.52, + "grad_norm": 14.333914756774902, + "learning_rate": 1.655006514984464e-05, + "loss": 1.834, + "step": 1722 + }, + { + "epoch": 0.52, + "grad_norm": 8.387497901916504, + "learning_rate": 1.6548060539240255e-05, + "loss": 1.4313, + "step": 1723 + }, + { + "epoch": 0.52, + "grad_norm": 10.79047679901123, + "learning_rate": 1.6546055928635865e-05, + "loss": 3.2971, + "step": 1724 + }, + { + "epoch": 0.52, + "grad_norm": 10.988299369812012, + "learning_rate": 1.6544051318031472e-05, + "loss": 2.5413, + "step": 1725 + }, + { + "epoch": 0.52, + "grad_norm": 16.327035903930664, + "learning_rate": 1.6542046707427085e-05, + "loss": 2.1365, + "step": 1726 + }, + { + "epoch": 0.52, + "grad_norm": 10.375070571899414, + "learning_rate": 1.6540042096822692e-05, + "loss": 2.7831, + "step": 1727 + }, + { + "epoch": 0.52, + "grad_norm": 11.78280258178711, + "learning_rate": 1.6538037486218302e-05, + "loss": 2.6977, + "step": 1728 + }, + { + "epoch": 0.52, + "grad_norm": 25.220386505126953, + "learning_rate": 1.6536032875613916e-05, + "loss": 2.6352, + "step": 1729 + }, + { + "epoch": 0.52, + "grad_norm": 38.015220642089844, + "learning_rate": 1.6534028265009522e-05, + "loss": 2.9953, + "step": 1730 + }, + { + "epoch": 0.52, + "grad_norm": 16.3960018157959, + "learning_rate": 1.6532023654405132e-05, + "loss": 2.1179, + "step": 1731 + }, + { + "epoch": 0.52, + "grad_norm": 17.71184730529785, + "learning_rate": 1.6530019043800742e-05, + "loss": 2.3667, + "step": 1732 + }, + { + "epoch": 0.52, + "grad_norm": 11.10054874420166, + "learning_rate": 1.6528014433196353e-05, + "loss": 1.6067, + "step": 1733 + }, + { + "epoch": 0.52, + "grad_norm": 19.382043838500977, + "learning_rate": 1.6526009822591963e-05, + "loss": 3.0277, + "step": 1734 + }, + { + "epoch": 0.52, + "grad_norm": 64.88998413085938, + "learning_rate": 1.6524005211987573e-05, + "loss": 1.9251, + "step": 1735 + }, + { + "epoch": 0.52, + "grad_norm": 16.99798583984375, + "learning_rate": 1.6522000601383183e-05, + "loss": 2.2818, + "step": 1736 + }, + { + "epoch": 0.52, + "grad_norm": 11.27112102508545, + "learning_rate": 1.6519995990778793e-05, + "loss": 2.4658, + "step": 1737 + }, + { + "epoch": 0.52, + "grad_norm": 16.193252563476562, + "learning_rate": 1.6517991380174403e-05, + "loss": 2.023, + "step": 1738 + }, + { + "epoch": 0.52, + "grad_norm": 11.178709983825684, + "learning_rate": 1.6515986769570013e-05, + "loss": 2.0991, + "step": 1739 + }, + { + "epoch": 0.52, + "grad_norm": 12.193035125732422, + "learning_rate": 1.6513982158965623e-05, + "loss": 2.4835, + "step": 1740 + }, + { + "epoch": 0.52, + "grad_norm": 11.212671279907227, + "learning_rate": 1.6511977548361233e-05, + "loss": 1.8206, + "step": 1741 + }, + { + "epoch": 0.52, + "grad_norm": 7.858899116516113, + "learning_rate": 1.6509972937756843e-05, + "loss": 2.2607, + "step": 1742 + }, + { + "epoch": 0.52, + "grad_norm": 34.4583740234375, + "learning_rate": 1.6507968327152453e-05, + "loss": 2.8326, + "step": 1743 + }, + { + "epoch": 0.52, + "grad_norm": 15.727729797363281, + "learning_rate": 1.650596371654806e-05, + "loss": 3.1574, + "step": 1744 + }, + { + "epoch": 0.52, + "grad_norm": 25.254718780517578, + "learning_rate": 1.6503959105943674e-05, + "loss": 3.1741, + "step": 1745 + }, + { + "epoch": 0.52, + "grad_norm": 17.325634002685547, + "learning_rate": 1.650195449533928e-05, + "loss": 2.9572, + "step": 1746 + }, + { + "epoch": 0.53, + "grad_norm": 23.073179244995117, + "learning_rate": 1.649994988473489e-05, + "loss": 1.7773, + "step": 1747 + }, + { + "epoch": 0.53, + "grad_norm": 16.520126342773438, + "learning_rate": 1.6497945274130504e-05, + "loss": 3.4431, + "step": 1748 + }, + { + "epoch": 0.53, + "grad_norm": 18.78542709350586, + "learning_rate": 1.649594066352611e-05, + "loss": 2.6952, + "step": 1749 + }, + { + "epoch": 0.53, + "grad_norm": 19.45966911315918, + "learning_rate": 1.649393605292172e-05, + "loss": 2.005, + "step": 1750 + }, + { + "epoch": 0.53, + "grad_norm": 11.350322723388672, + "learning_rate": 1.649193144231733e-05, + "loss": 1.6326, + "step": 1751 + }, + { + "epoch": 0.53, + "grad_norm": 18.4759521484375, + "learning_rate": 1.648992683171294e-05, + "loss": 2.4786, + "step": 1752 + }, + { + "epoch": 0.53, + "grad_norm": 15.755661010742188, + "learning_rate": 1.648792222110855e-05, + "loss": 1.7824, + "step": 1753 + }, + { + "epoch": 0.53, + "grad_norm": 46.813907623291016, + "learning_rate": 1.648591761050416e-05, + "loss": 3.5372, + "step": 1754 + }, + { + "epoch": 0.53, + "grad_norm": 11.6588134765625, + "learning_rate": 1.648391299989977e-05, + "loss": 1.7987, + "step": 1755 + }, + { + "epoch": 0.53, + "grad_norm": 15.711779594421387, + "learning_rate": 1.648190838929538e-05, + "loss": 2.7252, + "step": 1756 + }, + { + "epoch": 0.53, + "grad_norm": 9.573955535888672, + "learning_rate": 1.647990377869099e-05, + "loss": 2.4631, + "step": 1757 + }, + { + "epoch": 0.53, + "grad_norm": 9.92371940612793, + "learning_rate": 1.64778991680866e-05, + "loss": 2.0796, + "step": 1758 + }, + { + "epoch": 0.53, + "grad_norm": 26.95187759399414, + "learning_rate": 1.647589455748221e-05, + "loss": 2.8893, + "step": 1759 + }, + { + "epoch": 0.53, + "grad_norm": 9.273636817932129, + "learning_rate": 1.647388994687782e-05, + "loss": 2.0131, + "step": 1760 + }, + { + "epoch": 0.53, + "grad_norm": 19.220373153686523, + "learning_rate": 1.647188533627343e-05, + "loss": 3.017, + "step": 1761 + }, + { + "epoch": 0.53, + "grad_norm": 34.35186767578125, + "learning_rate": 1.646988072566904e-05, + "loss": 3.3928, + "step": 1762 + }, + { + "epoch": 0.53, + "grad_norm": 36.41823959350586, + "learning_rate": 1.6467876115064648e-05, + "loss": 3.6049, + "step": 1763 + }, + { + "epoch": 0.53, + "grad_norm": 14.849743843078613, + "learning_rate": 1.6465871504460262e-05, + "loss": 1.9525, + "step": 1764 + }, + { + "epoch": 0.53, + "grad_norm": 20.26209259033203, + "learning_rate": 1.646386689385587e-05, + "loss": 2.2319, + "step": 1765 + }, + { + "epoch": 0.53, + "grad_norm": 24.784208297729492, + "learning_rate": 1.646186228325148e-05, + "loss": 2.5943, + "step": 1766 + }, + { + "epoch": 0.53, + "grad_norm": 8.97102165222168, + "learning_rate": 1.6459857672647092e-05, + "loss": 2.0995, + "step": 1767 + }, + { + "epoch": 0.53, + "grad_norm": 12.025717735290527, + "learning_rate": 1.64578530620427e-05, + "loss": 1.3348, + "step": 1768 + }, + { + "epoch": 0.53, + "grad_norm": 22.216552734375, + "learning_rate": 1.645584845143831e-05, + "loss": 2.9476, + "step": 1769 + }, + { + "epoch": 0.53, + "grad_norm": 16.27435874938965, + "learning_rate": 1.645384384083392e-05, + "loss": 2.9931, + "step": 1770 + }, + { + "epoch": 0.53, + "grad_norm": 23.192441940307617, + "learning_rate": 1.645183923022953e-05, + "loss": 3.2747, + "step": 1771 + }, + { + "epoch": 0.53, + "grad_norm": 26.984561920166016, + "learning_rate": 1.644983461962514e-05, + "loss": 3.2546, + "step": 1772 + }, + { + "epoch": 0.53, + "grad_norm": 16.29502296447754, + "learning_rate": 1.644783000902075e-05, + "loss": 2.9974, + "step": 1773 + }, + { + "epoch": 0.53, + "grad_norm": 15.848845481872559, + "learning_rate": 1.644582539841636e-05, + "loss": 3.1271, + "step": 1774 + }, + { + "epoch": 0.53, + "grad_norm": 34.28373336791992, + "learning_rate": 1.644382078781197e-05, + "loss": 3.1274, + "step": 1775 + }, + { + "epoch": 0.53, + "grad_norm": 18.812297821044922, + "learning_rate": 1.644181617720758e-05, + "loss": 2.0213, + "step": 1776 + }, + { + "epoch": 0.53, + "grad_norm": 23.698867797851562, + "learning_rate": 1.6439811566603186e-05, + "loss": 3.1921, + "step": 1777 + }, + { + "epoch": 0.53, + "grad_norm": 8.073270797729492, + "learning_rate": 1.64378069559988e-05, + "loss": 2.1989, + "step": 1778 + }, + { + "epoch": 0.53, + "grad_norm": 18.90115737915039, + "learning_rate": 1.643580234539441e-05, + "loss": 3.2046, + "step": 1779 + }, + { + "epoch": 0.54, + "grad_norm": 12.170257568359375, + "learning_rate": 1.6433797734790016e-05, + "loss": 3.0221, + "step": 1780 + }, + { + "epoch": 0.54, + "grad_norm": 19.89399528503418, + "learning_rate": 1.643179312418563e-05, + "loss": 3.4413, + "step": 1781 + }, + { + "epoch": 0.54, + "grad_norm": 14.685297012329102, + "learning_rate": 1.6429788513581237e-05, + "loss": 2.2774, + "step": 1782 + }, + { + "epoch": 0.54, + "grad_norm": 29.041440963745117, + "learning_rate": 1.6427783902976847e-05, + "loss": 2.4153, + "step": 1783 + }, + { + "epoch": 0.54, + "grad_norm": 13.65660285949707, + "learning_rate": 1.642577929237246e-05, + "loss": 2.6716, + "step": 1784 + }, + { + "epoch": 0.54, + "grad_norm": 11.705273628234863, + "learning_rate": 1.6423774681768067e-05, + "loss": 1.8001, + "step": 1785 + }, + { + "epoch": 0.54, + "grad_norm": 25.235740661621094, + "learning_rate": 1.6421770071163677e-05, + "loss": 3.1693, + "step": 1786 + }, + { + "epoch": 0.54, + "grad_norm": 15.884099960327148, + "learning_rate": 1.6419765460559287e-05, + "loss": 2.4285, + "step": 1787 + }, + { + "epoch": 0.54, + "grad_norm": 52.04803466796875, + "learning_rate": 1.6417760849954897e-05, + "loss": 4.0988, + "step": 1788 + }, + { + "epoch": 0.54, + "grad_norm": 42.220130920410156, + "learning_rate": 1.6415756239350507e-05, + "loss": 2.3157, + "step": 1789 + }, + { + "epoch": 0.54, + "grad_norm": 19.528057098388672, + "learning_rate": 1.6413751628746117e-05, + "loss": 2.153, + "step": 1790 + }, + { + "epoch": 0.54, + "grad_norm": 10.022256851196289, + "learning_rate": 1.6411747018141727e-05, + "loss": 2.6832, + "step": 1791 + }, + { + "epoch": 0.54, + "grad_norm": 14.05746841430664, + "learning_rate": 1.6409742407537337e-05, + "loss": 2.6188, + "step": 1792 + }, + { + "epoch": 0.54, + "grad_norm": 17.109386444091797, + "learning_rate": 1.6407737796932947e-05, + "loss": 2.4536, + "step": 1793 + }, + { + "epoch": 0.54, + "grad_norm": 14.443997383117676, + "learning_rate": 1.6405733186328558e-05, + "loss": 2.4826, + "step": 1794 + }, + { + "epoch": 0.54, + "grad_norm": 29.712779998779297, + "learning_rate": 1.6403728575724168e-05, + "loss": 3.0167, + "step": 1795 + }, + { + "epoch": 0.54, + "grad_norm": 19.298267364501953, + "learning_rate": 1.6401723965119778e-05, + "loss": 2.9807, + "step": 1796 + }, + { + "epoch": 0.54, + "grad_norm": 15.033485412597656, + "learning_rate": 1.6399719354515388e-05, + "loss": 2.6656, + "step": 1797 + }, + { + "epoch": 0.54, + "grad_norm": 16.12212371826172, + "learning_rate": 1.6397714743910998e-05, + "loss": 2.3691, + "step": 1798 + }, + { + "epoch": 0.54, + "grad_norm": 26.953134536743164, + "learning_rate": 1.6395710133306605e-05, + "loss": 3.2797, + "step": 1799 + }, + { + "epoch": 0.54, + "grad_norm": 9.630372047424316, + "learning_rate": 1.6393705522702218e-05, + "loss": 1.7721, + "step": 1800 + }, + { + "epoch": 0.54, + "eval_loss": 0.3416885435581207, + "eval_runtime": 43.4913, + "eval_samples_per_second": 34.007, + "eval_steps_per_second": 34.007, + "step": 1800 + }, + { + "epoch": 0.54, + "grad_norm": 18.797924041748047, + "learning_rate": 1.6391700912097825e-05, + "loss": 1.5968, + "step": 1801 + }, + { + "epoch": 0.54, + "grad_norm": 31.31778907775879, + "learning_rate": 1.6389696301493435e-05, + "loss": 2.9072, + "step": 1802 + }, + { + "epoch": 0.54, + "grad_norm": 14.117547035217285, + "learning_rate": 1.6387691690889048e-05, + "loss": 2.4949, + "step": 1803 + }, + { + "epoch": 0.54, + "grad_norm": 12.579538345336914, + "learning_rate": 1.6385687080284655e-05, + "loss": 1.8951, + "step": 1804 + }, + { + "epoch": 0.54, + "grad_norm": 23.75373077392578, + "learning_rate": 1.6383682469680265e-05, + "loss": 1.9489, + "step": 1805 + }, + { + "epoch": 0.54, + "grad_norm": 13.660398483276367, + "learning_rate": 1.6381677859075875e-05, + "loss": 1.6119, + "step": 1806 + }, + { + "epoch": 0.54, + "grad_norm": 60.78506851196289, + "learning_rate": 1.6379673248471485e-05, + "loss": 2.7651, + "step": 1807 + }, + { + "epoch": 0.54, + "grad_norm": 14.931727409362793, + "learning_rate": 1.6377668637867095e-05, + "loss": 2.2028, + "step": 1808 + }, + { + "epoch": 0.54, + "grad_norm": 10.574440956115723, + "learning_rate": 1.6375664027262705e-05, + "loss": 2.6472, + "step": 1809 + }, + { + "epoch": 0.54, + "grad_norm": 14.71934986114502, + "learning_rate": 1.6373659416658315e-05, + "loss": 2.3538, + "step": 1810 + }, + { + "epoch": 0.54, + "grad_norm": 16.00834846496582, + "learning_rate": 1.6371654806053926e-05, + "loss": 2.8576, + "step": 1811 + }, + { + "epoch": 0.54, + "grad_norm": 15.810260772705078, + "learning_rate": 1.6369650195449536e-05, + "loss": 2.9982, + "step": 1812 + }, + { + "epoch": 0.55, + "grad_norm": 29.83978843688965, + "learning_rate": 1.6367645584845146e-05, + "loss": 3.0221, + "step": 1813 + }, + { + "epoch": 0.55, + "grad_norm": 16.944915771484375, + "learning_rate": 1.6365640974240756e-05, + "loss": 2.8306, + "step": 1814 + }, + { + "epoch": 0.55, + "grad_norm": 22.454910278320312, + "learning_rate": 1.6363636363636366e-05, + "loss": 2.3444, + "step": 1815 + }, + { + "epoch": 0.55, + "grad_norm": 16.41376495361328, + "learning_rate": 1.6361631753031976e-05, + "loss": 1.8047, + "step": 1816 + }, + { + "epoch": 0.55, + "grad_norm": 4576.21875, + "learning_rate": 1.6359627142427586e-05, + "loss": 2.2836, + "step": 1817 + }, + { + "epoch": 0.55, + "grad_norm": 15.997746467590332, + "learning_rate": 1.6357622531823193e-05, + "loss": 2.5703, + "step": 1818 + }, + { + "epoch": 0.55, + "grad_norm": 15.037249565124512, + "learning_rate": 1.6355617921218806e-05, + "loss": 3.2597, + "step": 1819 + }, + { + "epoch": 0.55, + "grad_norm": 13.758872032165527, + "learning_rate": 1.6353613310614413e-05, + "loss": 3.4616, + "step": 1820 + }, + { + "epoch": 0.55, + "grad_norm": 10.099397659301758, + "learning_rate": 1.6351608700010023e-05, + "loss": 1.96, + "step": 1821 + }, + { + "epoch": 0.55, + "grad_norm": 32.01512908935547, + "learning_rate": 1.6349604089405636e-05, + "loss": 4.0462, + "step": 1822 + }, + { + "epoch": 0.55, + "grad_norm": 17.568777084350586, + "learning_rate": 1.6347599478801243e-05, + "loss": 2.5459, + "step": 1823 + }, + { + "epoch": 0.55, + "grad_norm": 20.525028228759766, + "learning_rate": 1.6345594868196853e-05, + "loss": 2.6286, + "step": 1824 + }, + { + "epoch": 0.55, + "grad_norm": 18.882827758789062, + "learning_rate": 1.6343590257592463e-05, + "loss": 2.5098, + "step": 1825 + }, + { + "epoch": 0.55, + "grad_norm": 10.823426246643066, + "learning_rate": 1.6341585646988073e-05, + "loss": 1.6721, + "step": 1826 + }, + { + "epoch": 0.55, + "grad_norm": 14.602689743041992, + "learning_rate": 1.6339581036383684e-05, + "loss": 2.6786, + "step": 1827 + }, + { + "epoch": 0.55, + "grad_norm": 17.44394302368164, + "learning_rate": 1.6337576425779294e-05, + "loss": 2.5762, + "step": 1828 + }, + { + "epoch": 0.55, + "grad_norm": 19.712026596069336, + "learning_rate": 1.6335571815174904e-05, + "loss": 2.6143, + "step": 1829 + }, + { + "epoch": 0.55, + "grad_norm": 25.916528701782227, + "learning_rate": 1.6333567204570514e-05, + "loss": 2.5159, + "step": 1830 + }, + { + "epoch": 0.55, + "grad_norm": 17.429548263549805, + "learning_rate": 1.6331562593966124e-05, + "loss": 3.5722, + "step": 1831 + }, + { + "epoch": 0.55, + "grad_norm": 14.735082626342773, + "learning_rate": 1.6329557983361734e-05, + "loss": 2.0687, + "step": 1832 + }, + { + "epoch": 0.55, + "grad_norm": 19.132709503173828, + "learning_rate": 1.6327553372757344e-05, + "loss": 2.9178, + "step": 1833 + }, + { + "epoch": 0.55, + "grad_norm": 20.842416763305664, + "learning_rate": 1.6325548762152954e-05, + "loss": 3.3049, + "step": 1834 + }, + { + "epoch": 0.55, + "grad_norm": 14.198040008544922, + "learning_rate": 1.6323544151548564e-05, + "loss": 2.8631, + "step": 1835 + }, + { + "epoch": 0.55, + "grad_norm": 12.003619194030762, + "learning_rate": 1.6321539540944174e-05, + "loss": 2.0466, + "step": 1836 + }, + { + "epoch": 0.55, + "grad_norm": 14.629396438598633, + "learning_rate": 1.631953493033978e-05, + "loss": 2.967, + "step": 1837 + }, + { + "epoch": 0.55, + "grad_norm": 15.829573631286621, + "learning_rate": 1.6317530319735394e-05, + "loss": 2.1286, + "step": 1838 + }, + { + "epoch": 0.55, + "grad_norm": 14.251692771911621, + "learning_rate": 1.6315525709131e-05, + "loss": 2.4073, + "step": 1839 + }, + { + "epoch": 0.55, + "grad_norm": 25.430139541625977, + "learning_rate": 1.631352109852661e-05, + "loss": 2.9292, + "step": 1840 + }, + { + "epoch": 0.55, + "grad_norm": 27.490047454833984, + "learning_rate": 1.6311516487922225e-05, + "loss": 3.4921, + "step": 1841 + }, + { + "epoch": 0.55, + "grad_norm": 9.101457595825195, + "learning_rate": 1.630951187731783e-05, + "loss": 2.1141, + "step": 1842 + }, + { + "epoch": 0.55, + "grad_norm": 12.929884910583496, + "learning_rate": 1.630750726671344e-05, + "loss": 1.934, + "step": 1843 + }, + { + "epoch": 0.55, + "grad_norm": 11.764657020568848, + "learning_rate": 1.630550265610905e-05, + "loss": 2.0135, + "step": 1844 + }, + { + "epoch": 0.55, + "grad_norm": 15.838483810424805, + "learning_rate": 1.630349804550466e-05, + "loss": 2.7645, + "step": 1845 + }, + { + "epoch": 0.56, + "grad_norm": 19.013673782348633, + "learning_rate": 1.6301493434900272e-05, + "loss": 2.2205, + "step": 1846 + }, + { + "epoch": 0.56, + "grad_norm": 12.440937042236328, + "learning_rate": 1.6299488824295882e-05, + "loss": 3.1805, + "step": 1847 + }, + { + "epoch": 0.56, + "grad_norm": 18.47015380859375, + "learning_rate": 1.6297484213691492e-05, + "loss": 2.9228, + "step": 1848 + }, + { + "epoch": 0.56, + "grad_norm": 26.46478843688965, + "learning_rate": 1.6295479603087102e-05, + "loss": 3.1306, + "step": 1849 + }, + { + "epoch": 0.56, + "grad_norm": 20.930150985717773, + "learning_rate": 1.6293474992482712e-05, + "loss": 2.2428, + "step": 1850 + }, + { + "epoch": 0.56, + "grad_norm": 13.267045021057129, + "learning_rate": 1.629147038187832e-05, + "loss": 2.6697, + "step": 1851 + }, + { + "epoch": 0.56, + "grad_norm": 11.723921775817871, + "learning_rate": 1.6289465771273932e-05, + "loss": 2.8602, + "step": 1852 + }, + { + "epoch": 0.56, + "grad_norm": 11.42064094543457, + "learning_rate": 1.6287461160669542e-05, + "loss": 2.9313, + "step": 1853 + }, + { + "epoch": 0.56, + "grad_norm": 31.033870697021484, + "learning_rate": 1.628545655006515e-05, + "loss": 3.2394, + "step": 1854 + }, + { + "epoch": 0.56, + "grad_norm": 17.858312606811523, + "learning_rate": 1.6283451939460762e-05, + "loss": 2.1223, + "step": 1855 + }, + { + "epoch": 0.56, + "grad_norm": 13.080681800842285, + "learning_rate": 1.628144732885637e-05, + "loss": 2.4824, + "step": 1856 + }, + { + "epoch": 0.56, + "grad_norm": 9.161316871643066, + "learning_rate": 1.6279442718251983e-05, + "loss": 1.5993, + "step": 1857 + }, + { + "epoch": 0.56, + "grad_norm": 15.82925796508789, + "learning_rate": 1.6277438107647593e-05, + "loss": 2.2007, + "step": 1858 + }, + { + "epoch": 0.56, + "grad_norm": 29.120468139648438, + "learning_rate": 1.62754334970432e-05, + "loss": 3.2814, + "step": 1859 + }, + { + "epoch": 0.56, + "grad_norm": 20.672487258911133, + "learning_rate": 1.6273428886438813e-05, + "loss": 2.6652, + "step": 1860 + }, + { + "epoch": 0.56, + "grad_norm": 18.959318161010742, + "learning_rate": 1.627142427583442e-05, + "loss": 2.5797, + "step": 1861 + }, + { + "epoch": 0.56, + "grad_norm": 10.144336700439453, + "learning_rate": 1.626941966523003e-05, + "loss": 2.1145, + "step": 1862 + }, + { + "epoch": 0.56, + "grad_norm": 19.466264724731445, + "learning_rate": 1.626741505462564e-05, + "loss": 2.4139, + "step": 1863 + }, + { + "epoch": 0.56, + "grad_norm": 13.246243476867676, + "learning_rate": 1.626541044402125e-05, + "loss": 2.2984, + "step": 1864 + }, + { + "epoch": 0.56, + "grad_norm": 15.933266639709473, + "learning_rate": 1.626340583341686e-05, + "loss": 2.6167, + "step": 1865 + }, + { + "epoch": 0.56, + "grad_norm": 10.605395317077637, + "learning_rate": 1.626140122281247e-05, + "loss": 2.094, + "step": 1866 + }, + { + "epoch": 0.56, + "grad_norm": 14.293509483337402, + "learning_rate": 1.625939661220808e-05, + "loss": 2.4193, + "step": 1867 + }, + { + "epoch": 0.56, + "grad_norm": 14.355191230773926, + "learning_rate": 1.625739200160369e-05, + "loss": 3.3217, + "step": 1868 + }, + { + "epoch": 0.56, + "grad_norm": 11.297273635864258, + "learning_rate": 1.62553873909993e-05, + "loss": 1.9198, + "step": 1869 + }, + { + "epoch": 0.56, + "grad_norm": 19.611162185668945, + "learning_rate": 1.625338278039491e-05, + "loss": 2.3439, + "step": 1870 + }, + { + "epoch": 0.56, + "grad_norm": 9.530548095703125, + "learning_rate": 1.625137816979052e-05, + "loss": 1.1765, + "step": 1871 + }, + { + "epoch": 0.56, + "grad_norm": 20.621294021606445, + "learning_rate": 1.624937355918613e-05, + "loss": 2.2362, + "step": 1872 + }, + { + "epoch": 0.56, + "grad_norm": 20.514188766479492, + "learning_rate": 1.6247368948581737e-05, + "loss": 3.2915, + "step": 1873 + }, + { + "epoch": 0.56, + "grad_norm": 11.529969215393066, + "learning_rate": 1.624536433797735e-05, + "loss": 1.7275, + "step": 1874 + }, + { + "epoch": 0.56, + "grad_norm": 15.720181465148926, + "learning_rate": 1.6243359727372957e-05, + "loss": 2.0002, + "step": 1875 + }, + { + "epoch": 0.56, + "grad_norm": 20.808303833007812, + "learning_rate": 1.6241355116768567e-05, + "loss": 3.8721, + "step": 1876 + }, + { + "epoch": 0.56, + "grad_norm": 11.557236671447754, + "learning_rate": 1.623935050616418e-05, + "loss": 1.6353, + "step": 1877 + }, + { + "epoch": 0.56, + "grad_norm": 13.909945487976074, + "learning_rate": 1.6237345895559788e-05, + "loss": 2.2762, + "step": 1878 + }, + { + "epoch": 0.56, + "grad_norm": 36.823299407958984, + "learning_rate": 1.6235341284955398e-05, + "loss": 2.6792, + "step": 1879 + }, + { + "epoch": 0.57, + "grad_norm": 73.27782440185547, + "learning_rate": 1.6233336674351008e-05, + "loss": 3.6506, + "step": 1880 + }, + { + "epoch": 0.57, + "grad_norm": 15.239794731140137, + "learning_rate": 1.6231332063746618e-05, + "loss": 2.5237, + "step": 1881 + }, + { + "epoch": 0.57, + "grad_norm": 28.48020362854004, + "learning_rate": 1.6229327453142228e-05, + "loss": 2.8108, + "step": 1882 + }, + { + "epoch": 0.57, + "grad_norm": 9.497574806213379, + "learning_rate": 1.6227322842537838e-05, + "loss": 1.4183, + "step": 1883 + }, + { + "epoch": 0.57, + "grad_norm": 12.64692211151123, + "learning_rate": 1.6225318231933448e-05, + "loss": 2.5578, + "step": 1884 + }, + { + "epoch": 0.57, + "grad_norm": 23.391645431518555, + "learning_rate": 1.6223313621329058e-05, + "loss": 2.03, + "step": 1885 + }, + { + "epoch": 0.57, + "grad_norm": 25.81462860107422, + "learning_rate": 1.622130901072467e-05, + "loss": 2.3658, + "step": 1886 + }, + { + "epoch": 0.57, + "grad_norm": 22.7410831451416, + "learning_rate": 1.621930440012028e-05, + "loss": 2.291, + "step": 1887 + }, + { + "epoch": 0.57, + "grad_norm": 14.532958030700684, + "learning_rate": 1.621729978951589e-05, + "loss": 2.3839, + "step": 1888 + }, + { + "epoch": 0.57, + "grad_norm": 9.742128372192383, + "learning_rate": 1.62152951789115e-05, + "loss": 1.9635, + "step": 1889 + }, + { + "epoch": 0.57, + "grad_norm": 11.445870399475098, + "learning_rate": 1.621329056830711e-05, + "loss": 2.448, + "step": 1890 + }, + { + "epoch": 0.57, + "grad_norm": 17.39885902404785, + "learning_rate": 1.621128595770272e-05, + "loss": 1.9545, + "step": 1891 + }, + { + "epoch": 0.57, + "grad_norm": 11.735391616821289, + "learning_rate": 1.6209281347098325e-05, + "loss": 2.49, + "step": 1892 + }, + { + "epoch": 0.57, + "grad_norm": 31.197790145874023, + "learning_rate": 1.620727673649394e-05, + "loss": 2.4989, + "step": 1893 + }, + { + "epoch": 0.57, + "grad_norm": 28.4355525970459, + "learning_rate": 1.6205272125889546e-05, + "loss": 2.9879, + "step": 1894 + }, + { + "epoch": 0.57, + "grad_norm": 21.622291564941406, + "learning_rate": 1.6203267515285156e-05, + "loss": 3.1001, + "step": 1895 + }, + { + "epoch": 0.57, + "grad_norm": 15.58934497833252, + "learning_rate": 1.620126290468077e-05, + "loss": 2.0499, + "step": 1896 + }, + { + "epoch": 0.57, + "grad_norm": 16.792016983032227, + "learning_rate": 1.6199258294076376e-05, + "loss": 2.5507, + "step": 1897 + }, + { + "epoch": 0.57, + "grad_norm": 15.837790489196777, + "learning_rate": 1.6197253683471986e-05, + "loss": 2.5776, + "step": 1898 + }, + { + "epoch": 0.57, + "grad_norm": 13.746726989746094, + "learning_rate": 1.6195249072867596e-05, + "loss": 1.7229, + "step": 1899 + }, + { + "epoch": 0.57, + "grad_norm": 20.217098236083984, + "learning_rate": 1.6193244462263206e-05, + "loss": 2.7257, + "step": 1900 + }, + { + "epoch": 0.57, + "grad_norm": 14.193788528442383, + "learning_rate": 1.6191239851658816e-05, + "loss": 2.4532, + "step": 1901 + }, + { + "epoch": 0.57, + "grad_norm": 12.840964317321777, + "learning_rate": 1.6189235241054426e-05, + "loss": 1.3137, + "step": 1902 + }, + { + "epoch": 0.57, + "grad_norm": 10.857292175292969, + "learning_rate": 1.6187230630450036e-05, + "loss": 2.3681, + "step": 1903 + }, + { + "epoch": 0.57, + "grad_norm": 11.725610733032227, + "learning_rate": 1.6185226019845646e-05, + "loss": 2.0677, + "step": 1904 + }, + { + "epoch": 0.57, + "grad_norm": 16.389385223388672, + "learning_rate": 1.6183221409241257e-05, + "loss": 2.2652, + "step": 1905 + }, + { + "epoch": 0.57, + "grad_norm": 20.219581604003906, + "learning_rate": 1.6181216798636867e-05, + "loss": 2.3222, + "step": 1906 + }, + { + "epoch": 0.57, + "grad_norm": 15.775784492492676, + "learning_rate": 1.6179212188032477e-05, + "loss": 2.0435, + "step": 1907 + }, + { + "epoch": 0.57, + "grad_norm": 11.360732078552246, + "learning_rate": 1.6177207577428087e-05, + "loss": 2.0024, + "step": 1908 + }, + { + "epoch": 0.57, + "grad_norm": 26.330806732177734, + "learning_rate": 1.6175202966823697e-05, + "loss": 3.1264, + "step": 1909 + }, + { + "epoch": 0.57, + "grad_norm": 16.440622329711914, + "learning_rate": 1.6173198356219307e-05, + "loss": 2.6761, + "step": 1910 + }, + { + "epoch": 0.57, + "grad_norm": 14.55276107788086, + "learning_rate": 1.6171193745614914e-05, + "loss": 2.2345, + "step": 1911 + }, + { + "epoch": 0.57, + "grad_norm": 15.58680534362793, + "learning_rate": 1.6169189135010527e-05, + "loss": 2.8794, + "step": 1912 + }, + { + "epoch": 0.58, + "grad_norm": 27.979393005371094, + "learning_rate": 1.6167184524406137e-05, + "loss": 2.5893, + "step": 1913 + }, + { + "epoch": 0.58, + "grad_norm": 14.484015464782715, + "learning_rate": 1.6165179913801744e-05, + "loss": 2.4981, + "step": 1914 + }, + { + "epoch": 0.58, + "grad_norm": 9.881906509399414, + "learning_rate": 1.6163175303197357e-05, + "loss": 2.1874, + "step": 1915 + }, + { + "epoch": 0.58, + "grad_norm": 17.09141731262207, + "learning_rate": 1.6161170692592964e-05, + "loss": 2.9159, + "step": 1916 + }, + { + "epoch": 0.58, + "grad_norm": 18.972490310668945, + "learning_rate": 1.6159166081988574e-05, + "loss": 1.6856, + "step": 1917 + }, + { + "epoch": 0.58, + "grad_norm": 10.02529525756836, + "learning_rate": 1.6157161471384184e-05, + "loss": 1.7176, + "step": 1918 + }, + { + "epoch": 0.58, + "grad_norm": 21.166505813598633, + "learning_rate": 1.6155156860779794e-05, + "loss": 2.5582, + "step": 1919 + }, + { + "epoch": 0.58, + "grad_norm": 34.15064239501953, + "learning_rate": 1.6153152250175404e-05, + "loss": 4.2157, + "step": 1920 + }, + { + "epoch": 0.58, + "eval_loss": 0.339510053396225, + "eval_runtime": 43.4353, + "eval_samples_per_second": 34.051, + "eval_steps_per_second": 34.051, + "step": 1920 + }, + { + "epoch": 0.58, + "grad_norm": 18.826351165771484, + "learning_rate": 1.6151147639571015e-05, + "loss": 2.8172, + "step": 1921 + }, + { + "epoch": 0.58, + "grad_norm": 28.641447067260742, + "learning_rate": 1.6149143028966625e-05, + "loss": 3.8235, + "step": 1922 + }, + { + "epoch": 0.58, + "grad_norm": 18.224559783935547, + "learning_rate": 1.6147138418362235e-05, + "loss": 1.6789, + "step": 1923 + }, + { + "epoch": 0.58, + "grad_norm": 20.033323287963867, + "learning_rate": 1.6145133807757845e-05, + "loss": 2.2715, + "step": 1924 + }, + { + "epoch": 0.58, + "grad_norm": 18.451032638549805, + "learning_rate": 1.6143129197153455e-05, + "loss": 3.6021, + "step": 1925 + }, + { + "epoch": 0.58, + "grad_norm": 13.090971946716309, + "learning_rate": 1.6141124586549065e-05, + "loss": 2.8423, + "step": 1926 + }, + { + "epoch": 0.58, + "grad_norm": 31.57698631286621, + "learning_rate": 1.6139119975944675e-05, + "loss": 2.1646, + "step": 1927 + }, + { + "epoch": 0.58, + "grad_norm": 19.8615779876709, + "learning_rate": 1.6137115365340285e-05, + "loss": 2.4651, + "step": 1928 + }, + { + "epoch": 0.58, + "grad_norm": 17.440027236938477, + "learning_rate": 1.6135110754735895e-05, + "loss": 2.2323, + "step": 1929 + }, + { + "epoch": 0.58, + "grad_norm": 12.359676361083984, + "learning_rate": 1.6133106144131502e-05, + "loss": 3.0975, + "step": 1930 + }, + { + "epoch": 0.58, + "grad_norm": 7.871943473815918, + "learning_rate": 1.6131101533527115e-05, + "loss": 1.8966, + "step": 1931 + }, + { + "epoch": 0.58, + "grad_norm": 13.588000297546387, + "learning_rate": 1.6129096922922725e-05, + "loss": 2.7859, + "step": 1932 + }, + { + "epoch": 0.58, + "grad_norm": 16.730295181274414, + "learning_rate": 1.6127092312318332e-05, + "loss": 2.4625, + "step": 1933 + }, + { + "epoch": 0.58, + "grad_norm": 13.648134231567383, + "learning_rate": 1.6125087701713946e-05, + "loss": 2.5004, + "step": 1934 + }, + { + "epoch": 0.58, + "grad_norm": 19.520750045776367, + "learning_rate": 1.6123083091109552e-05, + "loss": 3.5212, + "step": 1935 + }, + { + "epoch": 0.58, + "grad_norm": 19.73345184326172, + "learning_rate": 1.6121078480505162e-05, + "loss": 2.2163, + "step": 1936 + }, + { + "epoch": 0.58, + "grad_norm": 11.382865905761719, + "learning_rate": 1.6119073869900772e-05, + "loss": 1.5394, + "step": 1937 + }, + { + "epoch": 0.58, + "grad_norm": 11.32697868347168, + "learning_rate": 1.6117069259296383e-05, + "loss": 1.7365, + "step": 1938 + }, + { + "epoch": 0.58, + "grad_norm": 17.160417556762695, + "learning_rate": 1.6115064648691993e-05, + "loss": 3.0884, + "step": 1939 + }, + { + "epoch": 0.58, + "grad_norm": 10.711620330810547, + "learning_rate": 1.6113060038087603e-05, + "loss": 2.0598, + "step": 1940 + }, + { + "epoch": 0.58, + "grad_norm": 15.657376289367676, + "learning_rate": 1.6111055427483213e-05, + "loss": 2.5557, + "step": 1941 + }, + { + "epoch": 0.58, + "grad_norm": 17.509824752807617, + "learning_rate": 1.6109050816878823e-05, + "loss": 2.2766, + "step": 1942 + }, + { + "epoch": 0.58, + "grad_norm": 10.339923858642578, + "learning_rate": 1.6107046206274433e-05, + "loss": 1.8058, + "step": 1943 + }, + { + "epoch": 0.58, + "grad_norm": 16.32088851928711, + "learning_rate": 1.6105041595670043e-05, + "loss": 2.2214, + "step": 1944 + }, + { + "epoch": 0.58, + "grad_norm": 34.30913543701172, + "learning_rate": 1.6103036985065653e-05, + "loss": 3.2443, + "step": 1945 + }, + { + "epoch": 0.59, + "grad_norm": 13.773527145385742, + "learning_rate": 1.6101032374461263e-05, + "loss": 3.0382, + "step": 1946 + }, + { + "epoch": 0.59, + "grad_norm": 23.005949020385742, + "learning_rate": 1.609902776385687e-05, + "loss": 2.0551, + "step": 1947 + }, + { + "epoch": 0.59, + "grad_norm": 32.260467529296875, + "learning_rate": 1.6097023153252483e-05, + "loss": 2.4883, + "step": 1948 + }, + { + "epoch": 0.59, + "grad_norm": 13.194154739379883, + "learning_rate": 1.609501854264809e-05, + "loss": 2.7221, + "step": 1949 + }, + { + "epoch": 0.59, + "grad_norm": 29.038406372070312, + "learning_rate": 1.60930139320437e-05, + "loss": 2.7329, + "step": 1950 + }, + { + "epoch": 0.59, + "grad_norm": 15.281466484069824, + "learning_rate": 1.6091009321439314e-05, + "loss": 2.5336, + "step": 1951 + }, + { + "epoch": 0.59, + "grad_norm": 14.665143013000488, + "learning_rate": 1.608900471083492e-05, + "loss": 1.9784, + "step": 1952 + }, + { + "epoch": 0.59, + "grad_norm": 17.575735092163086, + "learning_rate": 1.608700010023053e-05, + "loss": 1.5022, + "step": 1953 + }, + { + "epoch": 0.59, + "grad_norm": 16.851743698120117, + "learning_rate": 1.608499548962614e-05, + "loss": 2.1047, + "step": 1954 + }, + { + "epoch": 0.59, + "grad_norm": 17.620439529418945, + "learning_rate": 1.608299087902175e-05, + "loss": 2.4975, + "step": 1955 + }, + { + "epoch": 0.59, + "grad_norm": 18.211750030517578, + "learning_rate": 1.608098626841736e-05, + "loss": 2.441, + "step": 1956 + }, + { + "epoch": 0.59, + "grad_norm": 38.04513168334961, + "learning_rate": 1.607898165781297e-05, + "loss": 3.2777, + "step": 1957 + }, + { + "epoch": 0.59, + "grad_norm": 18.044525146484375, + "learning_rate": 1.607697704720858e-05, + "loss": 3.4203, + "step": 1958 + }, + { + "epoch": 0.59, + "grad_norm": 14.710494995117188, + "learning_rate": 1.607497243660419e-05, + "loss": 2.0862, + "step": 1959 + }, + { + "epoch": 0.59, + "grad_norm": 10.158746719360352, + "learning_rate": 1.60729678259998e-05, + "loss": 1.8249, + "step": 1960 + }, + { + "epoch": 0.59, + "grad_norm": 14.311506271362305, + "learning_rate": 1.607096321539541e-05, + "loss": 2.0706, + "step": 1961 + }, + { + "epoch": 0.59, + "grad_norm": 9.432246208190918, + "learning_rate": 1.606895860479102e-05, + "loss": 2.0623, + "step": 1962 + }, + { + "epoch": 0.59, + "grad_norm": 17.04642105102539, + "learning_rate": 1.606695399418663e-05, + "loss": 2.918, + "step": 1963 + }, + { + "epoch": 0.59, + "grad_norm": 11.562948226928711, + "learning_rate": 1.606494938358224e-05, + "loss": 2.5815, + "step": 1964 + }, + { + "epoch": 0.59, + "grad_norm": 18.638172149658203, + "learning_rate": 1.606294477297785e-05, + "loss": 2.995, + "step": 1965 + }, + { + "epoch": 0.59, + "grad_norm": 14.144832611083984, + "learning_rate": 1.6060940162373458e-05, + "loss": 2.1113, + "step": 1966 + }, + { + "epoch": 0.59, + "grad_norm": 16.268152236938477, + "learning_rate": 1.605893555176907e-05, + "loss": 2.2524, + "step": 1967 + }, + { + "epoch": 0.59, + "grad_norm": 16.423564910888672, + "learning_rate": 1.6056930941164678e-05, + "loss": 2.2185, + "step": 1968 + }, + { + "epoch": 0.59, + "grad_norm": 28.028411865234375, + "learning_rate": 1.605492633056029e-05, + "loss": 2.556, + "step": 1969 + }, + { + "epoch": 0.59, + "grad_norm": 21.24933624267578, + "learning_rate": 1.6052921719955902e-05, + "loss": 2.3257, + "step": 1970 + }, + { + "epoch": 0.59, + "grad_norm": 15.96957778930664, + "learning_rate": 1.605091710935151e-05, + "loss": 2.617, + "step": 1971 + }, + { + "epoch": 0.59, + "grad_norm": 10.580945014953613, + "learning_rate": 1.604891249874712e-05, + "loss": 1.4834, + "step": 1972 + }, + { + "epoch": 0.59, + "grad_norm": 12.444055557250977, + "learning_rate": 1.604690788814273e-05, + "loss": 2.222, + "step": 1973 + }, + { + "epoch": 0.59, + "grad_norm": 12.718807220458984, + "learning_rate": 1.604490327753834e-05, + "loss": 1.986, + "step": 1974 + }, + { + "epoch": 0.59, + "grad_norm": 11.073088645935059, + "learning_rate": 1.604289866693395e-05, + "loss": 2.3385, + "step": 1975 + }, + { + "epoch": 0.59, + "grad_norm": 22.884178161621094, + "learning_rate": 1.604089405632956e-05, + "loss": 3.075, + "step": 1976 + }, + { + "epoch": 0.59, + "grad_norm": 13.408631324768066, + "learning_rate": 1.603888944572517e-05, + "loss": 1.867, + "step": 1977 + }, + { + "epoch": 0.59, + "grad_norm": 20.047677993774414, + "learning_rate": 1.603688483512078e-05, + "loss": 2.3888, + "step": 1978 + }, + { + "epoch": 0.6, + "grad_norm": 56.602203369140625, + "learning_rate": 1.603488022451639e-05, + "loss": 2.9184, + "step": 1979 + }, + { + "epoch": 0.6, + "grad_norm": 23.749616622924805, + "learning_rate": 1.6032875613912e-05, + "loss": 2.7749, + "step": 1980 + }, + { + "epoch": 0.6, + "grad_norm": 26.525043487548828, + "learning_rate": 1.603087100330761e-05, + "loss": 2.1839, + "step": 1981 + }, + { + "epoch": 0.6, + "grad_norm": 32.17291259765625, + "learning_rate": 1.602886639270322e-05, + "loss": 2.7472, + "step": 1982 + }, + { + "epoch": 0.6, + "grad_norm": 13.829222679138184, + "learning_rate": 1.602686178209883e-05, + "loss": 2.3262, + "step": 1983 + }, + { + "epoch": 0.6, + "grad_norm": 10.74743366241455, + "learning_rate": 1.602485717149444e-05, + "loss": 3.0373, + "step": 1984 + }, + { + "epoch": 0.6, + "grad_norm": 19.987829208374023, + "learning_rate": 1.6022852560890046e-05, + "loss": 2.2646, + "step": 1985 + }, + { + "epoch": 0.6, + "grad_norm": 15.228018760681152, + "learning_rate": 1.602084795028566e-05, + "loss": 2.8864, + "step": 1986 + }, + { + "epoch": 0.6, + "grad_norm": 39.494258880615234, + "learning_rate": 1.601884333968127e-05, + "loss": 3.7293, + "step": 1987 + }, + { + "epoch": 0.6, + "grad_norm": 21.58100700378418, + "learning_rate": 1.6016838729076877e-05, + "loss": 3.238, + "step": 1988 + }, + { + "epoch": 0.6, + "grad_norm": 26.791006088256836, + "learning_rate": 1.601483411847249e-05, + "loss": 2.7061, + "step": 1989 + }, + { + "epoch": 0.6, + "grad_norm": 26.071443557739258, + "learning_rate": 1.6012829507868097e-05, + "loss": 2.4725, + "step": 1990 + }, + { + "epoch": 0.6, + "grad_norm": 14.587154388427734, + "learning_rate": 1.6010824897263707e-05, + "loss": 2.7513, + "step": 1991 + }, + { + "epoch": 0.6, + "grad_norm": 19.951908111572266, + "learning_rate": 1.6008820286659317e-05, + "loss": 2.3058, + "step": 1992 + }, + { + "epoch": 0.6, + "grad_norm": 18.910688400268555, + "learning_rate": 1.6006815676054927e-05, + "loss": 2.4444, + "step": 1993 + }, + { + "epoch": 0.6, + "grad_norm": 12.591381072998047, + "learning_rate": 1.6004811065450537e-05, + "loss": 3.0018, + "step": 1994 + }, + { + "epoch": 0.6, + "grad_norm": 25.672998428344727, + "learning_rate": 1.6002806454846147e-05, + "loss": 2.6811, + "step": 1995 + }, + { + "epoch": 0.6, + "grad_norm": 24.982601165771484, + "learning_rate": 1.6000801844241757e-05, + "loss": 3.1955, + "step": 1996 + }, + { + "epoch": 0.6, + "grad_norm": 12.664204597473145, + "learning_rate": 1.5998797233637367e-05, + "loss": 2.2998, + "step": 1997 + }, + { + "epoch": 0.6, + "grad_norm": 10.164942741394043, + "learning_rate": 1.5996792623032977e-05, + "loss": 3.4713, + "step": 1998 + }, + { + "epoch": 0.6, + "grad_norm": 11.533797264099121, + "learning_rate": 1.5994788012428588e-05, + "loss": 2.7492, + "step": 1999 + }, + { + "epoch": 0.6, + "grad_norm": 31.470319747924805, + "learning_rate": 1.5992783401824198e-05, + "loss": 3.0072, + "step": 2000 + }, + { + "epoch": 0.6, + "grad_norm": 17.791658401489258, + "learning_rate": 1.5990778791219808e-05, + "loss": 1.9317, + "step": 2001 + }, + { + "epoch": 0.6, + "grad_norm": 31.150108337402344, + "learning_rate": 1.5988774180615418e-05, + "loss": 2.8161, + "step": 2002 + }, + { + "epoch": 0.6, + "grad_norm": 13.658658027648926, + "learning_rate": 1.5986769570011028e-05, + "loss": 1.9486, + "step": 2003 + }, + { + "epoch": 0.6, + "grad_norm": 13.074492454528809, + "learning_rate": 1.5984764959406635e-05, + "loss": 1.8306, + "step": 2004 + }, + { + "epoch": 0.6, + "grad_norm": 13.934167861938477, + "learning_rate": 1.5982760348802248e-05, + "loss": 2.0469, + "step": 2005 + }, + { + "epoch": 0.6, + "grad_norm": 19.25830078125, + "learning_rate": 1.5980755738197858e-05, + "loss": 3.0445, + "step": 2006 + }, + { + "epoch": 0.6, + "grad_norm": 12.957606315612793, + "learning_rate": 1.5978751127593465e-05, + "loss": 2.2492, + "step": 2007 + }, + { + "epoch": 0.6, + "grad_norm": 18.43289566040039, + "learning_rate": 1.5976746516989078e-05, + "loss": 3.2304, + "step": 2008 + }, + { + "epoch": 0.6, + "grad_norm": 15.042205810546875, + "learning_rate": 1.5974741906384685e-05, + "loss": 2.5782, + "step": 2009 + }, + { + "epoch": 0.6, + "grad_norm": 32.639183044433594, + "learning_rate": 1.5972737295780295e-05, + "loss": 2.2884, + "step": 2010 + }, + { + "epoch": 0.6, + "grad_norm": 14.541574478149414, + "learning_rate": 1.5970732685175905e-05, + "loss": 2.1057, + "step": 2011 + }, + { + "epoch": 0.6, + "grad_norm": 33.099979400634766, + "learning_rate": 1.5968728074571515e-05, + "loss": 2.8465, + "step": 2012 + }, + { + "epoch": 0.61, + "grad_norm": 29.527706146240234, + "learning_rate": 1.5966723463967125e-05, + "loss": 1.8221, + "step": 2013 + }, + { + "epoch": 0.61, + "grad_norm": 12.222841262817383, + "learning_rate": 1.5964718853362735e-05, + "loss": 1.9562, + "step": 2014 + }, + { + "epoch": 0.61, + "grad_norm": 11.430624961853027, + "learning_rate": 1.5962714242758345e-05, + "loss": 1.8148, + "step": 2015 + }, + { + "epoch": 0.61, + "grad_norm": 12.992792129516602, + "learning_rate": 1.5960709632153956e-05, + "loss": 2.0858, + "step": 2016 + }, + { + "epoch": 0.61, + "grad_norm": 25.652822494506836, + "learning_rate": 1.5958705021549566e-05, + "loss": 3.0605, + "step": 2017 + }, + { + "epoch": 0.61, + "grad_norm": 9.427448272705078, + "learning_rate": 1.5956700410945176e-05, + "loss": 2.0717, + "step": 2018 + }, + { + "epoch": 0.61, + "grad_norm": 18.489501953125, + "learning_rate": 1.5954695800340786e-05, + "loss": 2.3319, + "step": 2019 + }, + { + "epoch": 0.61, + "grad_norm": 14.960678100585938, + "learning_rate": 1.5952691189736396e-05, + "loss": 2.3096, + "step": 2020 + }, + { + "epoch": 0.61, + "grad_norm": 17.850452423095703, + "learning_rate": 1.5950686579132003e-05, + "loss": 2.5948, + "step": 2021 + }, + { + "epoch": 0.61, + "grad_norm": 13.320516586303711, + "learning_rate": 1.5948681968527616e-05, + "loss": 2.0685, + "step": 2022 + }, + { + "epoch": 0.61, + "grad_norm": 12.267958641052246, + "learning_rate": 1.5946677357923223e-05, + "loss": 2.1863, + "step": 2023 + }, + { + "epoch": 0.61, + "grad_norm": 17.197010040283203, + "learning_rate": 1.5944672747318833e-05, + "loss": 2.5286, + "step": 2024 + }, + { + "epoch": 0.61, + "grad_norm": 16.6329288482666, + "learning_rate": 1.5942668136714446e-05, + "loss": 2.0742, + "step": 2025 + }, + { + "epoch": 0.61, + "grad_norm": 36.75046157836914, + "learning_rate": 1.5940663526110053e-05, + "loss": 2.7239, + "step": 2026 + }, + { + "epoch": 0.61, + "grad_norm": 14.18947982788086, + "learning_rate": 1.5938658915505663e-05, + "loss": 2.6615, + "step": 2027 + }, + { + "epoch": 0.61, + "grad_norm": 13.752023696899414, + "learning_rate": 1.5936654304901273e-05, + "loss": 1.8863, + "step": 2028 + }, + { + "epoch": 0.61, + "grad_norm": 17.094932556152344, + "learning_rate": 1.5934649694296883e-05, + "loss": 3.2093, + "step": 2029 + }, + { + "epoch": 0.61, + "grad_norm": 16.756057739257812, + "learning_rate": 1.5932645083692497e-05, + "loss": 2.6765, + "step": 2030 + }, + { + "epoch": 0.61, + "grad_norm": 21.48285484313965, + "learning_rate": 1.5930640473088103e-05, + "loss": 2.0404, + "step": 2031 + }, + { + "epoch": 0.61, + "grad_norm": 31.83589744567871, + "learning_rate": 1.5928635862483714e-05, + "loss": 2.708, + "step": 2032 + }, + { + "epoch": 0.61, + "grad_norm": 50.02532958984375, + "learning_rate": 1.5926631251879324e-05, + "loss": 2.5092, + "step": 2033 + }, + { + "epoch": 0.61, + "grad_norm": 11.736242294311523, + "learning_rate": 1.5924626641274934e-05, + "loss": 1.6545, + "step": 2034 + }, + { + "epoch": 0.61, + "grad_norm": 22.306257247924805, + "learning_rate": 1.5922622030670544e-05, + "loss": 2.438, + "step": 2035 + }, + { + "epoch": 0.61, + "grad_norm": 19.088542938232422, + "learning_rate": 1.5920617420066154e-05, + "loss": 2.5573, + "step": 2036 + }, + { + "epoch": 0.61, + "grad_norm": 43.53116989135742, + "learning_rate": 1.5918612809461764e-05, + "loss": 2.4721, + "step": 2037 + }, + { + "epoch": 0.61, + "grad_norm": 16.74660873413086, + "learning_rate": 1.5916608198857374e-05, + "loss": 2.1669, + "step": 2038 + }, + { + "epoch": 0.61, + "grad_norm": 18.83763885498047, + "learning_rate": 1.5914603588252984e-05, + "loss": 2.0639, + "step": 2039 + }, + { + "epoch": 0.61, + "grad_norm": 12.473159790039062, + "learning_rate": 1.591259897764859e-05, + "loss": 2.0599, + "step": 2040 + }, + { + "epoch": 0.61, + "eval_loss": 0.35515907406806946, + "eval_runtime": 43.2723, + "eval_samples_per_second": 34.179, + "eval_steps_per_second": 34.179, + "step": 2040 + }, + { + "epoch": 0.61, + "grad_norm": 34.54806900024414, + "learning_rate": 1.5910594367044204e-05, + "loss": 2.8225, + "step": 2041 + }, + { + "epoch": 0.61, + "grad_norm": 11.377670288085938, + "learning_rate": 1.5908589756439814e-05, + "loss": 1.3387, + "step": 2042 + }, + { + "epoch": 0.61, + "grad_norm": 21.465269088745117, + "learning_rate": 1.590658514583542e-05, + "loss": 3.1448, + "step": 2043 + }, + { + "epoch": 0.61, + "grad_norm": 13.400733947753906, + "learning_rate": 1.5904580535231035e-05, + "loss": 1.9627, + "step": 2044 + }, + { + "epoch": 0.61, + "grad_norm": 8.886704444885254, + "learning_rate": 1.590257592462664e-05, + "loss": 1.8101, + "step": 2045 + }, + { + "epoch": 0.62, + "grad_norm": 17.08548355102539, + "learning_rate": 1.590057131402225e-05, + "loss": 2.1456, + "step": 2046 + }, + { + "epoch": 0.62, + "grad_norm": 16.1516170501709, + "learning_rate": 1.589856670341786e-05, + "loss": 2.642, + "step": 2047 + }, + { + "epoch": 0.62, + "grad_norm": 16.168611526489258, + "learning_rate": 1.589656209281347e-05, + "loss": 2.2961, + "step": 2048 + }, + { + "epoch": 0.62, + "grad_norm": 17.722412109375, + "learning_rate": 1.589455748220908e-05, + "loss": 2.2299, + "step": 2049 + }, + { + "epoch": 0.62, + "grad_norm": 17.987937927246094, + "learning_rate": 1.589255287160469e-05, + "loss": 2.1003, + "step": 2050 + }, + { + "epoch": 0.62, + "grad_norm": 13.64321517944336, + "learning_rate": 1.5890548261000302e-05, + "loss": 2.5651, + "step": 2051 + }, + { + "epoch": 0.62, + "grad_norm": 10.08339786529541, + "learning_rate": 1.5888543650395912e-05, + "loss": 2.679, + "step": 2052 + }, + { + "epoch": 0.62, + "grad_norm": 19.540874481201172, + "learning_rate": 1.5886539039791522e-05, + "loss": 1.9754, + "step": 2053 + }, + { + "epoch": 0.62, + "grad_norm": 11.823172569274902, + "learning_rate": 1.5884534429187132e-05, + "loss": 1.5079, + "step": 2054 + }, + { + "epoch": 0.62, + "grad_norm": 14.114450454711914, + "learning_rate": 1.5882529818582742e-05, + "loss": 2.3742, + "step": 2055 + }, + { + "epoch": 0.62, + "grad_norm": 9.858566284179688, + "learning_rate": 1.5880525207978352e-05, + "loss": 3.4989, + "step": 2056 + }, + { + "epoch": 0.62, + "grad_norm": 14.25406551361084, + "learning_rate": 1.5878520597373962e-05, + "loss": 3.3511, + "step": 2057 + }, + { + "epoch": 0.62, + "grad_norm": 16.247241973876953, + "learning_rate": 1.5876515986769572e-05, + "loss": 2.1215, + "step": 2058 + }, + { + "epoch": 0.62, + "grad_norm": 13.218579292297363, + "learning_rate": 1.587451137616518e-05, + "loss": 2.4852, + "step": 2059 + }, + { + "epoch": 0.62, + "grad_norm": 13.075812339782715, + "learning_rate": 1.5872506765560793e-05, + "loss": 2.3496, + "step": 2060 + }, + { + "epoch": 0.62, + "grad_norm": 26.715604782104492, + "learning_rate": 1.5870502154956403e-05, + "loss": 2.1378, + "step": 2061 + }, + { + "epoch": 0.62, + "grad_norm": 20.732059478759766, + "learning_rate": 1.586849754435201e-05, + "loss": 2.008, + "step": 2062 + }, + { + "epoch": 0.62, + "grad_norm": 18.93284797668457, + "learning_rate": 1.5866492933747623e-05, + "loss": 3.4783, + "step": 2063 + }, + { + "epoch": 0.62, + "grad_norm": 9.962442398071289, + "learning_rate": 1.586448832314323e-05, + "loss": 2.3671, + "step": 2064 + }, + { + "epoch": 0.62, + "grad_norm": 9.932574272155762, + "learning_rate": 1.586248371253884e-05, + "loss": 1.7524, + "step": 2065 + }, + { + "epoch": 0.62, + "grad_norm": 16.17888069152832, + "learning_rate": 1.586047910193445e-05, + "loss": 1.6364, + "step": 2066 + }, + { + "epoch": 0.62, + "grad_norm": 24.62932014465332, + "learning_rate": 1.585847449133006e-05, + "loss": 1.8643, + "step": 2067 + }, + { + "epoch": 0.62, + "grad_norm": 17.033721923828125, + "learning_rate": 1.585646988072567e-05, + "loss": 2.2953, + "step": 2068 + }, + { + "epoch": 0.62, + "grad_norm": 24.557415008544922, + "learning_rate": 1.585446527012128e-05, + "loss": 3.0784, + "step": 2069 + }, + { + "epoch": 0.62, + "grad_norm": 27.415164947509766, + "learning_rate": 1.585246065951689e-05, + "loss": 2.3325, + "step": 2070 + }, + { + "epoch": 0.62, + "grad_norm": 18.90687370300293, + "learning_rate": 1.58504560489125e-05, + "loss": 2.4899, + "step": 2071 + }, + { + "epoch": 0.62, + "grad_norm": 12.064643859863281, + "learning_rate": 1.584845143830811e-05, + "loss": 2.6493, + "step": 2072 + }, + { + "epoch": 0.62, + "grad_norm": 15.442275047302246, + "learning_rate": 1.584644682770372e-05, + "loss": 1.9658, + "step": 2073 + }, + { + "epoch": 0.62, + "grad_norm": 14.710504531860352, + "learning_rate": 1.584444221709933e-05, + "loss": 2.2267, + "step": 2074 + }, + { + "epoch": 0.62, + "grad_norm": 9.406432151794434, + "learning_rate": 1.584243760649494e-05, + "loss": 1.4927, + "step": 2075 + }, + { + "epoch": 0.62, + "grad_norm": 14.804086685180664, + "learning_rate": 1.584043299589055e-05, + "loss": 1.8073, + "step": 2076 + }, + { + "epoch": 0.62, + "grad_norm": 15.176630973815918, + "learning_rate": 1.583842838528616e-05, + "loss": 1.8248, + "step": 2077 + }, + { + "epoch": 0.62, + "grad_norm": 20.446643829345703, + "learning_rate": 1.5836423774681767e-05, + "loss": 2.3386, + "step": 2078 + }, + { + "epoch": 0.63, + "grad_norm": 20.13260269165039, + "learning_rate": 1.583441916407738e-05, + "loss": 2.7811, + "step": 2079 + }, + { + "epoch": 0.63, + "grad_norm": 18.64963150024414, + "learning_rate": 1.583241455347299e-05, + "loss": 2.9017, + "step": 2080 + }, + { + "epoch": 0.63, + "grad_norm": 17.931795120239258, + "learning_rate": 1.5830409942868597e-05, + "loss": 2.3632, + "step": 2081 + }, + { + "epoch": 0.63, + "grad_norm": 19.928274154663086, + "learning_rate": 1.582840533226421e-05, + "loss": 1.7388, + "step": 2082 + }, + { + "epoch": 0.63, + "grad_norm": 11.930707931518555, + "learning_rate": 1.5826400721659818e-05, + "loss": 1.5894, + "step": 2083 + }, + { + "epoch": 0.63, + "grad_norm": 24.95637321472168, + "learning_rate": 1.5824396111055428e-05, + "loss": 3.1926, + "step": 2084 + }, + { + "epoch": 0.63, + "grad_norm": 14.697010040283203, + "learning_rate": 1.582239150045104e-05, + "loss": 3.0097, + "step": 2085 + }, + { + "epoch": 0.63, + "grad_norm": 9.35490894317627, + "learning_rate": 1.5820386889846648e-05, + "loss": 1.5664, + "step": 2086 + }, + { + "epoch": 0.63, + "grad_norm": 21.47760581970215, + "learning_rate": 1.5818382279242258e-05, + "loss": 2.8386, + "step": 2087 + }, + { + "epoch": 0.63, + "grad_norm": 16.634397506713867, + "learning_rate": 1.5816377668637868e-05, + "loss": 2.8702, + "step": 2088 + }, + { + "epoch": 0.63, + "grad_norm": 12.756913185119629, + "learning_rate": 1.5814373058033478e-05, + "loss": 2.456, + "step": 2089 + }, + { + "epoch": 0.63, + "grad_norm": 13.137290000915527, + "learning_rate": 1.5812368447429088e-05, + "loss": 1.8993, + "step": 2090 + }, + { + "epoch": 0.63, + "grad_norm": 23.007200241088867, + "learning_rate": 1.58103638368247e-05, + "loss": 2.7142, + "step": 2091 + }, + { + "epoch": 0.63, + "grad_norm": 10.685704231262207, + "learning_rate": 1.580835922622031e-05, + "loss": 1.872, + "step": 2092 + }, + { + "epoch": 0.63, + "grad_norm": 34.11006164550781, + "learning_rate": 1.580635461561592e-05, + "loss": 2.8411, + "step": 2093 + }, + { + "epoch": 0.63, + "grad_norm": 15.17668628692627, + "learning_rate": 1.580435000501153e-05, + "loss": 2.318, + "step": 2094 + }, + { + "epoch": 0.63, + "grad_norm": 10.943110466003418, + "learning_rate": 1.580234539440714e-05, + "loss": 3.1694, + "step": 2095 + }, + { + "epoch": 0.63, + "grad_norm": 17.957090377807617, + "learning_rate": 1.580034078380275e-05, + "loss": 2.8144, + "step": 2096 + }, + { + "epoch": 0.63, + "grad_norm": 21.06376075744629, + "learning_rate": 1.5798336173198355e-05, + "loss": 2.4499, + "step": 2097 + }, + { + "epoch": 0.63, + "grad_norm": 17.400442123413086, + "learning_rate": 1.579633156259397e-05, + "loss": 1.1497, + "step": 2098 + }, + { + "epoch": 0.63, + "grad_norm": 14.588909149169922, + "learning_rate": 1.579432695198958e-05, + "loss": 3.2946, + "step": 2099 + }, + { + "epoch": 0.63, + "grad_norm": 13.294282913208008, + "learning_rate": 1.5792322341385186e-05, + "loss": 1.6623, + "step": 2100 + }, + { + "epoch": 0.63, + "grad_norm": 23.731582641601562, + "learning_rate": 1.57903177307808e-05, + "loss": 3.3506, + "step": 2101 + }, + { + "epoch": 0.63, + "grad_norm": 15.64577579498291, + "learning_rate": 1.5788313120176406e-05, + "loss": 2.6303, + "step": 2102 + }, + { + "epoch": 0.63, + "grad_norm": 19.747045516967773, + "learning_rate": 1.5786308509572016e-05, + "loss": 2.5166, + "step": 2103 + }, + { + "epoch": 0.63, + "grad_norm": 42.6046142578125, + "learning_rate": 1.578430389896763e-05, + "loss": 2.8453, + "step": 2104 + }, + { + "epoch": 0.63, + "grad_norm": 8.715738296508789, + "learning_rate": 1.5782299288363236e-05, + "loss": 1.9525, + "step": 2105 + }, + { + "epoch": 0.63, + "grad_norm": 31.00543975830078, + "learning_rate": 1.5780294677758846e-05, + "loss": 3.0852, + "step": 2106 + }, + { + "epoch": 0.63, + "grad_norm": 10.329021453857422, + "learning_rate": 1.5778290067154456e-05, + "loss": 2.1254, + "step": 2107 + }, + { + "epoch": 0.63, + "grad_norm": 12.255505561828613, + "learning_rate": 1.5776285456550066e-05, + "loss": 2.1254, + "step": 2108 + }, + { + "epoch": 0.63, + "grad_norm": 13.093355178833008, + "learning_rate": 1.5774280845945676e-05, + "loss": 2.1318, + "step": 2109 + }, + { + "epoch": 0.63, + "grad_norm": 34.728302001953125, + "learning_rate": 1.5772276235341287e-05, + "loss": 3.1983, + "step": 2110 + }, + { + "epoch": 0.63, + "grad_norm": 13.47143840789795, + "learning_rate": 1.5770271624736897e-05, + "loss": 2.0851, + "step": 2111 + }, + { + "epoch": 0.63, + "grad_norm": 13.41025161743164, + "learning_rate": 1.5768267014132507e-05, + "loss": 2.8133, + "step": 2112 + }, + { + "epoch": 0.64, + "grad_norm": 15.229413986206055, + "learning_rate": 1.5766262403528117e-05, + "loss": 2.8769, + "step": 2113 + }, + { + "epoch": 0.64, + "grad_norm": 29.792980194091797, + "learning_rate": 1.5764257792923724e-05, + "loss": 2.6786, + "step": 2114 + }, + { + "epoch": 0.64, + "grad_norm": 12.121004104614258, + "learning_rate": 1.5762253182319337e-05, + "loss": 2.3366, + "step": 2115 + }, + { + "epoch": 0.64, + "grad_norm": 17.383033752441406, + "learning_rate": 1.5760248571714947e-05, + "loss": 2.2895, + "step": 2116 + }, + { + "epoch": 0.64, + "grad_norm": 28.382801055908203, + "learning_rate": 1.5758243961110554e-05, + "loss": 2.5891, + "step": 2117 + }, + { + "epoch": 0.64, + "grad_norm": 14.136621475219727, + "learning_rate": 1.5756239350506167e-05, + "loss": 2.4559, + "step": 2118 + }, + { + "epoch": 0.64, + "grad_norm": 12.245827674865723, + "learning_rate": 1.5754234739901774e-05, + "loss": 1.3721, + "step": 2119 + }, + { + "epoch": 0.64, + "grad_norm": 10.706925392150879, + "learning_rate": 1.5752230129297384e-05, + "loss": 2.2164, + "step": 2120 + }, + { + "epoch": 0.64, + "grad_norm": 16.694616317749023, + "learning_rate": 1.5750225518692994e-05, + "loss": 3.428, + "step": 2121 + }, + { + "epoch": 0.64, + "grad_norm": 33.90262985229492, + "learning_rate": 1.5748220908088604e-05, + "loss": 3.3125, + "step": 2122 + }, + { + "epoch": 0.64, + "grad_norm": 12.597078323364258, + "learning_rate": 1.5746216297484214e-05, + "loss": 1.9381, + "step": 2123 + }, + { + "epoch": 0.64, + "grad_norm": 13.725189208984375, + "learning_rate": 1.5744211686879824e-05, + "loss": 1.9921, + "step": 2124 + }, + { + "epoch": 0.64, + "grad_norm": 13.38775634765625, + "learning_rate": 1.5742207076275434e-05, + "loss": 2.3561, + "step": 2125 + }, + { + "epoch": 0.64, + "grad_norm": 15.606164932250977, + "learning_rate": 1.5740202465671045e-05, + "loss": 3.1076, + "step": 2126 + }, + { + "epoch": 0.64, + "grad_norm": 17.07231330871582, + "learning_rate": 1.5738197855066655e-05, + "loss": 2.0324, + "step": 2127 + }, + { + "epoch": 0.64, + "grad_norm": 18.28791618347168, + "learning_rate": 1.5736193244462265e-05, + "loss": 2.8287, + "step": 2128 + }, + { + "epoch": 0.64, + "grad_norm": 20.407081604003906, + "learning_rate": 1.5734188633857875e-05, + "loss": 2.3472, + "step": 2129 + }, + { + "epoch": 0.64, + "grad_norm": 12.980356216430664, + "learning_rate": 1.5732184023253485e-05, + "loss": 3.1184, + "step": 2130 + }, + { + "epoch": 0.64, + "grad_norm": 19.251155853271484, + "learning_rate": 1.5730179412649095e-05, + "loss": 2.2474, + "step": 2131 + }, + { + "epoch": 0.64, + "grad_norm": 8.661418914794922, + "learning_rate": 1.5728174802044705e-05, + "loss": 1.4024, + "step": 2132 + }, + { + "epoch": 0.64, + "grad_norm": 24.52813148498535, + "learning_rate": 1.5726170191440312e-05, + "loss": 2.4654, + "step": 2133 + }, + { + "epoch": 0.64, + "grad_norm": 12.824371337890625, + "learning_rate": 1.5724165580835925e-05, + "loss": 1.5831, + "step": 2134 + }, + { + "epoch": 0.64, + "grad_norm": 11.373884201049805, + "learning_rate": 1.5722160970231535e-05, + "loss": 2.043, + "step": 2135 + }, + { + "epoch": 0.64, + "grad_norm": 24.369014739990234, + "learning_rate": 1.5720156359627142e-05, + "loss": 2.4182, + "step": 2136 + }, + { + "epoch": 0.64, + "grad_norm": 12.161373138427734, + "learning_rate": 1.5718151749022755e-05, + "loss": 1.8717, + "step": 2137 + }, + { + "epoch": 0.64, + "grad_norm": 22.5483341217041, + "learning_rate": 1.5716147138418362e-05, + "loss": 1.8705, + "step": 2138 + }, + { + "epoch": 0.64, + "grad_norm": 16.376750946044922, + "learning_rate": 1.5714142527813972e-05, + "loss": 2.359, + "step": 2139 + }, + { + "epoch": 0.64, + "grad_norm": 12.507425308227539, + "learning_rate": 1.5712137917209582e-05, + "loss": 1.8627, + "step": 2140 + }, + { + "epoch": 0.64, + "grad_norm": 12.094954490661621, + "learning_rate": 1.5710133306605192e-05, + "loss": 1.5144, + "step": 2141 + }, + { + "epoch": 0.64, + "grad_norm": 12.721661567687988, + "learning_rate": 1.5708128696000802e-05, + "loss": 2.4135, + "step": 2142 + }, + { + "epoch": 0.64, + "grad_norm": 16.270231246948242, + "learning_rate": 1.5706124085396413e-05, + "loss": 1.5671, + "step": 2143 + }, + { + "epoch": 0.64, + "grad_norm": 22.51713752746582, + "learning_rate": 1.5704119474792023e-05, + "loss": 2.8953, + "step": 2144 + }, + { + "epoch": 0.64, + "grad_norm": 14.330571174621582, + "learning_rate": 1.5702114864187633e-05, + "loss": 1.948, + "step": 2145 + }, + { + "epoch": 0.65, + "grad_norm": 16.00999641418457, + "learning_rate": 1.5700110253583243e-05, + "loss": 2.0595, + "step": 2146 + }, + { + "epoch": 0.65, + "grad_norm": 23.264543533325195, + "learning_rate": 1.5698105642978853e-05, + "loss": 2.7985, + "step": 2147 + }, + { + "epoch": 0.65, + "grad_norm": 12.527949333190918, + "learning_rate": 1.5696101032374463e-05, + "loss": 2.2662, + "step": 2148 + }, + { + "epoch": 0.65, + "grad_norm": 23.13153076171875, + "learning_rate": 1.5694096421770073e-05, + "loss": 3.2819, + "step": 2149 + }, + { + "epoch": 0.65, + "grad_norm": 11.828460693359375, + "learning_rate": 1.5692091811165683e-05, + "loss": 1.9632, + "step": 2150 + }, + { + "epoch": 0.65, + "grad_norm": 30.631380081176758, + "learning_rate": 1.5690087200561293e-05, + "loss": 2.5282, + "step": 2151 + }, + { + "epoch": 0.65, + "grad_norm": 16.613862991333008, + "learning_rate": 1.56880825899569e-05, + "loss": 2.6759, + "step": 2152 + }, + { + "epoch": 0.65, + "grad_norm": 36.72877502441406, + "learning_rate": 1.5686077979352513e-05, + "loss": 2.0691, + "step": 2153 + }, + { + "epoch": 0.65, + "grad_norm": 11.451091766357422, + "learning_rate": 1.5684073368748123e-05, + "loss": 1.2804, + "step": 2154 + }, + { + "epoch": 0.65, + "grad_norm": 13.62600326538086, + "learning_rate": 1.568206875814373e-05, + "loss": 1.9118, + "step": 2155 + }, + { + "epoch": 0.65, + "grad_norm": 21.245529174804688, + "learning_rate": 1.5680064147539344e-05, + "loss": 2.7814, + "step": 2156 + }, + { + "epoch": 0.65, + "grad_norm": 22.127309799194336, + "learning_rate": 1.567805953693495e-05, + "loss": 1.8782, + "step": 2157 + }, + { + "epoch": 0.65, + "grad_norm": 11.301019668579102, + "learning_rate": 1.567605492633056e-05, + "loss": 3.1518, + "step": 2158 + }, + { + "epoch": 0.65, + "grad_norm": 11.62056827545166, + "learning_rate": 1.5674050315726174e-05, + "loss": 2.3903, + "step": 2159 + }, + { + "epoch": 0.65, + "grad_norm": 15.686822891235352, + "learning_rate": 1.567204570512178e-05, + "loss": 2.4537, + "step": 2160 + }, + { + "epoch": 0.65, + "eval_loss": 0.3099038302898407, + "eval_runtime": 43.3953, + "eval_samples_per_second": 34.082, + "eval_steps_per_second": 34.082, + "step": 2160 + }, + { + "epoch": 0.65, + "grad_norm": 14.51310920715332, + "learning_rate": 1.567004109451739e-05, + "loss": 2.5808, + "step": 2161 + }, + { + "epoch": 0.65, + "grad_norm": 15.057507514953613, + "learning_rate": 1.5668036483913e-05, + "loss": 2.8596, + "step": 2162 + }, + { + "epoch": 0.65, + "grad_norm": 12.982969284057617, + "learning_rate": 1.566603187330861e-05, + "loss": 2.5042, + "step": 2163 + }, + { + "epoch": 0.65, + "grad_norm": 11.829024314880371, + "learning_rate": 1.566402726270422e-05, + "loss": 1.9732, + "step": 2164 + }, + { + "epoch": 0.65, + "grad_norm": 14.070128440856934, + "learning_rate": 1.566202265209983e-05, + "loss": 2.6257, + "step": 2165 + }, + { + "epoch": 0.65, + "grad_norm": 10.464433670043945, + "learning_rate": 1.566001804149544e-05, + "loss": 2.178, + "step": 2166 + }, + { + "epoch": 0.65, + "grad_norm": 11.11734390258789, + "learning_rate": 1.565801343089105e-05, + "loss": 1.5285, + "step": 2167 + }, + { + "epoch": 0.65, + "grad_norm": 14.957688331604004, + "learning_rate": 1.565600882028666e-05, + "loss": 2.1816, + "step": 2168 + }, + { + "epoch": 0.65, + "grad_norm": 16.353477478027344, + "learning_rate": 1.565400420968227e-05, + "loss": 2.2476, + "step": 2169 + }, + { + "epoch": 0.65, + "grad_norm": 9.818117141723633, + "learning_rate": 1.565199959907788e-05, + "loss": 2.1336, + "step": 2170 + }, + { + "epoch": 0.65, + "grad_norm": 20.093345642089844, + "learning_rate": 1.564999498847349e-05, + "loss": 2.6549, + "step": 2171 + }, + { + "epoch": 0.65, + "grad_norm": 18.352497100830078, + "learning_rate": 1.56479903778691e-05, + "loss": 2.6063, + "step": 2172 + }, + { + "epoch": 0.65, + "grad_norm": 26.143571853637695, + "learning_rate": 1.5645985767264712e-05, + "loss": 2.8258, + "step": 2173 + }, + { + "epoch": 0.65, + "grad_norm": 18.577131271362305, + "learning_rate": 1.564398115666032e-05, + "loss": 1.8137, + "step": 2174 + }, + { + "epoch": 0.65, + "grad_norm": 11.283987045288086, + "learning_rate": 1.5641976546055932e-05, + "loss": 1.9124, + "step": 2175 + }, + { + "epoch": 0.65, + "grad_norm": 19.12086296081543, + "learning_rate": 1.563997193545154e-05, + "loss": 2.3072, + "step": 2176 + }, + { + "epoch": 0.65, + "grad_norm": 11.322100639343262, + "learning_rate": 1.563796732484715e-05, + "loss": 1.8458, + "step": 2177 + }, + { + "epoch": 0.65, + "grad_norm": 16.309368133544922, + "learning_rate": 1.5635962714242762e-05, + "loss": 2.3407, + "step": 2178 + }, + { + "epoch": 0.66, + "grad_norm": 42.83634567260742, + "learning_rate": 1.563395810363837e-05, + "loss": 2.9269, + "step": 2179 + }, + { + "epoch": 0.66, + "grad_norm": 16.00682830810547, + "learning_rate": 1.563195349303398e-05, + "loss": 2.2228, + "step": 2180 + }, + { + "epoch": 0.66, + "grad_norm": 10.3877592086792, + "learning_rate": 1.562994888242959e-05, + "loss": 2.0959, + "step": 2181 + }, + { + "epoch": 0.66, + "grad_norm": 23.170366287231445, + "learning_rate": 1.56279442718252e-05, + "loss": 2.8749, + "step": 2182 + }, + { + "epoch": 0.66, + "grad_norm": 10.100824356079102, + "learning_rate": 1.562593966122081e-05, + "loss": 2.0848, + "step": 2183 + }, + { + "epoch": 0.66, + "grad_norm": 17.805927276611328, + "learning_rate": 1.562393505061642e-05, + "loss": 2.6061, + "step": 2184 + }, + { + "epoch": 0.66, + "grad_norm": 18.987218856811523, + "learning_rate": 1.562193044001203e-05, + "loss": 2.7442, + "step": 2185 + }, + { + "epoch": 0.66, + "grad_norm": 44.760196685791016, + "learning_rate": 1.561992582940764e-05, + "loss": 2.3028, + "step": 2186 + }, + { + "epoch": 0.66, + "grad_norm": 17.44770622253418, + "learning_rate": 1.561792121880325e-05, + "loss": 3.0411, + "step": 2187 + }, + { + "epoch": 0.66, + "grad_norm": 15.366558074951172, + "learning_rate": 1.5615916608198856e-05, + "loss": 2.4066, + "step": 2188 + }, + { + "epoch": 0.66, + "grad_norm": 14.17463493347168, + "learning_rate": 1.561391199759447e-05, + "loss": 1.7436, + "step": 2189 + }, + { + "epoch": 0.66, + "grad_norm": 11.214127540588379, + "learning_rate": 1.561190738699008e-05, + "loss": 1.9083, + "step": 2190 + }, + { + "epoch": 0.66, + "grad_norm": 14.602449417114258, + "learning_rate": 1.5609902776385686e-05, + "loss": 2.8559, + "step": 2191 + }, + { + "epoch": 0.66, + "grad_norm": 16.023332595825195, + "learning_rate": 1.56078981657813e-05, + "loss": 1.8949, + "step": 2192 + }, + { + "epoch": 0.66, + "grad_norm": 20.160932540893555, + "learning_rate": 1.5605893555176907e-05, + "loss": 2.0295, + "step": 2193 + }, + { + "epoch": 0.66, + "grad_norm": 12.389837265014648, + "learning_rate": 1.5603888944572517e-05, + "loss": 2.2384, + "step": 2194 + }, + { + "epoch": 0.66, + "grad_norm": 18.265491485595703, + "learning_rate": 1.5601884333968127e-05, + "loss": 2.4571, + "step": 2195 + }, + { + "epoch": 0.66, + "grad_norm": 14.70784854888916, + "learning_rate": 1.5599879723363737e-05, + "loss": 2.1679, + "step": 2196 + }, + { + "epoch": 0.66, + "grad_norm": 16.466394424438477, + "learning_rate": 1.5597875112759347e-05, + "loss": 1.9488, + "step": 2197 + }, + { + "epoch": 0.66, + "grad_norm": 21.658117294311523, + "learning_rate": 1.5595870502154957e-05, + "loss": 2.7079, + "step": 2198 + }, + { + "epoch": 0.66, + "grad_norm": 17.353038787841797, + "learning_rate": 1.5593865891550567e-05, + "loss": 2.9999, + "step": 2199 + }, + { + "epoch": 0.66, + "grad_norm": 15.819283485412598, + "learning_rate": 1.5591861280946177e-05, + "loss": 2.1287, + "step": 2200 + }, + { + "epoch": 0.66, + "grad_norm": 13.62539291381836, + "learning_rate": 1.5589856670341787e-05, + "loss": 1.9569, + "step": 2201 + }, + { + "epoch": 0.66, + "grad_norm": 18.662357330322266, + "learning_rate": 1.5587852059737397e-05, + "loss": 2.51, + "step": 2202 + }, + { + "epoch": 0.66, + "grad_norm": 26.726213455200195, + "learning_rate": 1.5585847449133007e-05, + "loss": 1.9874, + "step": 2203 + }, + { + "epoch": 0.66, + "grad_norm": 16.964014053344727, + "learning_rate": 1.5583842838528618e-05, + "loss": 3.0422, + "step": 2204 + }, + { + "epoch": 0.66, + "grad_norm": 24.55567169189453, + "learning_rate": 1.5581838227924228e-05, + "loss": 2.4687, + "step": 2205 + }, + { + "epoch": 0.66, + "grad_norm": 12.898763656616211, + "learning_rate": 1.5579833617319838e-05, + "loss": 1.8264, + "step": 2206 + }, + { + "epoch": 0.66, + "grad_norm": 14.676507949829102, + "learning_rate": 1.5577829006715444e-05, + "loss": 1.942, + "step": 2207 + }, + { + "epoch": 0.66, + "grad_norm": 21.591907501220703, + "learning_rate": 1.5575824396111058e-05, + "loss": 2.193, + "step": 2208 + }, + { + "epoch": 0.66, + "grad_norm": 22.116395950317383, + "learning_rate": 1.5573819785506668e-05, + "loss": 1.8098, + "step": 2209 + }, + { + "epoch": 0.66, + "grad_norm": 17.048828125, + "learning_rate": 1.5571815174902275e-05, + "loss": 2.4309, + "step": 2210 + }, + { + "epoch": 0.66, + "grad_norm": 17.50075912475586, + "learning_rate": 1.5569810564297888e-05, + "loss": 3.5984, + "step": 2211 + }, + { + "epoch": 0.67, + "grad_norm": 42.22193145751953, + "learning_rate": 1.5567805953693495e-05, + "loss": 2.6408, + "step": 2212 + }, + { + "epoch": 0.67, + "grad_norm": 16.618879318237305, + "learning_rate": 1.5565801343089105e-05, + "loss": 1.9168, + "step": 2213 + }, + { + "epoch": 0.67, + "grad_norm": 20.115299224853516, + "learning_rate": 1.5563796732484715e-05, + "loss": 2.5345, + "step": 2214 + }, + { + "epoch": 0.67, + "grad_norm": 9.575844764709473, + "learning_rate": 1.5561792121880325e-05, + "loss": 2.5817, + "step": 2215 + }, + { + "epoch": 0.67, + "grad_norm": 16.084333419799805, + "learning_rate": 1.5559787511275935e-05, + "loss": 2.8486, + "step": 2216 + }, + { + "epoch": 0.67, + "grad_norm": 9.678840637207031, + "learning_rate": 1.5557782900671545e-05, + "loss": 2.0502, + "step": 2217 + }, + { + "epoch": 0.67, + "grad_norm": 25.424049377441406, + "learning_rate": 1.5555778290067155e-05, + "loss": 2.3241, + "step": 2218 + }, + { + "epoch": 0.67, + "grad_norm": 14.452130317687988, + "learning_rate": 1.5553773679462765e-05, + "loss": 2.2107, + "step": 2219 + }, + { + "epoch": 0.67, + "grad_norm": 34.95116424560547, + "learning_rate": 1.5551769068858375e-05, + "loss": 2.9797, + "step": 2220 + }, + { + "epoch": 0.67, + "grad_norm": 10.273086547851562, + "learning_rate": 1.5549764458253986e-05, + "loss": 1.676, + "step": 2221 + }, + { + "epoch": 0.67, + "grad_norm": 16.252601623535156, + "learning_rate": 1.5547759847649596e-05, + "loss": 2.6206, + "step": 2222 + }, + { + "epoch": 0.67, + "grad_norm": 10.310708045959473, + "learning_rate": 1.5545755237045206e-05, + "loss": 2.1657, + "step": 2223 + }, + { + "epoch": 0.67, + "grad_norm": 11.262001037597656, + "learning_rate": 1.5543750626440816e-05, + "loss": 1.9435, + "step": 2224 + }, + { + "epoch": 0.67, + "grad_norm": 20.996740341186523, + "learning_rate": 1.5541746015836426e-05, + "loss": 2.7842, + "step": 2225 + }, + { + "epoch": 0.67, + "grad_norm": 25.680212020874023, + "learning_rate": 1.5539741405232033e-05, + "loss": 2.4817, + "step": 2226 + }, + { + "epoch": 0.67, + "grad_norm": 11.317098617553711, + "learning_rate": 1.5537736794627646e-05, + "loss": 1.2717, + "step": 2227 + }, + { + "epoch": 0.67, + "grad_norm": 14.966314315795898, + "learning_rate": 1.5535732184023256e-05, + "loss": 2.5357, + "step": 2228 + }, + { + "epoch": 0.67, + "grad_norm": 22.922651290893555, + "learning_rate": 1.5533727573418863e-05, + "loss": 3.0417, + "step": 2229 + }, + { + "epoch": 0.67, + "grad_norm": 21.57832908630371, + "learning_rate": 1.5531722962814476e-05, + "loss": 2.1712, + "step": 2230 + }, + { + "epoch": 0.67, + "grad_norm": 22.31909942626953, + "learning_rate": 1.5529718352210083e-05, + "loss": 2.7337, + "step": 2231 + }, + { + "epoch": 0.67, + "grad_norm": 10.842183113098145, + "learning_rate": 1.5527713741605693e-05, + "loss": 1.886, + "step": 2232 + }, + { + "epoch": 0.67, + "grad_norm": 17.99361228942871, + "learning_rate": 1.5525709131001307e-05, + "loss": 2.3479, + "step": 2233 + }, + { + "epoch": 0.67, + "grad_norm": 36.20960998535156, + "learning_rate": 1.5523704520396913e-05, + "loss": 2.8728, + "step": 2234 + }, + { + "epoch": 0.67, + "grad_norm": 14.748534202575684, + "learning_rate": 1.5521699909792523e-05, + "loss": 2.0896, + "step": 2235 + }, + { + "epoch": 0.67, + "grad_norm": 10.2194242477417, + "learning_rate": 1.5519695299188133e-05, + "loss": 2.0784, + "step": 2236 + }, + { + "epoch": 0.67, + "grad_norm": 19.986635208129883, + "learning_rate": 1.5517690688583744e-05, + "loss": 2.4334, + "step": 2237 + }, + { + "epoch": 0.67, + "grad_norm": 19.66067886352539, + "learning_rate": 1.5515686077979354e-05, + "loss": 2.684, + "step": 2238 + }, + { + "epoch": 0.67, + "grad_norm": 70.83124542236328, + "learning_rate": 1.5513681467374964e-05, + "loss": 2.5154, + "step": 2239 + }, + { + "epoch": 0.67, + "grad_norm": 23.612783432006836, + "learning_rate": 1.5511676856770574e-05, + "loss": 2.886, + "step": 2240 + }, + { + "epoch": 0.67, + "grad_norm": 15.822755813598633, + "learning_rate": 1.5509672246166184e-05, + "loss": 2.0681, + "step": 2241 + }, + { + "epoch": 0.67, + "grad_norm": 25.98380470275879, + "learning_rate": 1.5507667635561794e-05, + "loss": 2.8038, + "step": 2242 + }, + { + "epoch": 0.67, + "grad_norm": 27.250049591064453, + "learning_rate": 1.5505663024957404e-05, + "loss": 2.5481, + "step": 2243 + }, + { + "epoch": 0.67, + "grad_norm": 36.15570831298828, + "learning_rate": 1.5503658414353014e-05, + "loss": 2.1858, + "step": 2244 + }, + { + "epoch": 0.67, + "grad_norm": 10.935135841369629, + "learning_rate": 1.5501653803748624e-05, + "loss": 1.1376, + "step": 2245 + }, + { + "epoch": 0.68, + "grad_norm": 21.463272094726562, + "learning_rate": 1.5499649193144234e-05, + "loss": 2.5708, + "step": 2246 + }, + { + "epoch": 0.68, + "grad_norm": 12.168373107910156, + "learning_rate": 1.5497644582539844e-05, + "loss": 2.3351, + "step": 2247 + }, + { + "epoch": 0.68, + "grad_norm": 20.284778594970703, + "learning_rate": 1.549563997193545e-05, + "loss": 2.2741, + "step": 2248 + }, + { + "epoch": 0.68, + "grad_norm": 21.697620391845703, + "learning_rate": 1.5493635361331065e-05, + "loss": 3.3955, + "step": 2249 + }, + { + "epoch": 0.68, + "grad_norm": 15.422367095947266, + "learning_rate": 1.549163075072667e-05, + "loss": 2.879, + "step": 2250 + }, + { + "epoch": 0.68, + "grad_norm": 17.34280776977539, + "learning_rate": 1.548962614012228e-05, + "loss": 2.5866, + "step": 2251 + }, + { + "epoch": 0.68, + "grad_norm": 17.975322723388672, + "learning_rate": 1.5487621529517895e-05, + "loss": 2.3053, + "step": 2252 + }, + { + "epoch": 0.68, + "grad_norm": 30.330408096313477, + "learning_rate": 1.54856169189135e-05, + "loss": 3.002, + "step": 2253 + }, + { + "epoch": 0.68, + "grad_norm": 17.363624572753906, + "learning_rate": 1.548361230830911e-05, + "loss": 2.323, + "step": 2254 + }, + { + "epoch": 0.68, + "grad_norm": 14.340827941894531, + "learning_rate": 1.548160769770472e-05, + "loss": 1.8183, + "step": 2255 + }, + { + "epoch": 0.68, + "grad_norm": 11.480280876159668, + "learning_rate": 1.5479603087100332e-05, + "loss": 1.9501, + "step": 2256 + }, + { + "epoch": 0.68, + "grad_norm": 10.279202461242676, + "learning_rate": 1.5477598476495942e-05, + "loss": 2.1804, + "step": 2257 + }, + { + "epoch": 0.68, + "grad_norm": 7.849579334259033, + "learning_rate": 1.5475593865891552e-05, + "loss": 2.1096, + "step": 2258 + }, + { + "epoch": 0.68, + "grad_norm": 15.246369361877441, + "learning_rate": 1.5473589255287162e-05, + "loss": 2.5827, + "step": 2259 + }, + { + "epoch": 0.68, + "grad_norm": 11.893373489379883, + "learning_rate": 1.5471584644682772e-05, + "loss": 1.4826, + "step": 2260 + }, + { + "epoch": 0.68, + "grad_norm": 22.876691818237305, + "learning_rate": 1.5469580034078382e-05, + "loss": 2.7584, + "step": 2261 + }, + { + "epoch": 0.68, + "grad_norm": 22.17548179626465, + "learning_rate": 1.546757542347399e-05, + "loss": 2.8086, + "step": 2262 + }, + { + "epoch": 0.68, + "grad_norm": 19.29637908935547, + "learning_rate": 1.5465570812869602e-05, + "loss": 2.4008, + "step": 2263 + }, + { + "epoch": 0.68, + "grad_norm": 18.765453338623047, + "learning_rate": 1.5463566202265212e-05, + "loss": 2.7496, + "step": 2264 + }, + { + "epoch": 0.68, + "grad_norm": 15.087578773498535, + "learning_rate": 1.5461561591660823e-05, + "loss": 2.6039, + "step": 2265 + }, + { + "epoch": 0.68, + "grad_norm": 9.94806957244873, + "learning_rate": 1.5459556981056433e-05, + "loss": 1.0356, + "step": 2266 + }, + { + "epoch": 0.68, + "grad_norm": 20.82723617553711, + "learning_rate": 1.545755237045204e-05, + "loss": 2.3768, + "step": 2267 + }, + { + "epoch": 0.68, + "grad_norm": 11.183643341064453, + "learning_rate": 1.5455547759847653e-05, + "loss": 2.0118, + "step": 2268 + }, + { + "epoch": 0.68, + "grad_norm": 11.922686576843262, + "learning_rate": 1.545354314924326e-05, + "loss": 2.3468, + "step": 2269 + }, + { + "epoch": 0.68, + "grad_norm": 10.599013328552246, + "learning_rate": 1.545153853863887e-05, + "loss": 1.6547, + "step": 2270 + }, + { + "epoch": 0.68, + "grad_norm": 9.613250732421875, + "learning_rate": 1.5449533928034483e-05, + "loss": 2.3148, + "step": 2271 + }, + { + "epoch": 0.68, + "grad_norm": 20.166654586791992, + "learning_rate": 1.544752931743009e-05, + "loss": 2.3727, + "step": 2272 + }, + { + "epoch": 0.68, + "grad_norm": 14.838998794555664, + "learning_rate": 1.54455247068257e-05, + "loss": 1.8119, + "step": 2273 + }, + { + "epoch": 0.68, + "grad_norm": 15.088861465454102, + "learning_rate": 1.544352009622131e-05, + "loss": 1.2957, + "step": 2274 + }, + { + "epoch": 0.68, + "grad_norm": 16.902069091796875, + "learning_rate": 1.544151548561692e-05, + "loss": 2.637, + "step": 2275 + }, + { + "epoch": 0.68, + "grad_norm": 39.02754592895508, + "learning_rate": 1.543951087501253e-05, + "loss": 2.1298, + "step": 2276 + }, + { + "epoch": 0.68, + "grad_norm": 13.487964630126953, + "learning_rate": 1.543750626440814e-05, + "loss": 2.4923, + "step": 2277 + }, + { + "epoch": 0.68, + "grad_norm": 28.00465965270996, + "learning_rate": 1.543550165380375e-05, + "loss": 2.9849, + "step": 2278 + }, + { + "epoch": 0.69, + "grad_norm": 12.890748977661133, + "learning_rate": 1.543349704319936e-05, + "loss": 2.6115, + "step": 2279 + }, + { + "epoch": 0.69, + "grad_norm": 14.795134544372559, + "learning_rate": 1.543149243259497e-05, + "loss": 1.9501, + "step": 2280 + }, + { + "epoch": 0.69, + "eval_loss": 0.27775686979293823, + "eval_runtime": 43.6573, + "eval_samples_per_second": 33.878, + "eval_steps_per_second": 33.878, + "step": 2280 + }, + { + "epoch": 0.69, + "grad_norm": 17.335163116455078, + "learning_rate": 1.5429487821990577e-05, + "loss": 2.4545, + "step": 2281 + }, + { + "epoch": 0.69, + "grad_norm": 19.485933303833008, + "learning_rate": 1.542748321138619e-05, + "loss": 1.9536, + "step": 2282 + }, + { + "epoch": 0.69, + "grad_norm": 17.225101470947266, + "learning_rate": 1.54254786007818e-05, + "loss": 1.7893, + "step": 2283 + }, + { + "epoch": 0.69, + "grad_norm": 12.618383407592773, + "learning_rate": 1.5423473990177407e-05, + "loss": 2.0978, + "step": 2284 + }, + { + "epoch": 0.69, + "grad_norm": 21.85410499572754, + "learning_rate": 1.542146937957302e-05, + "loss": 2.1725, + "step": 2285 + }, + { + "epoch": 0.69, + "grad_norm": 13.22624397277832, + "learning_rate": 1.5419464768968628e-05, + "loss": 1.9183, + "step": 2286 + }, + { + "epoch": 0.69, + "grad_norm": 14.312644004821777, + "learning_rate": 1.5417460158364238e-05, + "loss": 1.764, + "step": 2287 + }, + { + "epoch": 0.69, + "grad_norm": 17.018287658691406, + "learning_rate": 1.541545554775985e-05, + "loss": 1.6649, + "step": 2288 + }, + { + "epoch": 0.69, + "grad_norm": 19.239450454711914, + "learning_rate": 1.5413450937155458e-05, + "loss": 2.6163, + "step": 2289 + }, + { + "epoch": 0.69, + "grad_norm": 8.873007774353027, + "learning_rate": 1.5411446326551068e-05, + "loss": 2.4475, + "step": 2290 + }, + { + "epoch": 0.69, + "grad_norm": 34.43484878540039, + "learning_rate": 1.5409441715946678e-05, + "loss": 3.171, + "step": 2291 + }, + { + "epoch": 0.69, + "grad_norm": 13.385025024414062, + "learning_rate": 1.5407437105342288e-05, + "loss": 2.4151, + "step": 2292 + }, + { + "epoch": 0.69, + "grad_norm": 23.839799880981445, + "learning_rate": 1.5405432494737898e-05, + "loss": 3.4928, + "step": 2293 + }, + { + "epoch": 0.69, + "grad_norm": 13.791393280029297, + "learning_rate": 1.5403427884133508e-05, + "loss": 1.5945, + "step": 2294 + }, + { + "epoch": 0.69, + "grad_norm": 15.341072082519531, + "learning_rate": 1.5401423273529118e-05, + "loss": 1.6761, + "step": 2295 + }, + { + "epoch": 0.69, + "grad_norm": 16.246986389160156, + "learning_rate": 1.539941866292473e-05, + "loss": 1.9111, + "step": 2296 + }, + { + "epoch": 0.69, + "grad_norm": 21.339174270629883, + "learning_rate": 1.539741405232034e-05, + "loss": 2.3112, + "step": 2297 + }, + { + "epoch": 0.69, + "grad_norm": 19.16057777404785, + "learning_rate": 1.539540944171595e-05, + "loss": 1.8671, + "step": 2298 + }, + { + "epoch": 0.69, + "grad_norm": 10.452445983886719, + "learning_rate": 1.539340483111156e-05, + "loss": 1.1408, + "step": 2299 + }, + { + "epoch": 0.69, + "grad_norm": 12.354015350341797, + "learning_rate": 1.5391400220507165e-05, + "loss": 2.4143, + "step": 2300 + }, + { + "epoch": 0.69, + "grad_norm": 12.557357788085938, + "learning_rate": 1.538939560990278e-05, + "loss": 1.8995, + "step": 2301 + }, + { + "epoch": 0.69, + "grad_norm": 16.86603546142578, + "learning_rate": 1.538739099929839e-05, + "loss": 2.0175, + "step": 2302 + }, + { + "epoch": 0.69, + "grad_norm": 22.319530487060547, + "learning_rate": 1.5385386388693996e-05, + "loss": 3.6267, + "step": 2303 + }, + { + "epoch": 0.69, + "grad_norm": 21.675506591796875, + "learning_rate": 1.538338177808961e-05, + "loss": 2.8276, + "step": 2304 + }, + { + "epoch": 0.69, + "grad_norm": 25.780988693237305, + "learning_rate": 1.5381377167485216e-05, + "loss": 2.1939, + "step": 2305 + }, + { + "epoch": 0.69, + "grad_norm": 21.725067138671875, + "learning_rate": 1.5379372556880826e-05, + "loss": 2.4273, + "step": 2306 + }, + { + "epoch": 0.69, + "grad_norm": 13.079160690307617, + "learning_rate": 1.537736794627644e-05, + "loss": 2.748, + "step": 2307 + }, + { + "epoch": 0.69, + "grad_norm": 70.98606872558594, + "learning_rate": 1.5375363335672046e-05, + "loss": 3.8798, + "step": 2308 + }, + { + "epoch": 0.69, + "grad_norm": 30.048757553100586, + "learning_rate": 1.5373358725067656e-05, + "loss": 1.797, + "step": 2309 + }, + { + "epoch": 0.69, + "grad_norm": 19.938982009887695, + "learning_rate": 1.5371354114463266e-05, + "loss": 2.9236, + "step": 2310 + }, + { + "epoch": 0.69, + "grad_norm": 19.096269607543945, + "learning_rate": 1.5369349503858876e-05, + "loss": 3.0036, + "step": 2311 + }, + { + "epoch": 0.7, + "grad_norm": 16.5783634185791, + "learning_rate": 1.5367344893254486e-05, + "loss": 2.3479, + "step": 2312 + }, + { + "epoch": 0.7, + "grad_norm": 11.655875205993652, + "learning_rate": 1.5365340282650096e-05, + "loss": 2.0129, + "step": 2313 + }, + { + "epoch": 0.7, + "grad_norm": 12.814486503601074, + "learning_rate": 1.5363335672045706e-05, + "loss": 2.283, + "step": 2314 + }, + { + "epoch": 0.7, + "grad_norm": 12.23183822631836, + "learning_rate": 1.5361331061441317e-05, + "loss": 2.0969, + "step": 2315 + }, + { + "epoch": 0.7, + "grad_norm": 23.845361709594727, + "learning_rate": 1.5359326450836927e-05, + "loss": 2.6363, + "step": 2316 + }, + { + "epoch": 0.7, + "grad_norm": 22.525117874145508, + "learning_rate": 1.5357321840232537e-05, + "loss": 2.7907, + "step": 2317 + }, + { + "epoch": 0.7, + "grad_norm": 35.314666748046875, + "learning_rate": 1.5355317229628147e-05, + "loss": 2.434, + "step": 2318 + }, + { + "epoch": 0.7, + "grad_norm": 10.943061828613281, + "learning_rate": 1.5353312619023757e-05, + "loss": 1.8863, + "step": 2319 + }, + { + "epoch": 0.7, + "grad_norm": 16.291006088256836, + "learning_rate": 1.5351308008419367e-05, + "loss": 1.8541, + "step": 2320 + }, + { + "epoch": 0.7, + "grad_norm": 12.258851051330566, + "learning_rate": 1.5349303397814977e-05, + "loss": 2.3217, + "step": 2321 + }, + { + "epoch": 0.7, + "grad_norm": 12.439240455627441, + "learning_rate": 1.5347298787210584e-05, + "loss": 2.1214, + "step": 2322 + }, + { + "epoch": 0.7, + "grad_norm": 24.879018783569336, + "learning_rate": 1.5345294176606197e-05, + "loss": 2.4778, + "step": 2323 + }, + { + "epoch": 0.7, + "grad_norm": 14.103094100952148, + "learning_rate": 1.5343289566001804e-05, + "loss": 3.2186, + "step": 2324 + }, + { + "epoch": 0.7, + "grad_norm": 16.0571346282959, + "learning_rate": 1.5341284955397414e-05, + "loss": 1.7382, + "step": 2325 + }, + { + "epoch": 0.7, + "grad_norm": 14.569293975830078, + "learning_rate": 1.5339280344793027e-05, + "loss": 2.6728, + "step": 2326 + }, + { + "epoch": 0.7, + "grad_norm": 19.245534896850586, + "learning_rate": 1.5337275734188634e-05, + "loss": 2.1571, + "step": 2327 + }, + { + "epoch": 0.7, + "grad_norm": 35.83250427246094, + "learning_rate": 1.5335271123584244e-05, + "loss": 3.574, + "step": 2328 + }, + { + "epoch": 0.7, + "grad_norm": 22.268566131591797, + "learning_rate": 1.5333266512979854e-05, + "loss": 2.8414, + "step": 2329 + }, + { + "epoch": 0.7, + "grad_norm": 13.335097312927246, + "learning_rate": 1.5331261902375464e-05, + "loss": 2.2446, + "step": 2330 + }, + { + "epoch": 0.7, + "grad_norm": 21.128978729248047, + "learning_rate": 1.5329257291771075e-05, + "loss": 2.1243, + "step": 2331 + }, + { + "epoch": 0.7, + "grad_norm": 15.031562805175781, + "learning_rate": 1.5327252681166685e-05, + "loss": 2.1856, + "step": 2332 + }, + { + "epoch": 0.7, + "grad_norm": 24.040231704711914, + "learning_rate": 1.5325248070562295e-05, + "loss": 2.7496, + "step": 2333 + }, + { + "epoch": 0.7, + "grad_norm": 21.210121154785156, + "learning_rate": 1.5323243459957905e-05, + "loss": 2.1727, + "step": 2334 + }, + { + "epoch": 0.7, + "grad_norm": 14.999757766723633, + "learning_rate": 1.5321238849353515e-05, + "loss": 1.8486, + "step": 2335 + }, + { + "epoch": 0.7, + "grad_norm": 21.83721160888672, + "learning_rate": 1.5319234238749125e-05, + "loss": 2.4315, + "step": 2336 + }, + { + "epoch": 0.7, + "grad_norm": 10.136129379272461, + "learning_rate": 1.5317229628144735e-05, + "loss": 1.8972, + "step": 2337 + }, + { + "epoch": 0.7, + "grad_norm": 14.037278175354004, + "learning_rate": 1.5315225017540345e-05, + "loss": 1.9107, + "step": 2338 + }, + { + "epoch": 0.7, + "grad_norm": 12.793769836425781, + "learning_rate": 1.5313220406935955e-05, + "loss": 1.8938, + "step": 2339 + }, + { + "epoch": 0.7, + "grad_norm": 46.66535568237305, + "learning_rate": 1.5311215796331565e-05, + "loss": 3.1435, + "step": 2340 + }, + { + "epoch": 0.7, + "grad_norm": 24.93342399597168, + "learning_rate": 1.5309211185727172e-05, + "loss": 2.3244, + "step": 2341 + }, + { + "epoch": 0.7, + "grad_norm": 17.550573348999023, + "learning_rate": 1.5307206575122785e-05, + "loss": 1.4479, + "step": 2342 + }, + { + "epoch": 0.7, + "grad_norm": 23.929763793945312, + "learning_rate": 1.5305201964518392e-05, + "loss": 2.6339, + "step": 2343 + }, + { + "epoch": 0.7, + "grad_norm": 18.37746238708496, + "learning_rate": 1.5303197353914002e-05, + "loss": 1.916, + "step": 2344 + }, + { + "epoch": 0.71, + "grad_norm": 32.94269943237305, + "learning_rate": 1.5301192743309616e-05, + "loss": 2.7435, + "step": 2345 + }, + { + "epoch": 0.71, + "grad_norm": 14.292465209960938, + "learning_rate": 1.5299188132705222e-05, + "loss": 1.8105, + "step": 2346 + }, + { + "epoch": 0.71, + "grad_norm": 12.711530685424805, + "learning_rate": 1.5297183522100832e-05, + "loss": 1.9106, + "step": 2347 + }, + { + "epoch": 0.71, + "grad_norm": 16.853271484375, + "learning_rate": 1.5295178911496443e-05, + "loss": 1.9825, + "step": 2348 + }, + { + "epoch": 0.71, + "grad_norm": 17.740068435668945, + "learning_rate": 1.5293174300892053e-05, + "loss": 1.5591, + "step": 2349 + }, + { + "epoch": 0.71, + "grad_norm": 19.151620864868164, + "learning_rate": 1.5291169690287663e-05, + "loss": 3.9344, + "step": 2350 + }, + { + "epoch": 0.71, + "grad_norm": 14.547835350036621, + "learning_rate": 1.5289165079683273e-05, + "loss": 2.0179, + "step": 2351 + }, + { + "epoch": 0.71, + "grad_norm": 14.701192855834961, + "learning_rate": 1.5287160469078883e-05, + "loss": 2.1844, + "step": 2352 + }, + { + "epoch": 0.71, + "grad_norm": 20.547576904296875, + "learning_rate": 1.5285155858474493e-05, + "loss": 2.5008, + "step": 2353 + }, + { + "epoch": 0.71, + "grad_norm": 13.004501342773438, + "learning_rate": 1.5283151247870103e-05, + "loss": 1.8492, + "step": 2354 + }, + { + "epoch": 0.71, + "grad_norm": 18.574153900146484, + "learning_rate": 1.528114663726571e-05, + "loss": 1.7889, + "step": 2355 + }, + { + "epoch": 0.71, + "grad_norm": 21.489116668701172, + "learning_rate": 1.5279142026661323e-05, + "loss": 2.2442, + "step": 2356 + }, + { + "epoch": 0.71, + "grad_norm": 11.905816078186035, + "learning_rate": 1.5277137416056933e-05, + "loss": 1.6658, + "step": 2357 + }, + { + "epoch": 0.71, + "grad_norm": 16.373111724853516, + "learning_rate": 1.527513280545254e-05, + "loss": 1.7949, + "step": 2358 + }, + { + "epoch": 0.71, + "grad_norm": 14.20676326751709, + "learning_rate": 1.5273128194848153e-05, + "loss": 2.0334, + "step": 2359 + }, + { + "epoch": 0.71, + "grad_norm": 14.752305030822754, + "learning_rate": 1.527112358424376e-05, + "loss": 2.1005, + "step": 2360 + }, + { + "epoch": 0.71, + "grad_norm": 13.963676452636719, + "learning_rate": 1.526911897363937e-05, + "loss": 2.9856, + "step": 2361 + }, + { + "epoch": 0.71, + "grad_norm": 11.170400619506836, + "learning_rate": 1.5267114363034984e-05, + "loss": 1.8121, + "step": 2362 + }, + { + "epoch": 0.71, + "grad_norm": 17.13726043701172, + "learning_rate": 1.526510975243059e-05, + "loss": 2.1476, + "step": 2363 + }, + { + "epoch": 0.71, + "grad_norm": 79.9980697631836, + "learning_rate": 1.52631051418262e-05, + "loss": 3.3246, + "step": 2364 + }, + { + "epoch": 0.71, + "grad_norm": 21.73980712890625, + "learning_rate": 1.526110053122181e-05, + "loss": 2.9358, + "step": 2365 + }, + { + "epoch": 0.71, + "grad_norm": 13.559164047241211, + "learning_rate": 1.525909592061742e-05, + "loss": 2.0114, + "step": 2366 + }, + { + "epoch": 0.71, + "grad_norm": 14.406414031982422, + "learning_rate": 1.525709131001303e-05, + "loss": 1.1554, + "step": 2367 + }, + { + "epoch": 0.71, + "grad_norm": 11.396732330322266, + "learning_rate": 1.5255086699408641e-05, + "loss": 2.1299, + "step": 2368 + }, + { + "epoch": 0.71, + "grad_norm": 22.816299438476562, + "learning_rate": 1.5253082088804253e-05, + "loss": 2.8568, + "step": 2369 + }, + { + "epoch": 0.71, + "grad_norm": 19.31545066833496, + "learning_rate": 1.5251077478199861e-05, + "loss": 2.9191, + "step": 2370 + }, + { + "epoch": 0.71, + "grad_norm": 20.314498901367188, + "learning_rate": 1.5249072867595471e-05, + "loss": 1.6643, + "step": 2371 + }, + { + "epoch": 0.71, + "grad_norm": 21.570932388305664, + "learning_rate": 1.524706825699108e-05, + "loss": 2.4345, + "step": 2372 + }, + { + "epoch": 0.71, + "grad_norm": 12.214997291564941, + "learning_rate": 1.5245063646386691e-05, + "loss": 1.8357, + "step": 2373 + }, + { + "epoch": 0.71, + "grad_norm": 15.286229133605957, + "learning_rate": 1.5243059035782301e-05, + "loss": 1.6836, + "step": 2374 + }, + { + "epoch": 0.71, + "grad_norm": 18.07011604309082, + "learning_rate": 1.524105442517791e-05, + "loss": 2.6134, + "step": 2375 + }, + { + "epoch": 0.71, + "grad_norm": 13.150678634643555, + "learning_rate": 1.5239049814573522e-05, + "loss": 2.336, + "step": 2376 + }, + { + "epoch": 0.71, + "grad_norm": 14.786459922790527, + "learning_rate": 1.523704520396913e-05, + "loss": 1.8392, + "step": 2377 + }, + { + "epoch": 0.71, + "grad_norm": 12.723509788513184, + "learning_rate": 1.523504059336474e-05, + "loss": 2.4568, + "step": 2378 + }, + { + "epoch": 0.72, + "grad_norm": 17.796947479248047, + "learning_rate": 1.5233035982760348e-05, + "loss": 2.8035, + "step": 2379 + }, + { + "epoch": 0.72, + "grad_norm": 7.9330854415893555, + "learning_rate": 1.523103137215596e-05, + "loss": 1.0556, + "step": 2380 + }, + { + "epoch": 0.72, + "grad_norm": 20.23641586303711, + "learning_rate": 1.522902676155157e-05, + "loss": 2.4805, + "step": 2381 + }, + { + "epoch": 0.72, + "grad_norm": 14.428129196166992, + "learning_rate": 1.5227022150947179e-05, + "loss": 2.3162, + "step": 2382 + }, + { + "epoch": 0.72, + "grad_norm": 19.485916137695312, + "learning_rate": 1.522501754034279e-05, + "loss": 2.8538, + "step": 2383 + }, + { + "epoch": 0.72, + "grad_norm": 23.651782989501953, + "learning_rate": 1.5223012929738399e-05, + "loss": 2.2296, + "step": 2384 + }, + { + "epoch": 0.72, + "grad_norm": 20.420745849609375, + "learning_rate": 1.5221008319134009e-05, + "loss": 2.8712, + "step": 2385 + }, + { + "epoch": 0.72, + "grad_norm": 13.436408042907715, + "learning_rate": 1.5219003708529619e-05, + "loss": 2.4493, + "step": 2386 + }, + { + "epoch": 0.72, + "grad_norm": 9.011188507080078, + "learning_rate": 1.5216999097925229e-05, + "loss": 1.8621, + "step": 2387 + }, + { + "epoch": 0.72, + "grad_norm": 24.090200424194336, + "learning_rate": 1.521499448732084e-05, + "loss": 2.7823, + "step": 2388 + }, + { + "epoch": 0.72, + "grad_norm": 22.469762802124023, + "learning_rate": 1.521298987671645e-05, + "loss": 2.1563, + "step": 2389 + }, + { + "epoch": 0.72, + "grad_norm": 20.96700668334961, + "learning_rate": 1.521098526611206e-05, + "loss": 2.3123, + "step": 2390 + }, + { + "epoch": 0.72, + "grad_norm": 13.538233757019043, + "learning_rate": 1.5208980655507668e-05, + "loss": 1.8336, + "step": 2391 + }, + { + "epoch": 0.72, + "grad_norm": 19.34774398803711, + "learning_rate": 1.520697604490328e-05, + "loss": 2.2524, + "step": 2392 + }, + { + "epoch": 0.72, + "grad_norm": 23.267269134521484, + "learning_rate": 1.520497143429889e-05, + "loss": 3.195, + "step": 2393 + }, + { + "epoch": 0.72, + "grad_norm": 43.13511657714844, + "learning_rate": 1.5202966823694498e-05, + "loss": 2.9556, + "step": 2394 + }, + { + "epoch": 0.72, + "grad_norm": 12.41313362121582, + "learning_rate": 1.520096221309011e-05, + "loss": 2.4461, + "step": 2395 + }, + { + "epoch": 0.72, + "grad_norm": 16.363807678222656, + "learning_rate": 1.5198957602485718e-05, + "loss": 2.2686, + "step": 2396 + }, + { + "epoch": 0.72, + "grad_norm": 18.332998275756836, + "learning_rate": 1.5196952991881328e-05, + "loss": 2.3713, + "step": 2397 + }, + { + "epoch": 0.72, + "grad_norm": 15.461286544799805, + "learning_rate": 1.5194948381276937e-05, + "loss": 2.0574, + "step": 2398 + }, + { + "epoch": 0.72, + "grad_norm": 32.771728515625, + "learning_rate": 1.5192943770672548e-05, + "loss": 2.7732, + "step": 2399 + }, + { + "epoch": 0.72, + "grad_norm": 15.005850791931152, + "learning_rate": 1.5190939160068158e-05, + "loss": 2.0059, + "step": 2400 + }, + { + "epoch": 0.72, + "eval_loss": 0.2799599766731262, + "eval_runtime": 43.2863, + "eval_samples_per_second": 34.168, + "eval_steps_per_second": 34.168, + "step": 2400 + }, + { + "epoch": 0.72, + "grad_norm": 18.537982940673828, + "learning_rate": 1.5188934549463767e-05, + "loss": 2.3335, + "step": 2401 + }, + { + "epoch": 0.72, + "grad_norm": 28.75159454345703, + "learning_rate": 1.5186929938859379e-05, + "loss": 2.5293, + "step": 2402 + }, + { + "epoch": 0.72, + "grad_norm": 19.689199447631836, + "learning_rate": 1.5184925328254987e-05, + "loss": 3.4202, + "step": 2403 + }, + { + "epoch": 0.72, + "grad_norm": 16.85619354248047, + "learning_rate": 1.5182920717650597e-05, + "loss": 2.1249, + "step": 2404 + }, + { + "epoch": 0.72, + "grad_norm": 15.209796905517578, + "learning_rate": 1.5180916107046209e-05, + "loss": 2.1942, + "step": 2405 + }, + { + "epoch": 0.72, + "grad_norm": 19.145883560180664, + "learning_rate": 1.5178911496441817e-05, + "loss": 2.3506, + "step": 2406 + }, + { + "epoch": 0.72, + "grad_norm": 13.587536811828613, + "learning_rate": 1.5176906885837427e-05, + "loss": 2.483, + "step": 2407 + }, + { + "epoch": 0.72, + "grad_norm": 14.813488960266113, + "learning_rate": 1.5174902275233036e-05, + "loss": 1.6431, + "step": 2408 + }, + { + "epoch": 0.72, + "grad_norm": 7.390352249145508, + "learning_rate": 1.5172897664628648e-05, + "loss": 1.329, + "step": 2409 + }, + { + "epoch": 0.72, + "grad_norm": 124.1446304321289, + "learning_rate": 1.5170893054024256e-05, + "loss": 2.8408, + "step": 2410 + }, + { + "epoch": 0.72, + "grad_norm": 31.081998825073242, + "learning_rate": 1.5168888443419868e-05, + "loss": 2.8232, + "step": 2411 + }, + { + "epoch": 0.73, + "grad_norm": 18.641647338867188, + "learning_rate": 1.5166883832815478e-05, + "loss": 1.8497, + "step": 2412 + }, + { + "epoch": 0.73, + "grad_norm": 17.504941940307617, + "learning_rate": 1.5164879222211086e-05, + "loss": 2.4579, + "step": 2413 + }, + { + "epoch": 0.73, + "grad_norm": 11.096992492675781, + "learning_rate": 1.5162874611606698e-05, + "loss": 2.1797, + "step": 2414 + }, + { + "epoch": 0.73, + "grad_norm": 14.132731437683105, + "learning_rate": 1.5160870001002306e-05, + "loss": 1.5483, + "step": 2415 + }, + { + "epoch": 0.73, + "grad_norm": 49.675132751464844, + "learning_rate": 1.5158865390397916e-05, + "loss": 2.4514, + "step": 2416 + }, + { + "epoch": 0.73, + "grad_norm": 12.76675796508789, + "learning_rate": 1.5156860779793528e-05, + "loss": 1.2279, + "step": 2417 + }, + { + "epoch": 0.73, + "grad_norm": 10.700181007385254, + "learning_rate": 1.5154856169189137e-05, + "loss": 2.0745, + "step": 2418 + }, + { + "epoch": 0.73, + "grad_norm": 17.42853546142578, + "learning_rate": 1.5152851558584747e-05, + "loss": 2.7441, + "step": 2419 + }, + { + "epoch": 0.73, + "grad_norm": 8.670104026794434, + "learning_rate": 1.5150846947980355e-05, + "loss": 1.8953, + "step": 2420 + }, + { + "epoch": 0.73, + "grad_norm": 24.383087158203125, + "learning_rate": 1.5148842337375967e-05, + "loss": 2.6507, + "step": 2421 + }, + { + "epoch": 0.73, + "grad_norm": 12.565937042236328, + "learning_rate": 1.5146837726771575e-05, + "loss": 1.5887, + "step": 2422 + }, + { + "epoch": 0.73, + "grad_norm": 22.554595947265625, + "learning_rate": 1.5144833116167185e-05, + "loss": 2.4942, + "step": 2423 + }, + { + "epoch": 0.73, + "grad_norm": 8.884187698364258, + "learning_rate": 1.5142828505562797e-05, + "loss": 1.2549, + "step": 2424 + }, + { + "epoch": 0.73, + "grad_norm": 15.001145362854004, + "learning_rate": 1.5140823894958406e-05, + "loss": 2.1248, + "step": 2425 + }, + { + "epoch": 0.73, + "grad_norm": 13.459707260131836, + "learning_rate": 1.5138819284354016e-05, + "loss": 2.0756, + "step": 2426 + }, + { + "epoch": 0.73, + "grad_norm": 15.579310417175293, + "learning_rate": 1.5136814673749624e-05, + "loss": 2.8265, + "step": 2427 + }, + { + "epoch": 0.73, + "grad_norm": 12.242110252380371, + "learning_rate": 1.5134810063145236e-05, + "loss": 2.2286, + "step": 2428 + }, + { + "epoch": 0.73, + "grad_norm": 30.915904998779297, + "learning_rate": 1.5132805452540844e-05, + "loss": 2.2951, + "step": 2429 + }, + { + "epoch": 0.73, + "grad_norm": 15.82615852355957, + "learning_rate": 1.5130800841936454e-05, + "loss": 2.0723, + "step": 2430 + }, + { + "epoch": 0.73, + "grad_norm": 15.196529388427734, + "learning_rate": 1.5128796231332066e-05, + "loss": 2.2142, + "step": 2431 + }, + { + "epoch": 0.73, + "grad_norm": 43.0781135559082, + "learning_rate": 1.5126791620727674e-05, + "loss": 3.249, + "step": 2432 + }, + { + "epoch": 0.73, + "grad_norm": 10.74176025390625, + "learning_rate": 1.5124787010123284e-05, + "loss": 1.763, + "step": 2433 + }, + { + "epoch": 0.73, + "grad_norm": 14.15688705444336, + "learning_rate": 1.5122782399518895e-05, + "loss": 1.8488, + "step": 2434 + }, + { + "epoch": 0.73, + "grad_norm": 16.205501556396484, + "learning_rate": 1.5120777788914505e-05, + "loss": 2.8312, + "step": 2435 + }, + { + "epoch": 0.73, + "grad_norm": 23.575292587280273, + "learning_rate": 1.5118773178310115e-05, + "loss": 1.8974, + "step": 2436 + }, + { + "epoch": 0.73, + "grad_norm": 12.507033348083496, + "learning_rate": 1.5116768567705725e-05, + "loss": 2.0892, + "step": 2437 + }, + { + "epoch": 0.73, + "grad_norm": 11.459385871887207, + "learning_rate": 1.5114763957101335e-05, + "loss": 2.232, + "step": 2438 + }, + { + "epoch": 0.73, + "grad_norm": 13.064870834350586, + "learning_rate": 1.5112759346496943e-05, + "loss": 2.9855, + "step": 2439 + }, + { + "epoch": 0.73, + "grad_norm": 21.083112716674805, + "learning_rate": 1.5110754735892555e-05, + "loss": 2.7381, + "step": 2440 + }, + { + "epoch": 0.73, + "grad_norm": 16.428953170776367, + "learning_rate": 1.5108750125288163e-05, + "loss": 2.8427, + "step": 2441 + }, + { + "epoch": 0.73, + "grad_norm": 13.690245628356934, + "learning_rate": 1.5106745514683774e-05, + "loss": 1.9179, + "step": 2442 + }, + { + "epoch": 0.73, + "grad_norm": 13.37304401397705, + "learning_rate": 1.5104740904079385e-05, + "loss": 1.8193, + "step": 2443 + }, + { + "epoch": 0.73, + "grad_norm": 10.359079360961914, + "learning_rate": 1.5102736293474994e-05, + "loss": 2.5558, + "step": 2444 + }, + { + "epoch": 0.74, + "grad_norm": 12.4136381149292, + "learning_rate": 1.5100731682870604e-05, + "loss": 2.191, + "step": 2445 + }, + { + "epoch": 0.74, + "grad_norm": 9.771052360534668, + "learning_rate": 1.5098727072266212e-05, + "loss": 1.5917, + "step": 2446 + }, + { + "epoch": 0.74, + "grad_norm": 12.872698783874512, + "learning_rate": 1.5096722461661824e-05, + "loss": 1.6524, + "step": 2447 + }, + { + "epoch": 0.74, + "grad_norm": 28.35697364807129, + "learning_rate": 1.5094717851057434e-05, + "loss": 2.569, + "step": 2448 + }, + { + "epoch": 0.74, + "grad_norm": 18.459850311279297, + "learning_rate": 1.5092713240453042e-05, + "loss": 2.1667, + "step": 2449 + }, + { + "epoch": 0.74, + "grad_norm": 12.4850492477417, + "learning_rate": 1.5090708629848654e-05, + "loss": 2.2356, + "step": 2450 + }, + { + "epoch": 0.74, + "grad_norm": 21.161081314086914, + "learning_rate": 1.5088704019244263e-05, + "loss": 2.1366, + "step": 2451 + }, + { + "epoch": 0.74, + "grad_norm": 22.59266471862793, + "learning_rate": 1.5086699408639873e-05, + "loss": 2.4504, + "step": 2452 + }, + { + "epoch": 0.74, + "grad_norm": 11.43664836883545, + "learning_rate": 1.5084694798035481e-05, + "loss": 2.434, + "step": 2453 + }, + { + "epoch": 0.74, + "grad_norm": 12.218484878540039, + "learning_rate": 1.5082690187431093e-05, + "loss": 1.2119, + "step": 2454 + }, + { + "epoch": 0.74, + "grad_norm": 10.82869815826416, + "learning_rate": 1.5080685576826703e-05, + "loss": 1.8022, + "step": 2455 + }, + { + "epoch": 0.74, + "grad_norm": 17.006765365600586, + "learning_rate": 1.5078680966222311e-05, + "loss": 2.1377, + "step": 2456 + }, + { + "epoch": 0.74, + "grad_norm": 21.112869262695312, + "learning_rate": 1.5076676355617923e-05, + "loss": 2.799, + "step": 2457 + }, + { + "epoch": 0.74, + "grad_norm": 24.335466384887695, + "learning_rate": 1.5074671745013532e-05, + "loss": 2.4533, + "step": 2458 + }, + { + "epoch": 0.74, + "grad_norm": 20.010364532470703, + "learning_rate": 1.5072667134409142e-05, + "loss": 1.9907, + "step": 2459 + }, + { + "epoch": 0.74, + "grad_norm": 34.753047943115234, + "learning_rate": 1.5070662523804753e-05, + "loss": 2.0948, + "step": 2460 + }, + { + "epoch": 0.74, + "grad_norm": 19.572534561157227, + "learning_rate": 1.5068657913200362e-05, + "loss": 2.1058, + "step": 2461 + }, + { + "epoch": 0.74, + "grad_norm": 13.721003532409668, + "learning_rate": 1.5066653302595972e-05, + "loss": 1.4905, + "step": 2462 + }, + { + "epoch": 0.74, + "grad_norm": 14.475379943847656, + "learning_rate": 1.5064648691991582e-05, + "loss": 1.7658, + "step": 2463 + }, + { + "epoch": 0.74, + "grad_norm": 24.027629852294922, + "learning_rate": 1.5062644081387192e-05, + "loss": 2.6237, + "step": 2464 + }, + { + "epoch": 0.74, + "grad_norm": 15.826460838317871, + "learning_rate": 1.50606394707828e-05, + "loss": 2.6549, + "step": 2465 + }, + { + "epoch": 0.74, + "grad_norm": 43.245018005371094, + "learning_rate": 1.5058634860178412e-05, + "loss": 2.9669, + "step": 2466 + }, + { + "epoch": 0.74, + "grad_norm": 13.400920867919922, + "learning_rate": 1.5056630249574022e-05, + "loss": 1.9453, + "step": 2467 + }, + { + "epoch": 0.74, + "grad_norm": 15.512686729431152, + "learning_rate": 1.505462563896963e-05, + "loss": 1.9563, + "step": 2468 + }, + { + "epoch": 0.74, + "grad_norm": 18.510339736938477, + "learning_rate": 1.5052621028365242e-05, + "loss": 1.5118, + "step": 2469 + }, + { + "epoch": 0.74, + "grad_norm": 13.163358688354492, + "learning_rate": 1.505061641776085e-05, + "loss": 1.8231, + "step": 2470 + }, + { + "epoch": 0.74, + "grad_norm": 8.8781156539917, + "learning_rate": 1.5048611807156461e-05, + "loss": 1.1656, + "step": 2471 + }, + { + "epoch": 0.74, + "grad_norm": 25.674057006835938, + "learning_rate": 1.504660719655207e-05, + "loss": 2.7279, + "step": 2472 + }, + { + "epoch": 0.74, + "grad_norm": 16.811283111572266, + "learning_rate": 1.5044602585947681e-05, + "loss": 2.0451, + "step": 2473 + }, + { + "epoch": 0.74, + "grad_norm": 15.99625301361084, + "learning_rate": 1.5042597975343291e-05, + "loss": 1.7554, + "step": 2474 + }, + { + "epoch": 0.74, + "grad_norm": 16.740371704101562, + "learning_rate": 1.50405933647389e-05, + "loss": 2.9046, + "step": 2475 + }, + { + "epoch": 0.74, + "grad_norm": 11.545669555664062, + "learning_rate": 1.5038588754134511e-05, + "loss": 1.1914, + "step": 2476 + }, + { + "epoch": 0.74, + "grad_norm": 12.485255241394043, + "learning_rate": 1.503658414353012e-05, + "loss": 2.0212, + "step": 2477 + }, + { + "epoch": 0.75, + "grad_norm": 28.34869956970215, + "learning_rate": 1.503457953292573e-05, + "loss": 1.4173, + "step": 2478 + }, + { + "epoch": 0.75, + "grad_norm": 14.046238899230957, + "learning_rate": 1.5032574922321342e-05, + "loss": 2.3209, + "step": 2479 + }, + { + "epoch": 0.75, + "grad_norm": 40.98884201049805, + "learning_rate": 1.503057031171695e-05, + "loss": 2.7137, + "step": 2480 + }, + { + "epoch": 0.75, + "grad_norm": 16.54846954345703, + "learning_rate": 1.502856570111256e-05, + "loss": 1.7074, + "step": 2481 + }, + { + "epoch": 0.75, + "grad_norm": 34.91267776489258, + "learning_rate": 1.502656109050817e-05, + "loss": 3.7392, + "step": 2482 + }, + { + "epoch": 0.75, + "grad_norm": 11.860535621643066, + "learning_rate": 1.502455647990378e-05, + "loss": 2.6046, + "step": 2483 + }, + { + "epoch": 0.75, + "grad_norm": 14.997228622436523, + "learning_rate": 1.5022551869299389e-05, + "loss": 2.761, + "step": 2484 + }, + { + "epoch": 0.75, + "grad_norm": 15.930320739746094, + "learning_rate": 1.5020547258695e-05, + "loss": 2.0798, + "step": 2485 + }, + { + "epoch": 0.75, + "grad_norm": 28.564027786254883, + "learning_rate": 1.501854264809061e-05, + "loss": 2.5476, + "step": 2486 + }, + { + "epoch": 0.75, + "grad_norm": 18.10663604736328, + "learning_rate": 1.5016538037486219e-05, + "loss": 2.6392, + "step": 2487 + }, + { + "epoch": 0.75, + "grad_norm": 22.97527313232422, + "learning_rate": 1.501453342688183e-05, + "loss": 2.0554, + "step": 2488 + }, + { + "epoch": 0.75, + "grad_norm": 18.91280174255371, + "learning_rate": 1.5012528816277439e-05, + "loss": 2.5951, + "step": 2489 + }, + { + "epoch": 0.75, + "grad_norm": 22.503339767456055, + "learning_rate": 1.5010524205673049e-05, + "loss": 1.7317, + "step": 2490 + }, + { + "epoch": 0.75, + "grad_norm": 20.57047462463379, + "learning_rate": 1.5008519595068661e-05, + "loss": 2.3906, + "step": 2491 + }, + { + "epoch": 0.75, + "grad_norm": 17.44088363647461, + "learning_rate": 1.500651498446427e-05, + "loss": 2.3833, + "step": 2492 + }, + { + "epoch": 0.75, + "grad_norm": 11.56372356414795, + "learning_rate": 1.500451037385988e-05, + "loss": 2.2393, + "step": 2493 + }, + { + "epoch": 0.75, + "grad_norm": 36.66616439819336, + "learning_rate": 1.5002505763255488e-05, + "loss": 2.0996, + "step": 2494 + }, + { + "epoch": 0.75, + "grad_norm": 59.76615905761719, + "learning_rate": 1.50005011526511e-05, + "loss": 1.3206, + "step": 2495 + }, + { + "epoch": 0.75, + "grad_norm": 10.487852096557617, + "learning_rate": 1.4998496542046708e-05, + "loss": 1.2596, + "step": 2496 + }, + { + "epoch": 0.75, + "grad_norm": 17.123096466064453, + "learning_rate": 1.4996491931442318e-05, + "loss": 2.1562, + "step": 2497 + }, + { + "epoch": 0.75, + "grad_norm": 13.120617866516113, + "learning_rate": 1.499448732083793e-05, + "loss": 1.2493, + "step": 2498 + }, + { + "epoch": 0.75, + "grad_norm": 11.029601097106934, + "learning_rate": 1.4992482710233538e-05, + "loss": 0.918, + "step": 2499 + }, + { + "epoch": 0.75, + "grad_norm": 38.185813903808594, + "learning_rate": 1.4990478099629148e-05, + "loss": 3.1123, + "step": 2500 + }, + { + "epoch": 0.75, + "grad_norm": 21.2143611907959, + "learning_rate": 1.4988473489024757e-05, + "loss": 1.9136, + "step": 2501 + }, + { + "epoch": 0.75, + "grad_norm": 18.287214279174805, + "learning_rate": 1.4986468878420368e-05, + "loss": 2.3058, + "step": 2502 + }, + { + "epoch": 0.75, + "grad_norm": 20.503644943237305, + "learning_rate": 1.4984464267815979e-05, + "loss": 3.3248, + "step": 2503 + }, + { + "epoch": 0.75, + "grad_norm": 18.117403030395508, + "learning_rate": 1.4982459657211587e-05, + "loss": 2.5097, + "step": 2504 + }, + { + "epoch": 0.75, + "grad_norm": 16.31001853942871, + "learning_rate": 1.4980455046607199e-05, + "loss": 1.7266, + "step": 2505 + }, + { + "epoch": 0.75, + "grad_norm": 12.371569633483887, + "learning_rate": 1.4978450436002807e-05, + "loss": 1.9252, + "step": 2506 + }, + { + "epoch": 0.75, + "grad_norm": 11.64188289642334, + "learning_rate": 1.4976445825398417e-05, + "loss": 2.0164, + "step": 2507 + }, + { + "epoch": 0.75, + "grad_norm": 17.393896102905273, + "learning_rate": 1.4974441214794027e-05, + "loss": 2.4099, + "step": 2508 + }, + { + "epoch": 0.75, + "grad_norm": 16.229440689086914, + "learning_rate": 1.4972436604189637e-05, + "loss": 2.8192, + "step": 2509 + }, + { + "epoch": 0.75, + "grad_norm": 19.683637619018555, + "learning_rate": 1.4970431993585247e-05, + "loss": 2.0972, + "step": 2510 + }, + { + "epoch": 0.75, + "grad_norm": 23.35310173034668, + "learning_rate": 1.4968427382980858e-05, + "loss": 2.0364, + "step": 2511 + }, + { + "epoch": 0.76, + "grad_norm": 10.506977081298828, + "learning_rate": 1.4966422772376468e-05, + "loss": 1.575, + "step": 2512 + }, + { + "epoch": 0.76, + "grad_norm": 24.63909339904785, + "learning_rate": 1.4964418161772076e-05, + "loss": 1.9545, + "step": 2513 + }, + { + "epoch": 0.76, + "grad_norm": 18.600322723388672, + "learning_rate": 1.4962413551167688e-05, + "loss": 2.0238, + "step": 2514 + }, + { + "epoch": 0.76, + "grad_norm": 25.803688049316406, + "learning_rate": 1.4960408940563296e-05, + "loss": 1.7973, + "step": 2515 + }, + { + "epoch": 0.76, + "grad_norm": 13.325533866882324, + "learning_rate": 1.4958404329958906e-05, + "loss": 1.5489, + "step": 2516 + }, + { + "epoch": 0.76, + "grad_norm": 17.538631439208984, + "learning_rate": 1.4956399719354518e-05, + "loss": 1.6318, + "step": 2517 + }, + { + "epoch": 0.76, + "grad_norm": 20.671701431274414, + "learning_rate": 1.4954395108750126e-05, + "loss": 2.2905, + "step": 2518 + }, + { + "epoch": 0.76, + "grad_norm": 18.96885871887207, + "learning_rate": 1.4952390498145736e-05, + "loss": 1.2961, + "step": 2519 + }, + { + "epoch": 0.76, + "grad_norm": 17.124177932739258, + "learning_rate": 1.4950385887541345e-05, + "loss": 2.1604, + "step": 2520 + }, + { + "epoch": 0.76, + "eval_loss": 0.269967257976532, + "eval_runtime": 43.2806, + "eval_samples_per_second": 34.172, + "eval_steps_per_second": 34.172, + "step": 2520 + }, + { + "epoch": 0.76, + "grad_norm": 22.06162452697754, + "learning_rate": 1.4948381276936957e-05, + "loss": 2.1107, + "step": 2521 + }, + { + "epoch": 0.76, + "grad_norm": 15.108476638793945, + "learning_rate": 1.4946376666332567e-05, + "loss": 1.7432, + "step": 2522 + }, + { + "epoch": 0.76, + "grad_norm": 13.295374870300293, + "learning_rate": 1.4944372055728175e-05, + "loss": 2.03, + "step": 2523 + }, + { + "epoch": 0.76, + "grad_norm": 19.436033248901367, + "learning_rate": 1.4942367445123787e-05, + "loss": 1.6658, + "step": 2524 + }, + { + "epoch": 0.76, + "grad_norm": 36.37664031982422, + "learning_rate": 1.4940362834519395e-05, + "loss": 3.2737, + "step": 2525 + }, + { + "epoch": 0.76, + "grad_norm": 11.738432884216309, + "learning_rate": 1.4938358223915005e-05, + "loss": 1.6208, + "step": 2526 + }, + { + "epoch": 0.76, + "grad_norm": 17.715194702148438, + "learning_rate": 1.4936353613310614e-05, + "loss": 2.3697, + "step": 2527 + }, + { + "epoch": 0.76, + "grad_norm": 21.08336639404297, + "learning_rate": 1.4934349002706226e-05, + "loss": 2.4212, + "step": 2528 + }, + { + "epoch": 0.76, + "grad_norm": 19.754703521728516, + "learning_rate": 1.4932344392101836e-05, + "loss": 1.71, + "step": 2529 + }, + { + "epoch": 0.76, + "grad_norm": 21.412639617919922, + "learning_rate": 1.4930339781497446e-05, + "loss": 2.807, + "step": 2530 + }, + { + "epoch": 0.76, + "grad_norm": 18.40741729736328, + "learning_rate": 1.4928335170893056e-05, + "loss": 2.2224, + "step": 2531 + }, + { + "epoch": 0.76, + "grad_norm": 33.87398147583008, + "learning_rate": 1.4926330560288664e-05, + "loss": 2.8005, + "step": 2532 + }, + { + "epoch": 0.76, + "grad_norm": 14.195319175720215, + "learning_rate": 1.4924325949684276e-05, + "loss": 2.4552, + "step": 2533 + }, + { + "epoch": 0.76, + "grad_norm": 14.160918235778809, + "learning_rate": 1.4922321339079886e-05, + "loss": 2.4736, + "step": 2534 + }, + { + "epoch": 0.76, + "grad_norm": 31.502965927124023, + "learning_rate": 1.4920316728475494e-05, + "loss": 2.5237, + "step": 2535 + }, + { + "epoch": 0.76, + "grad_norm": 17.290319442749023, + "learning_rate": 1.4918312117871106e-05, + "loss": 2.2399, + "step": 2536 + }, + { + "epoch": 0.76, + "grad_norm": 13.980632781982422, + "learning_rate": 1.4916307507266715e-05, + "loss": 2.3494, + "step": 2537 + }, + { + "epoch": 0.76, + "grad_norm": 14.719244956970215, + "learning_rate": 1.4914302896662325e-05, + "loss": 2.5073, + "step": 2538 + }, + { + "epoch": 0.76, + "grad_norm": 19.83281707763672, + "learning_rate": 1.4912298286057933e-05, + "loss": 2.4892, + "step": 2539 + }, + { + "epoch": 0.76, + "grad_norm": 13.933186531066895, + "learning_rate": 1.4910293675453545e-05, + "loss": 2.6791, + "step": 2540 + }, + { + "epoch": 0.76, + "grad_norm": 20.118640899658203, + "learning_rate": 1.4908289064849155e-05, + "loss": 1.996, + "step": 2541 + }, + { + "epoch": 0.76, + "grad_norm": 20.20134162902832, + "learning_rate": 1.4906284454244763e-05, + "loss": 2.2007, + "step": 2542 + }, + { + "epoch": 0.76, + "grad_norm": 14.23250961303711, + "learning_rate": 1.4904279843640375e-05, + "loss": 2.2191, + "step": 2543 + }, + { + "epoch": 0.76, + "grad_norm": 14.728852272033691, + "learning_rate": 1.4902275233035984e-05, + "loss": 1.9154, + "step": 2544 + }, + { + "epoch": 0.77, + "grad_norm": 38.36711120605469, + "learning_rate": 1.4900270622431594e-05, + "loss": 2.367, + "step": 2545 + }, + { + "epoch": 0.77, + "grad_norm": 17.614601135253906, + "learning_rate": 1.4898266011827205e-05, + "loss": 1.1838, + "step": 2546 + }, + { + "epoch": 0.77, + "grad_norm": 23.657424926757812, + "learning_rate": 1.4896261401222814e-05, + "loss": 2.6376, + "step": 2547 + }, + { + "epoch": 0.77, + "grad_norm": 10.91047191619873, + "learning_rate": 1.4894256790618424e-05, + "loss": 2.0975, + "step": 2548 + }, + { + "epoch": 0.77, + "grad_norm": 19.83192253112793, + "learning_rate": 1.4892252180014032e-05, + "loss": 2.7231, + "step": 2549 + }, + { + "epoch": 0.77, + "grad_norm": 6.836325645446777, + "learning_rate": 1.4890247569409644e-05, + "loss": 1.3988, + "step": 2550 + }, + { + "epoch": 0.77, + "grad_norm": 16.332334518432617, + "learning_rate": 1.4888242958805252e-05, + "loss": 1.9877, + "step": 2551 + }, + { + "epoch": 0.77, + "grad_norm": 23.387727737426758, + "learning_rate": 1.4886238348200862e-05, + "loss": 2.8849, + "step": 2552 + }, + { + "epoch": 0.77, + "grad_norm": 33.43765640258789, + "learning_rate": 1.4884233737596474e-05, + "loss": 2.3284, + "step": 2553 + }, + { + "epoch": 0.77, + "grad_norm": 10.308141708374023, + "learning_rate": 1.4882229126992083e-05, + "loss": 2.6766, + "step": 2554 + }, + { + "epoch": 0.77, + "grad_norm": 20.615131378173828, + "learning_rate": 1.4880224516387693e-05, + "loss": 2.4127, + "step": 2555 + }, + { + "epoch": 0.77, + "grad_norm": 18.643369674682617, + "learning_rate": 1.4878219905783303e-05, + "loss": 1.9807, + "step": 2556 + }, + { + "epoch": 0.77, + "grad_norm": 29.241533279418945, + "learning_rate": 1.4876215295178913e-05, + "loss": 2.6441, + "step": 2557 + }, + { + "epoch": 0.77, + "grad_norm": 19.864036560058594, + "learning_rate": 1.4874210684574521e-05, + "loss": 2.237, + "step": 2558 + }, + { + "epoch": 0.77, + "grad_norm": 9.182751655578613, + "learning_rate": 1.4872206073970133e-05, + "loss": 1.6721, + "step": 2559 + }, + { + "epoch": 0.77, + "grad_norm": 17.696592330932617, + "learning_rate": 1.4870201463365743e-05, + "loss": 1.8645, + "step": 2560 + }, + { + "epoch": 0.77, + "grad_norm": 10.09614086151123, + "learning_rate": 1.4868196852761352e-05, + "loss": 1.4887, + "step": 2561 + }, + { + "epoch": 0.77, + "grad_norm": 10.682119369506836, + "learning_rate": 1.4866192242156963e-05, + "loss": 1.296, + "step": 2562 + }, + { + "epoch": 0.77, + "grad_norm": 18.929485321044922, + "learning_rate": 1.4864187631552572e-05, + "loss": 2.7717, + "step": 2563 + }, + { + "epoch": 0.77, + "grad_norm": 21.349645614624023, + "learning_rate": 1.4862183020948182e-05, + "loss": 2.4609, + "step": 2564 + }, + { + "epoch": 0.77, + "grad_norm": 7.982182502746582, + "learning_rate": 1.4860178410343794e-05, + "loss": 1.9265, + "step": 2565 + }, + { + "epoch": 0.77, + "grad_norm": 15.188501358032227, + "learning_rate": 1.4858173799739402e-05, + "loss": 3.0483, + "step": 2566 + }, + { + "epoch": 0.77, + "grad_norm": 37.26816177368164, + "learning_rate": 1.4856169189135012e-05, + "loss": 2.585, + "step": 2567 + }, + { + "epoch": 0.77, + "grad_norm": 13.02489948272705, + "learning_rate": 1.485416457853062e-05, + "loss": 1.8137, + "step": 2568 + }, + { + "epoch": 0.77, + "grad_norm": 15.654376029968262, + "learning_rate": 1.4852159967926232e-05, + "loss": 2.2203, + "step": 2569 + }, + { + "epoch": 0.77, + "grad_norm": 9.267987251281738, + "learning_rate": 1.485015535732184e-05, + "loss": 1.362, + "step": 2570 + }, + { + "epoch": 0.77, + "grad_norm": 31.388477325439453, + "learning_rate": 1.484815074671745e-05, + "loss": 2.3867, + "step": 2571 + }, + { + "epoch": 0.77, + "grad_norm": 19.304162979125977, + "learning_rate": 1.4846146136113062e-05, + "loss": 2.6042, + "step": 2572 + }, + { + "epoch": 0.77, + "grad_norm": 21.53330421447754, + "learning_rate": 1.4844141525508671e-05, + "loss": 2.2464, + "step": 2573 + }, + { + "epoch": 0.77, + "grad_norm": 14.074399948120117, + "learning_rate": 1.4842136914904281e-05, + "loss": 1.6423, + "step": 2574 + }, + { + "epoch": 0.77, + "grad_norm": 30.000886917114258, + "learning_rate": 1.484013230429989e-05, + "loss": 1.9405, + "step": 2575 + }, + { + "epoch": 0.77, + "grad_norm": 12.38129711151123, + "learning_rate": 1.4838127693695501e-05, + "loss": 1.6343, + "step": 2576 + }, + { + "epoch": 0.77, + "grad_norm": 15.83830451965332, + "learning_rate": 1.4836123083091111e-05, + "loss": 1.4498, + "step": 2577 + }, + { + "epoch": 0.78, + "grad_norm": 20.503822326660156, + "learning_rate": 1.483411847248672e-05, + "loss": 2.8636, + "step": 2578 + }, + { + "epoch": 0.78, + "grad_norm": 15.516641616821289, + "learning_rate": 1.4832113861882331e-05, + "loss": 2.0966, + "step": 2579 + }, + { + "epoch": 0.78, + "grad_norm": 18.49781036376953, + "learning_rate": 1.483010925127794e-05, + "loss": 3.8588, + "step": 2580 + }, + { + "epoch": 0.78, + "grad_norm": 11.766064643859863, + "learning_rate": 1.4828104640673552e-05, + "loss": 3.0766, + "step": 2581 + }, + { + "epoch": 0.78, + "grad_norm": 28.008140563964844, + "learning_rate": 1.482610003006916e-05, + "loss": 2.8604, + "step": 2582 + }, + { + "epoch": 0.78, + "grad_norm": 11.355888366699219, + "learning_rate": 1.482409541946477e-05, + "loss": 1.8569, + "step": 2583 + }, + { + "epoch": 0.78, + "grad_norm": 24.921497344970703, + "learning_rate": 1.4822090808860382e-05, + "loss": 2.3311, + "step": 2584 + }, + { + "epoch": 0.78, + "grad_norm": 49.57196807861328, + "learning_rate": 1.482008619825599e-05, + "loss": 3.8008, + "step": 2585 + }, + { + "epoch": 0.78, + "grad_norm": 18.97112274169922, + "learning_rate": 1.48180815876516e-05, + "loss": 2.2731, + "step": 2586 + }, + { + "epoch": 0.78, + "grad_norm": 19.699682235717773, + "learning_rate": 1.4816076977047209e-05, + "loss": 3.2435, + "step": 2587 + }, + { + "epoch": 0.78, + "grad_norm": 15.111373901367188, + "learning_rate": 1.481407236644282e-05, + "loss": 2.6261, + "step": 2588 + }, + { + "epoch": 0.78, + "grad_norm": 39.739288330078125, + "learning_rate": 1.481206775583843e-05, + "loss": 3.0957, + "step": 2589 + }, + { + "epoch": 0.78, + "grad_norm": 7.469949245452881, + "learning_rate": 1.4810063145234039e-05, + "loss": 1.5037, + "step": 2590 + }, + { + "epoch": 0.78, + "grad_norm": 48.33216857910156, + "learning_rate": 1.480805853462965e-05, + "loss": 1.5074, + "step": 2591 + }, + { + "epoch": 0.78, + "grad_norm": 14.457548141479492, + "learning_rate": 1.4806053924025259e-05, + "loss": 2.0293, + "step": 2592 + }, + { + "epoch": 0.78, + "grad_norm": 9.994394302368164, + "learning_rate": 1.480404931342087e-05, + "loss": 1.6135, + "step": 2593 + }, + { + "epoch": 0.78, + "grad_norm": 16.707799911499023, + "learning_rate": 1.4802044702816478e-05, + "loss": 2.0803, + "step": 2594 + }, + { + "epoch": 0.78, + "grad_norm": 11.45969009399414, + "learning_rate": 1.480004009221209e-05, + "loss": 2.0687, + "step": 2595 + }, + { + "epoch": 0.78, + "grad_norm": 17.346237182617188, + "learning_rate": 1.47980354816077e-05, + "loss": 1.968, + "step": 2596 + }, + { + "epoch": 0.78, + "grad_norm": 16.296354293823242, + "learning_rate": 1.4796030871003308e-05, + "loss": 2.7071, + "step": 2597 + }, + { + "epoch": 0.78, + "grad_norm": 24.699426651000977, + "learning_rate": 1.479402626039892e-05, + "loss": 2.575, + "step": 2598 + }, + { + "epoch": 0.78, + "grad_norm": 15.403938293457031, + "learning_rate": 1.4792021649794528e-05, + "loss": 2.3392, + "step": 2599 + }, + { + "epoch": 0.78, + "grad_norm": 13.177908897399902, + "learning_rate": 1.4790017039190138e-05, + "loss": 1.4653, + "step": 2600 + }, + { + "epoch": 0.78, + "grad_norm": 34.016021728515625, + "learning_rate": 1.4788012428585748e-05, + "loss": 2.3153, + "step": 2601 + }, + { + "epoch": 0.78, + "grad_norm": 15.840597152709961, + "learning_rate": 1.4786007817981358e-05, + "loss": 2.5034, + "step": 2602 + }, + { + "epoch": 0.78, + "grad_norm": 31.41590118408203, + "learning_rate": 1.4784003207376968e-05, + "loss": 2.0677, + "step": 2603 + }, + { + "epoch": 0.78, + "grad_norm": 12.184178352355957, + "learning_rate": 1.4781998596772578e-05, + "loss": 2.0374, + "step": 2604 + }, + { + "epoch": 0.78, + "grad_norm": 16.264497756958008, + "learning_rate": 1.4779993986168188e-05, + "loss": 2.4106, + "step": 2605 + }, + { + "epoch": 0.78, + "grad_norm": 16.872928619384766, + "learning_rate": 1.4777989375563797e-05, + "loss": 2.3471, + "step": 2606 + }, + { + "epoch": 0.78, + "grad_norm": 47.74468994140625, + "learning_rate": 1.4775984764959409e-05, + "loss": 3.2204, + "step": 2607 + }, + { + "epoch": 0.78, + "grad_norm": 20.217355728149414, + "learning_rate": 1.4773980154355019e-05, + "loss": 1.9179, + "step": 2608 + }, + { + "epoch": 0.78, + "grad_norm": 13.135704040527344, + "learning_rate": 1.4771975543750627e-05, + "loss": 2.2236, + "step": 2609 + }, + { + "epoch": 0.78, + "grad_norm": 10.10952377319336, + "learning_rate": 1.4769970933146239e-05, + "loss": 1.2541, + "step": 2610 + }, + { + "epoch": 0.79, + "grad_norm": 14.991072654724121, + "learning_rate": 1.4767966322541847e-05, + "loss": 1.6626, + "step": 2611 + }, + { + "epoch": 0.79, + "grad_norm": 18.55371856689453, + "learning_rate": 1.4765961711937457e-05, + "loss": 2.59, + "step": 2612 + }, + { + "epoch": 0.79, + "grad_norm": 153.79713439941406, + "learning_rate": 1.4763957101333066e-05, + "loss": 1.6441, + "step": 2613 + }, + { + "epoch": 0.79, + "grad_norm": 20.791709899902344, + "learning_rate": 1.4761952490728678e-05, + "loss": 2.2488, + "step": 2614 + }, + { + "epoch": 0.79, + "grad_norm": 26.96784019470215, + "learning_rate": 1.4759947880124288e-05, + "loss": 2.4947, + "step": 2615 + }, + { + "epoch": 0.79, + "grad_norm": 31.95100975036621, + "learning_rate": 1.4757943269519896e-05, + "loss": 2.5303, + "step": 2616 + }, + { + "epoch": 0.79, + "grad_norm": 16.064725875854492, + "learning_rate": 1.4755938658915508e-05, + "loss": 2.2598, + "step": 2617 + }, + { + "epoch": 0.79, + "grad_norm": 19.35765266418457, + "learning_rate": 1.4753934048311116e-05, + "loss": 2.0552, + "step": 2618 + }, + { + "epoch": 0.79, + "grad_norm": 11.326220512390137, + "learning_rate": 1.4751929437706726e-05, + "loss": 2.1031, + "step": 2619 + }, + { + "epoch": 0.79, + "grad_norm": 8.919992446899414, + "learning_rate": 1.4749924827102338e-05, + "loss": 1.8168, + "step": 2620 + }, + { + "epoch": 0.79, + "grad_norm": 14.81724739074707, + "learning_rate": 1.4747920216497946e-05, + "loss": 1.9064, + "step": 2621 + }, + { + "epoch": 0.79, + "grad_norm": 31.04239845275879, + "learning_rate": 1.4745915605893557e-05, + "loss": 2.7644, + "step": 2622 + }, + { + "epoch": 0.79, + "grad_norm": 16.15270233154297, + "learning_rate": 1.4743910995289165e-05, + "loss": 2.0384, + "step": 2623 + }, + { + "epoch": 0.79, + "grad_norm": 20.225746154785156, + "learning_rate": 1.4741906384684777e-05, + "loss": 2.7575, + "step": 2624 + }, + { + "epoch": 0.79, + "grad_norm": 29.362613677978516, + "learning_rate": 1.4739901774080385e-05, + "loss": 2.0288, + "step": 2625 + }, + { + "epoch": 0.79, + "grad_norm": 18.233638763427734, + "learning_rate": 1.4737897163475995e-05, + "loss": 2.4319, + "step": 2626 + }, + { + "epoch": 0.79, + "grad_norm": 15.207660675048828, + "learning_rate": 1.4735892552871607e-05, + "loss": 2.1672, + "step": 2627 + }, + { + "epoch": 0.79, + "grad_norm": 21.931516647338867, + "learning_rate": 1.4733887942267215e-05, + "loss": 2.7001, + "step": 2628 + }, + { + "epoch": 0.79, + "grad_norm": 11.918092727661133, + "learning_rate": 1.4731883331662825e-05, + "loss": 2.1923, + "step": 2629 + }, + { + "epoch": 0.79, + "grad_norm": 24.173839569091797, + "learning_rate": 1.4729878721058436e-05, + "loss": 2.7606, + "step": 2630 + }, + { + "epoch": 0.79, + "grad_norm": 18.89200782775879, + "learning_rate": 1.4727874110454046e-05, + "loss": 2.3708, + "step": 2631 + }, + { + "epoch": 0.79, + "grad_norm": 18.636999130249023, + "learning_rate": 1.4725869499849654e-05, + "loss": 2.3505, + "step": 2632 + }, + { + "epoch": 0.79, + "grad_norm": 24.666654586791992, + "learning_rate": 1.4723864889245266e-05, + "loss": 2.4304, + "step": 2633 + }, + { + "epoch": 0.79, + "grad_norm": 10.553362846374512, + "learning_rate": 1.4721860278640876e-05, + "loss": 1.8944, + "step": 2634 + }, + { + "epoch": 0.79, + "grad_norm": 21.21739387512207, + "learning_rate": 1.4719855668036484e-05, + "loss": 2.6751, + "step": 2635 + }, + { + "epoch": 0.79, + "grad_norm": 38.01958465576172, + "learning_rate": 1.4717851057432096e-05, + "loss": 1.8805, + "step": 2636 + }, + { + "epoch": 0.79, + "grad_norm": 16.428316116333008, + "learning_rate": 1.4715846446827704e-05, + "loss": 2.9402, + "step": 2637 + }, + { + "epoch": 0.79, + "grad_norm": 22.54930305480957, + "learning_rate": 1.4713841836223315e-05, + "loss": 1.9364, + "step": 2638 + }, + { + "epoch": 0.79, + "grad_norm": 14.97050952911377, + "learning_rate": 1.4711837225618926e-05, + "loss": 1.7904, + "step": 2639 + }, + { + "epoch": 0.79, + "grad_norm": 16.823516845703125, + "learning_rate": 1.4709832615014535e-05, + "loss": 2.2004, + "step": 2640 + }, + { + "epoch": 0.79, + "eval_loss": 0.28386011719703674, + "eval_runtime": 44.8703, + "eval_samples_per_second": 32.962, + "eval_steps_per_second": 32.962, + "step": 2640 + }, + { + "epoch": 0.79, + "grad_norm": 14.531737327575684, + "learning_rate": 1.4707828004410145e-05, + "loss": 2.1963, + "step": 2641 + }, + { + "epoch": 0.79, + "grad_norm": 24.638132095336914, + "learning_rate": 1.4705823393805753e-05, + "loss": 2.2464, + "step": 2642 + }, + { + "epoch": 0.79, + "grad_norm": 17.104135513305664, + "learning_rate": 1.4703818783201365e-05, + "loss": 2.0654, + "step": 2643 + }, + { + "epoch": 0.79, + "grad_norm": 7.9517621994018555, + "learning_rate": 1.4701814172596973e-05, + "loss": 0.8721, + "step": 2644 + }, + { + "epoch": 0.8, + "grad_norm": 23.127620697021484, + "learning_rate": 1.4699809561992583e-05, + "loss": 2.0121, + "step": 2645 + }, + { + "epoch": 0.8, + "grad_norm": 18.799203872680664, + "learning_rate": 1.4697804951388195e-05, + "loss": 2.0454, + "step": 2646 + }, + { + "epoch": 0.8, + "grad_norm": 22.311729431152344, + "learning_rate": 1.4695800340783804e-05, + "loss": 2.1801, + "step": 2647 + }, + { + "epoch": 0.8, + "grad_norm": 17.162185668945312, + "learning_rate": 1.4693795730179414e-05, + "loss": 1.814, + "step": 2648 + }, + { + "epoch": 0.8, + "grad_norm": 9.871721267700195, + "learning_rate": 1.4691791119575024e-05, + "loss": 1.6796, + "step": 2649 + }, + { + "epoch": 0.8, + "grad_norm": 18.128135681152344, + "learning_rate": 1.4689786508970634e-05, + "loss": 2.3865, + "step": 2650 + }, + { + "epoch": 0.8, + "grad_norm": 15.663517951965332, + "learning_rate": 1.4687781898366244e-05, + "loss": 1.9406, + "step": 2651 + }, + { + "epoch": 0.8, + "grad_norm": 37.457672119140625, + "learning_rate": 1.4685777287761854e-05, + "loss": 2.7683, + "step": 2652 + }, + { + "epoch": 0.8, + "grad_norm": 19.439468383789062, + "learning_rate": 1.4683772677157464e-05, + "loss": 2.354, + "step": 2653 + }, + { + "epoch": 0.8, + "grad_norm": 29.902244567871094, + "learning_rate": 1.4681768066553072e-05, + "loss": 1.993, + "step": 2654 + }, + { + "epoch": 0.8, + "grad_norm": 11.236685752868652, + "learning_rate": 1.4679763455948684e-05, + "loss": 1.8682, + "step": 2655 + }, + { + "epoch": 0.8, + "grad_norm": 11.110152244567871, + "learning_rate": 1.4677758845344293e-05, + "loss": 1.378, + "step": 2656 + }, + { + "epoch": 0.8, + "grad_norm": 15.20004653930664, + "learning_rate": 1.4675754234739903e-05, + "loss": 2.4361, + "step": 2657 + }, + { + "epoch": 0.8, + "grad_norm": 44.016544342041016, + "learning_rate": 1.4673749624135514e-05, + "loss": 2.6364, + "step": 2658 + }, + { + "epoch": 0.8, + "grad_norm": 18.134666442871094, + "learning_rate": 1.4671745013531123e-05, + "loss": 2.1308, + "step": 2659 + }, + { + "epoch": 0.8, + "grad_norm": 83.64218139648438, + "learning_rate": 1.4669740402926733e-05, + "loss": 3.793, + "step": 2660 + }, + { + "epoch": 0.8, + "grad_norm": 18.634946823120117, + "learning_rate": 1.4667735792322341e-05, + "loss": 2.0182, + "step": 2661 + }, + { + "epoch": 0.8, + "grad_norm": 46.72085189819336, + "learning_rate": 1.4665731181717953e-05, + "loss": 2.1095, + "step": 2662 + }, + { + "epoch": 0.8, + "grad_norm": 18.656938552856445, + "learning_rate": 1.4663726571113563e-05, + "loss": 2.4388, + "step": 2663 + }, + { + "epoch": 0.8, + "grad_norm": 16.998310089111328, + "learning_rate": 1.4661721960509172e-05, + "loss": 1.9669, + "step": 2664 + }, + { + "epoch": 0.8, + "grad_norm": 25.790451049804688, + "learning_rate": 1.4659717349904783e-05, + "loss": 2.6492, + "step": 2665 + }, + { + "epoch": 0.8, + "grad_norm": 11.008011817932129, + "learning_rate": 1.4657712739300392e-05, + "loss": 1.926, + "step": 2666 + }, + { + "epoch": 0.8, + "grad_norm": 8.969502449035645, + "learning_rate": 1.4655708128696002e-05, + "loss": 0.8433, + "step": 2667 + }, + { + "epoch": 0.8, + "grad_norm": 24.146800994873047, + "learning_rate": 1.465370351809161e-05, + "loss": 1.7039, + "step": 2668 + }, + { + "epoch": 0.8, + "grad_norm": 32.524452209472656, + "learning_rate": 1.4651698907487222e-05, + "loss": 2.2074, + "step": 2669 + }, + { + "epoch": 0.8, + "grad_norm": 17.53997802734375, + "learning_rate": 1.4649694296882832e-05, + "loss": 2.1067, + "step": 2670 + }, + { + "epoch": 0.8, + "grad_norm": 70.99577331542969, + "learning_rate": 1.464768968627844e-05, + "loss": 2.5416, + "step": 2671 + }, + { + "epoch": 0.8, + "grad_norm": 15.582018852233887, + "learning_rate": 1.4645685075674052e-05, + "loss": 2.4462, + "step": 2672 + }, + { + "epoch": 0.8, + "grad_norm": 26.792186737060547, + "learning_rate": 1.464368046506966e-05, + "loss": 1.7403, + "step": 2673 + }, + { + "epoch": 0.8, + "grad_norm": 13.64623737335205, + "learning_rate": 1.464167585446527e-05, + "loss": 2.7516, + "step": 2674 + }, + { + "epoch": 0.8, + "grad_norm": 17.885374069213867, + "learning_rate": 1.463967124386088e-05, + "loss": 2.9638, + "step": 2675 + }, + { + "epoch": 0.8, + "grad_norm": 11.002893447875977, + "learning_rate": 1.4637666633256491e-05, + "loss": 1.5192, + "step": 2676 + }, + { + "epoch": 0.8, + "grad_norm": 18.869712829589844, + "learning_rate": 1.4635662022652101e-05, + "loss": 2.5806, + "step": 2677 + }, + { + "epoch": 0.81, + "grad_norm": 13.818095207214355, + "learning_rate": 1.4633657412047711e-05, + "loss": 2.1488, + "step": 2678 + }, + { + "epoch": 0.81, + "grad_norm": 28.199312210083008, + "learning_rate": 1.4631652801443321e-05, + "loss": 1.8242, + "step": 2679 + }, + { + "epoch": 0.81, + "grad_norm": 19.68671417236328, + "learning_rate": 1.462964819083893e-05, + "loss": 1.5373, + "step": 2680 + }, + { + "epoch": 0.81, + "grad_norm": 36.305686950683594, + "learning_rate": 1.4627643580234541e-05, + "loss": 3.0527, + "step": 2681 + }, + { + "epoch": 0.81, + "grad_norm": 22.186141967773438, + "learning_rate": 1.4625638969630151e-05, + "loss": 3.4701, + "step": 2682 + }, + { + "epoch": 0.81, + "grad_norm": 17.44498634338379, + "learning_rate": 1.462363435902576e-05, + "loss": 2.0777, + "step": 2683 + }, + { + "epoch": 0.81, + "grad_norm": 25.54764747619629, + "learning_rate": 1.4621629748421372e-05, + "loss": 1.9166, + "step": 2684 + }, + { + "epoch": 0.81, + "grad_norm": 15.869314193725586, + "learning_rate": 1.461962513781698e-05, + "loss": 2.0511, + "step": 2685 + }, + { + "epoch": 0.81, + "grad_norm": 16.541519165039062, + "learning_rate": 1.461762052721259e-05, + "loss": 2.2221, + "step": 2686 + }, + { + "epoch": 0.81, + "grad_norm": 12.096132278442383, + "learning_rate": 1.4615615916608198e-05, + "loss": 2.2477, + "step": 2687 + }, + { + "epoch": 0.81, + "grad_norm": 32.8545036315918, + "learning_rate": 1.461361130600381e-05, + "loss": 2.4819, + "step": 2688 + }, + { + "epoch": 0.81, + "grad_norm": 17.078535079956055, + "learning_rate": 1.461160669539942e-05, + "loss": 2.5493, + "step": 2689 + }, + { + "epoch": 0.81, + "grad_norm": 20.856630325317383, + "learning_rate": 1.4609602084795029e-05, + "loss": 2.9245, + "step": 2690 + }, + { + "epoch": 0.81, + "grad_norm": 30.489768981933594, + "learning_rate": 1.460759747419064e-05, + "loss": 2.2884, + "step": 2691 + }, + { + "epoch": 0.81, + "grad_norm": 13.105022430419922, + "learning_rate": 1.4605592863586249e-05, + "loss": 2.3658, + "step": 2692 + }, + { + "epoch": 0.81, + "grad_norm": 14.78238296508789, + "learning_rate": 1.4603588252981859e-05, + "loss": 2.5098, + "step": 2693 + }, + { + "epoch": 0.81, + "grad_norm": 17.489654541015625, + "learning_rate": 1.460158364237747e-05, + "loss": 1.8979, + "step": 2694 + }, + { + "epoch": 0.81, + "grad_norm": 15.65456771850586, + "learning_rate": 1.4599579031773079e-05, + "loss": 2.2142, + "step": 2695 + }, + { + "epoch": 0.81, + "grad_norm": 24.52695655822754, + "learning_rate": 1.459757442116869e-05, + "loss": 2.7726, + "step": 2696 + }, + { + "epoch": 0.81, + "grad_norm": 33.155067443847656, + "learning_rate": 1.4595569810564298e-05, + "loss": 2.2196, + "step": 2697 + }, + { + "epoch": 0.81, + "grad_norm": 9.755550384521484, + "learning_rate": 1.459356519995991e-05, + "loss": 1.7204, + "step": 2698 + }, + { + "epoch": 0.81, + "grad_norm": 19.83466911315918, + "learning_rate": 1.4591560589355518e-05, + "loss": 2.5923, + "step": 2699 + }, + { + "epoch": 0.81, + "grad_norm": 21.58110809326172, + "learning_rate": 1.458955597875113e-05, + "loss": 1.7443, + "step": 2700 + }, + { + "epoch": 0.81, + "grad_norm": 20.373117446899414, + "learning_rate": 1.458755136814674e-05, + "loss": 2.949, + "step": 2701 + }, + { + "epoch": 0.81, + "grad_norm": 12.08033275604248, + "learning_rate": 1.4585546757542348e-05, + "loss": 1.7417, + "step": 2702 + }, + { + "epoch": 0.81, + "grad_norm": 16.984411239624023, + "learning_rate": 1.458354214693796e-05, + "loss": 1.973, + "step": 2703 + }, + { + "epoch": 0.81, + "grad_norm": 24.945363998413086, + "learning_rate": 1.4581537536333568e-05, + "loss": 1.7648, + "step": 2704 + }, + { + "epoch": 0.81, + "grad_norm": 18.574451446533203, + "learning_rate": 1.4579532925729178e-05, + "loss": 1.326, + "step": 2705 + }, + { + "epoch": 0.81, + "grad_norm": 32.63951873779297, + "learning_rate": 1.457752831512479e-05, + "loss": 2.0216, + "step": 2706 + }, + { + "epoch": 0.81, + "grad_norm": 20.653427124023438, + "learning_rate": 1.4575523704520398e-05, + "loss": 2.0913, + "step": 2707 + }, + { + "epoch": 0.81, + "grad_norm": 41.5728645324707, + "learning_rate": 1.4573519093916009e-05, + "loss": 3.4579, + "step": 2708 + }, + { + "epoch": 0.81, + "grad_norm": 12.26309585571289, + "learning_rate": 1.4571514483311617e-05, + "loss": 1.4278, + "step": 2709 + }, + { + "epoch": 0.81, + "grad_norm": 17.044031143188477, + "learning_rate": 1.4569509872707229e-05, + "loss": 1.7787, + "step": 2710 + }, + { + "epoch": 0.82, + "grad_norm": 22.376020431518555, + "learning_rate": 1.4567505262102837e-05, + "loss": 2.2045, + "step": 2711 + }, + { + "epoch": 0.82, + "grad_norm": 15.766566276550293, + "learning_rate": 1.4565500651498447e-05, + "loss": 2.087, + "step": 2712 + }, + { + "epoch": 0.82, + "grad_norm": 10.13654899597168, + "learning_rate": 1.4563496040894059e-05, + "loss": 2.7784, + "step": 2713 + }, + { + "epoch": 0.82, + "grad_norm": 12.07814884185791, + "learning_rate": 1.4561491430289667e-05, + "loss": 2.2884, + "step": 2714 + }, + { + "epoch": 0.82, + "grad_norm": 15.864092826843262, + "learning_rate": 1.4559486819685277e-05, + "loss": 2.1431, + "step": 2715 + }, + { + "epoch": 0.82, + "grad_norm": 18.540136337280273, + "learning_rate": 1.4557482209080886e-05, + "loss": 2.4553, + "step": 2716 + }, + { + "epoch": 0.82, + "grad_norm": 9.907877922058105, + "learning_rate": 1.4555477598476498e-05, + "loss": 1.6814, + "step": 2717 + }, + { + "epoch": 0.82, + "grad_norm": 9.542145729064941, + "learning_rate": 1.4553472987872106e-05, + "loss": 1.1338, + "step": 2718 + }, + { + "epoch": 0.82, + "grad_norm": 18.332304000854492, + "learning_rate": 1.4551468377267716e-05, + "loss": 1.7549, + "step": 2719 + }, + { + "epoch": 0.82, + "grad_norm": 9.094773292541504, + "learning_rate": 1.4549463766663328e-05, + "loss": 1.1251, + "step": 2720 + }, + { + "epoch": 0.82, + "grad_norm": 15.948875427246094, + "learning_rate": 1.4547459156058936e-05, + "loss": 1.5578, + "step": 2721 + }, + { + "epoch": 0.82, + "grad_norm": 13.107511520385742, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.8671, + "step": 2722 + }, + { + "epoch": 0.82, + "grad_norm": 17.999923706054688, + "learning_rate": 1.4543449934850156e-05, + "loss": 2.1096, + "step": 2723 + }, + { + "epoch": 0.82, + "grad_norm": 22.923246383666992, + "learning_rate": 1.4541445324245767e-05, + "loss": 2.1625, + "step": 2724 + }, + { + "epoch": 0.82, + "grad_norm": 12.2493314743042, + "learning_rate": 1.4539440713641377e-05, + "loss": 2.9069, + "step": 2725 + }, + { + "epoch": 0.82, + "grad_norm": 20.940412521362305, + "learning_rate": 1.4537436103036987e-05, + "loss": 2.6205, + "step": 2726 + }, + { + "epoch": 0.82, + "grad_norm": 28.031394958496094, + "learning_rate": 1.4535431492432597e-05, + "loss": 2.167, + "step": 2727 + }, + { + "epoch": 0.82, + "grad_norm": 12.496899604797363, + "learning_rate": 1.4533426881828205e-05, + "loss": 2.8417, + "step": 2728 + }, + { + "epoch": 0.82, + "grad_norm": 15.856078147888184, + "learning_rate": 1.4531422271223817e-05, + "loss": 2.5802, + "step": 2729 + }, + { + "epoch": 0.82, + "grad_norm": 19.049468994140625, + "learning_rate": 1.4529417660619425e-05, + "loss": 2.2789, + "step": 2730 + }, + { + "epoch": 0.82, + "grad_norm": 16.88075828552246, + "learning_rate": 1.4527413050015035e-05, + "loss": 1.6496, + "step": 2731 + }, + { + "epoch": 0.82, + "grad_norm": 23.054603576660156, + "learning_rate": 1.4525408439410647e-05, + "loss": 1.8223, + "step": 2732 + }, + { + "epoch": 0.82, + "grad_norm": 27.310646057128906, + "learning_rate": 1.4523403828806256e-05, + "loss": 2.5239, + "step": 2733 + }, + { + "epoch": 0.82, + "grad_norm": 20.423734664916992, + "learning_rate": 1.4521399218201866e-05, + "loss": 2.2311, + "step": 2734 + }, + { + "epoch": 0.82, + "grad_norm": 13.929975509643555, + "learning_rate": 1.4519394607597474e-05, + "loss": 2.5546, + "step": 2735 + }, + { + "epoch": 0.82, + "grad_norm": 12.425309181213379, + "learning_rate": 1.4517389996993086e-05, + "loss": 1.4081, + "step": 2736 + }, + { + "epoch": 0.82, + "grad_norm": 11.25351333618164, + "learning_rate": 1.4515385386388696e-05, + "loss": 1.7953, + "step": 2737 + }, + { + "epoch": 0.82, + "grad_norm": 19.53854751586914, + "learning_rate": 1.4513380775784304e-05, + "loss": 1.207, + "step": 2738 + }, + { + "epoch": 0.82, + "grad_norm": 22.41812515258789, + "learning_rate": 1.4511376165179916e-05, + "loss": 2.4787, + "step": 2739 + }, + { + "epoch": 0.82, + "grad_norm": 21.234926223754883, + "learning_rate": 1.4509371554575524e-05, + "loss": 2.2675, + "step": 2740 + }, + { + "epoch": 0.82, + "grad_norm": 19.57403564453125, + "learning_rate": 1.4507366943971135e-05, + "loss": 2.3916, + "step": 2741 + }, + { + "epoch": 0.82, + "grad_norm": 25.845134735107422, + "learning_rate": 1.4505362333366743e-05, + "loss": 2.4502, + "step": 2742 + }, + { + "epoch": 0.82, + "grad_norm": 42.33112716674805, + "learning_rate": 1.4503357722762355e-05, + "loss": 2.0345, + "step": 2743 + }, + { + "epoch": 0.83, + "grad_norm": 15.049954414367676, + "learning_rate": 1.4501353112157965e-05, + "loss": 2.8193, + "step": 2744 + }, + { + "epoch": 0.83, + "grad_norm": 25.87766456604004, + "learning_rate": 1.4499348501553573e-05, + "loss": 2.2518, + "step": 2745 + }, + { + "epoch": 0.83, + "grad_norm": 21.152681350708008, + "learning_rate": 1.4497343890949185e-05, + "loss": 2.3211, + "step": 2746 + }, + { + "epoch": 0.83, + "grad_norm": 36.79154586791992, + "learning_rate": 1.4495339280344793e-05, + "loss": 2.1298, + "step": 2747 + }, + { + "epoch": 0.83, + "grad_norm": 9.398030281066895, + "learning_rate": 1.4493334669740403e-05, + "loss": 1.8969, + "step": 2748 + }, + { + "epoch": 0.83, + "grad_norm": 20.946956634521484, + "learning_rate": 1.4491330059136015e-05, + "loss": 2.3464, + "step": 2749 + }, + { + "epoch": 0.83, + "grad_norm": 9.130115509033203, + "learning_rate": 1.4489325448531624e-05, + "loss": 1.7633, + "step": 2750 + }, + { + "epoch": 0.83, + "grad_norm": 14.199234962463379, + "learning_rate": 1.4487320837927234e-05, + "loss": 1.8352, + "step": 2751 + }, + { + "epoch": 0.83, + "grad_norm": 8.78415298461914, + "learning_rate": 1.4485316227322844e-05, + "loss": 1.6194, + "step": 2752 + }, + { + "epoch": 0.83, + "grad_norm": 32.84090805053711, + "learning_rate": 1.4483311616718454e-05, + "loss": 1.7946, + "step": 2753 + }, + { + "epoch": 0.83, + "grad_norm": 18.42530632019043, + "learning_rate": 1.4481307006114062e-05, + "loss": 2.4003, + "step": 2754 + }, + { + "epoch": 0.83, + "grad_norm": 10.56641960144043, + "learning_rate": 1.4479302395509674e-05, + "loss": 1.698, + "step": 2755 + }, + { + "epoch": 0.83, + "grad_norm": 13.460083961486816, + "learning_rate": 1.4477297784905284e-05, + "loss": 2.7324, + "step": 2756 + }, + { + "epoch": 0.83, + "grad_norm": 20.335878372192383, + "learning_rate": 1.4475293174300893e-05, + "loss": 1.5947, + "step": 2757 + }, + { + "epoch": 0.83, + "grad_norm": 44.001834869384766, + "learning_rate": 1.4473288563696504e-05, + "loss": 1.8942, + "step": 2758 + }, + { + "epoch": 0.83, + "grad_norm": 13.544523239135742, + "learning_rate": 1.4471283953092113e-05, + "loss": 2.016, + "step": 2759 + }, + { + "epoch": 0.83, + "grad_norm": 30.979524612426758, + "learning_rate": 1.4469279342487723e-05, + "loss": 3.1479, + "step": 2760 + }, + { + "epoch": 0.83, + "eval_loss": 0.2657427191734314, + "eval_runtime": 43.5175, + "eval_samples_per_second": 33.986, + "eval_steps_per_second": 33.986, + "step": 2760 + }, + { + "epoch": 0.83, + "grad_norm": 23.16549301147461, + "learning_rate": 1.4467274731883331e-05, + "loss": 2.2599, + "step": 2761 + }, + { + "epoch": 0.83, + "grad_norm": 18.973628997802734, + "learning_rate": 1.4465270121278943e-05, + "loss": 2.1258, + "step": 2762 + }, + { + "epoch": 0.83, + "grad_norm": 26.273101806640625, + "learning_rate": 1.4463265510674553e-05, + "loss": 3.4268, + "step": 2763 + }, + { + "epoch": 0.83, + "grad_norm": 14.230957984924316, + "learning_rate": 1.4461260900070161e-05, + "loss": 1.4143, + "step": 2764 + }, + { + "epoch": 0.83, + "grad_norm": 14.468798637390137, + "learning_rate": 1.4459256289465773e-05, + "loss": 2.3477, + "step": 2765 + }, + { + "epoch": 0.83, + "grad_norm": 16.977825164794922, + "learning_rate": 1.4457251678861382e-05, + "loss": 2.3843, + "step": 2766 + }, + { + "epoch": 0.83, + "grad_norm": 14.977238655090332, + "learning_rate": 1.4455247068256992e-05, + "loss": 2.0197, + "step": 2767 + }, + { + "epoch": 0.83, + "grad_norm": 14.926851272583008, + "learning_rate": 1.4453242457652603e-05, + "loss": 2.9558, + "step": 2768 + }, + { + "epoch": 0.83, + "grad_norm": 23.74448013305664, + "learning_rate": 1.4451237847048212e-05, + "loss": 2.1472, + "step": 2769 + }, + { + "epoch": 0.83, + "grad_norm": 41.35685729980469, + "learning_rate": 1.4449233236443822e-05, + "loss": 2.7331, + "step": 2770 + }, + { + "epoch": 0.83, + "grad_norm": 13.470633506774902, + "learning_rate": 1.4447228625839432e-05, + "loss": 1.9928, + "step": 2771 + }, + { + "epoch": 0.83, + "grad_norm": 12.274412155151367, + "learning_rate": 1.4445224015235042e-05, + "loss": 1.7747, + "step": 2772 + }, + { + "epoch": 0.83, + "grad_norm": 12.649324417114258, + "learning_rate": 1.444321940463065e-05, + "loss": 1.599, + "step": 2773 + }, + { + "epoch": 0.83, + "grad_norm": 13.914393424987793, + "learning_rate": 1.4441214794026262e-05, + "loss": 1.7285, + "step": 2774 + }, + { + "epoch": 0.83, + "grad_norm": 9.780823707580566, + "learning_rate": 1.4439210183421872e-05, + "loss": 1.3613, + "step": 2775 + }, + { + "epoch": 0.83, + "grad_norm": 13.444343566894531, + "learning_rate": 1.443720557281748e-05, + "loss": 1.4692, + "step": 2776 + }, + { + "epoch": 0.83, + "grad_norm": 11.597440719604492, + "learning_rate": 1.4435200962213093e-05, + "loss": 2.5432, + "step": 2777 + }, + { + "epoch": 0.84, + "grad_norm": 66.30996704101562, + "learning_rate": 1.4433196351608701e-05, + "loss": 1.9169, + "step": 2778 + }, + { + "epoch": 0.84, + "grad_norm": 42.77153778076172, + "learning_rate": 1.4431191741004311e-05, + "loss": 2.1056, + "step": 2779 + }, + { + "epoch": 0.84, + "grad_norm": 17.99850082397461, + "learning_rate": 1.4429187130399923e-05, + "loss": 1.6885, + "step": 2780 + }, + { + "epoch": 0.84, + "grad_norm": 11.443147659301758, + "learning_rate": 1.4427182519795531e-05, + "loss": 2.2285, + "step": 2781 + }, + { + "epoch": 0.84, + "grad_norm": 18.145082473754883, + "learning_rate": 1.4425177909191141e-05, + "loss": 2.14, + "step": 2782 + }, + { + "epoch": 0.84, + "grad_norm": 12.900897026062012, + "learning_rate": 1.442317329858675e-05, + "loss": 1.8419, + "step": 2783 + }, + { + "epoch": 0.84, + "grad_norm": 12.71401596069336, + "learning_rate": 1.4421168687982361e-05, + "loss": 1.6758, + "step": 2784 + }, + { + "epoch": 0.84, + "grad_norm": 10.644526481628418, + "learning_rate": 1.441916407737797e-05, + "loss": 1.5368, + "step": 2785 + }, + { + "epoch": 0.84, + "grad_norm": 11.878511428833008, + "learning_rate": 1.441715946677358e-05, + "loss": 2.8906, + "step": 2786 + }, + { + "epoch": 0.84, + "grad_norm": 23.1770076751709, + "learning_rate": 1.4415154856169192e-05, + "loss": 2.9468, + "step": 2787 + }, + { + "epoch": 0.84, + "grad_norm": 13.855196952819824, + "learning_rate": 1.44131502455648e-05, + "loss": 2.0071, + "step": 2788 + }, + { + "epoch": 0.84, + "grad_norm": 10.715313911437988, + "learning_rate": 1.441114563496041e-05, + "loss": 1.3851, + "step": 2789 + }, + { + "epoch": 0.84, + "grad_norm": 11.7968168258667, + "learning_rate": 1.4409141024356019e-05, + "loss": 1.7609, + "step": 2790 + }, + { + "epoch": 0.84, + "grad_norm": 15.877503395080566, + "learning_rate": 1.440713641375163e-05, + "loss": 3.124, + "step": 2791 + }, + { + "epoch": 0.84, + "grad_norm": 19.398963928222656, + "learning_rate": 1.440513180314724e-05, + "loss": 2.2776, + "step": 2792 + }, + { + "epoch": 0.84, + "grad_norm": 13.756909370422363, + "learning_rate": 1.4403127192542849e-05, + "loss": 2.429, + "step": 2793 + }, + { + "epoch": 0.84, + "grad_norm": 8.20438289642334, + "learning_rate": 1.440112258193846e-05, + "loss": 1.3021, + "step": 2794 + }, + { + "epoch": 0.84, + "grad_norm": 35.246524810791016, + "learning_rate": 1.4399117971334069e-05, + "loss": 3.4113, + "step": 2795 + }, + { + "epoch": 0.84, + "grad_norm": 15.750861167907715, + "learning_rate": 1.4397113360729679e-05, + "loss": 2.1455, + "step": 2796 + }, + { + "epoch": 0.84, + "grad_norm": 12.513425827026367, + "learning_rate": 1.4395108750125289e-05, + "loss": 2.1875, + "step": 2797 + }, + { + "epoch": 0.84, + "grad_norm": 20.23858070373535, + "learning_rate": 1.43931041395209e-05, + "loss": 2.686, + "step": 2798 + }, + { + "epoch": 0.84, + "grad_norm": 14.214842796325684, + "learning_rate": 1.439109952891651e-05, + "loss": 1.5095, + "step": 2799 + }, + { + "epoch": 0.84, + "grad_norm": 13.132567405700684, + "learning_rate": 1.438909491831212e-05, + "loss": 1.4686, + "step": 2800 + }, + { + "epoch": 0.84, + "grad_norm": 12.941914558410645, + "learning_rate": 1.438709030770773e-05, + "loss": 1.6349, + "step": 2801 + }, + { + "epoch": 0.84, + "grad_norm": 23.00050926208496, + "learning_rate": 1.4385085697103338e-05, + "loss": 2.3867, + "step": 2802 + }, + { + "epoch": 0.84, + "grad_norm": 27.743297576904297, + "learning_rate": 1.438308108649895e-05, + "loss": 2.8904, + "step": 2803 + }, + { + "epoch": 0.84, + "grad_norm": 28.265438079833984, + "learning_rate": 1.4381076475894558e-05, + "loss": 2.0912, + "step": 2804 + }, + { + "epoch": 0.84, + "grad_norm": 42.51979064941406, + "learning_rate": 1.4379071865290168e-05, + "loss": 2.9312, + "step": 2805 + }, + { + "epoch": 0.84, + "grad_norm": 13.225244522094727, + "learning_rate": 1.437706725468578e-05, + "loss": 2.3977, + "step": 2806 + }, + { + "epoch": 0.84, + "grad_norm": 21.915138244628906, + "learning_rate": 1.4375062644081388e-05, + "loss": 2.3075, + "step": 2807 + }, + { + "epoch": 0.84, + "grad_norm": 27.076114654541016, + "learning_rate": 1.4373058033476998e-05, + "loss": 2.9827, + "step": 2808 + }, + { + "epoch": 0.84, + "grad_norm": 29.377344131469727, + "learning_rate": 1.4371053422872607e-05, + "loss": 1.7643, + "step": 2809 + }, + { + "epoch": 0.84, + "grad_norm": 16.541988372802734, + "learning_rate": 1.4369048812268219e-05, + "loss": 2.0169, + "step": 2810 + }, + { + "epoch": 0.85, + "grad_norm": 11.847968101501465, + "learning_rate": 1.4367044201663829e-05, + "loss": 1.9688, + "step": 2811 + }, + { + "epoch": 0.85, + "grad_norm": 19.333833694458008, + "learning_rate": 1.4365039591059437e-05, + "loss": 1.6196, + "step": 2812 + }, + { + "epoch": 0.85, + "grad_norm": 11.35824966430664, + "learning_rate": 1.4363034980455049e-05, + "loss": 2.2096, + "step": 2813 + }, + { + "epoch": 0.85, + "grad_norm": 21.909666061401367, + "learning_rate": 1.4361030369850657e-05, + "loss": 2.0165, + "step": 2814 + }, + { + "epoch": 0.85, + "grad_norm": 12.537689208984375, + "learning_rate": 1.4359025759246267e-05, + "loss": 1.4654, + "step": 2815 + }, + { + "epoch": 0.85, + "grad_norm": 16.89116096496582, + "learning_rate": 1.4357021148641876e-05, + "loss": 2.2603, + "step": 2816 + }, + { + "epoch": 0.85, + "grad_norm": 29.52833366394043, + "learning_rate": 1.4355016538037487e-05, + "loss": 2.1104, + "step": 2817 + }, + { + "epoch": 0.85, + "grad_norm": 31.932817459106445, + "learning_rate": 1.4353011927433097e-05, + "loss": 3.2788, + "step": 2818 + }, + { + "epoch": 0.85, + "grad_norm": 13.009839057922363, + "learning_rate": 1.4351007316828708e-05, + "loss": 1.8396, + "step": 2819 + }, + { + "epoch": 0.85, + "grad_norm": 20.032115936279297, + "learning_rate": 1.4349002706224318e-05, + "loss": 2.5377, + "step": 2820 + }, + { + "epoch": 0.85, + "grad_norm": 30.521154403686523, + "learning_rate": 1.4346998095619926e-05, + "loss": 2.9662, + "step": 2821 + }, + { + "epoch": 0.85, + "grad_norm": 25.22045135498047, + "learning_rate": 1.4344993485015538e-05, + "loss": 2.4709, + "step": 2822 + }, + { + "epoch": 0.85, + "grad_norm": 36.31182861328125, + "learning_rate": 1.4342988874411148e-05, + "loss": 2.1305, + "step": 2823 + }, + { + "epoch": 0.85, + "grad_norm": 10.13243293762207, + "learning_rate": 1.4340984263806756e-05, + "loss": 1.2709, + "step": 2824 + }, + { + "epoch": 0.85, + "grad_norm": 13.83365535736084, + "learning_rate": 1.4338979653202368e-05, + "loss": 2.4132, + "step": 2825 + }, + { + "epoch": 0.85, + "grad_norm": 13.130005836486816, + "learning_rate": 1.4336975042597976e-05, + "loss": 1.4074, + "step": 2826 + }, + { + "epoch": 0.85, + "grad_norm": 20.482786178588867, + "learning_rate": 1.4334970431993587e-05, + "loss": 2.3476, + "step": 2827 + }, + { + "epoch": 0.85, + "grad_norm": 9.617236137390137, + "learning_rate": 1.4332965821389195e-05, + "loss": 1.595, + "step": 2828 + }, + { + "epoch": 0.85, + "grad_norm": 11.414641380310059, + "learning_rate": 1.4330961210784807e-05, + "loss": 1.5817, + "step": 2829 + }, + { + "epoch": 0.85, + "grad_norm": 16.799848556518555, + "learning_rate": 1.4328956600180417e-05, + "loss": 2.1956, + "step": 2830 + }, + { + "epoch": 0.85, + "grad_norm": 10.216323852539062, + "learning_rate": 1.4326951989576025e-05, + "loss": 1.4594, + "step": 2831 + }, + { + "epoch": 0.85, + "grad_norm": 28.910261154174805, + "learning_rate": 1.4324947378971637e-05, + "loss": 2.556, + "step": 2832 + }, + { + "epoch": 0.85, + "grad_norm": 26.716176986694336, + "learning_rate": 1.4322942768367245e-05, + "loss": 2.3864, + "step": 2833 + }, + { + "epoch": 0.85, + "grad_norm": 16.635345458984375, + "learning_rate": 1.4320938157762855e-05, + "loss": 2.3808, + "step": 2834 + }, + { + "epoch": 0.85, + "grad_norm": 21.195531845092773, + "learning_rate": 1.4318933547158467e-05, + "loss": 1.7784, + "step": 2835 + }, + { + "epoch": 0.85, + "grad_norm": 16.338502883911133, + "learning_rate": 1.4316928936554076e-05, + "loss": 2.1817, + "step": 2836 + }, + { + "epoch": 0.85, + "grad_norm": 10.698309898376465, + "learning_rate": 1.4314924325949686e-05, + "loss": 1.8357, + "step": 2837 + }, + { + "epoch": 0.85, + "grad_norm": 22.48438262939453, + "learning_rate": 1.4312919715345294e-05, + "loss": 1.9495, + "step": 2838 + }, + { + "epoch": 0.85, + "grad_norm": 72.1753921508789, + "learning_rate": 1.4310915104740906e-05, + "loss": 1.8956, + "step": 2839 + }, + { + "epoch": 0.85, + "grad_norm": 14.804839134216309, + "learning_rate": 1.4308910494136514e-05, + "loss": 2.0498, + "step": 2840 + }, + { + "epoch": 0.85, + "grad_norm": 46.27671432495117, + "learning_rate": 1.4306905883532124e-05, + "loss": 2.3295, + "step": 2841 + }, + { + "epoch": 0.85, + "grad_norm": 20.91038703918457, + "learning_rate": 1.4304901272927736e-05, + "loss": 1.7985, + "step": 2842 + }, + { + "epoch": 0.85, + "grad_norm": 15.907576560974121, + "learning_rate": 1.4302896662323345e-05, + "loss": 1.945, + "step": 2843 + }, + { + "epoch": 0.86, + "grad_norm": 11.249744415283203, + "learning_rate": 1.4300892051718955e-05, + "loss": 2.0269, + "step": 2844 + }, + { + "epoch": 0.86, + "grad_norm": 31.293607711791992, + "learning_rate": 1.4298887441114565e-05, + "loss": 2.0211, + "step": 2845 + }, + { + "epoch": 0.86, + "grad_norm": 15.472600936889648, + "learning_rate": 1.4296882830510175e-05, + "loss": 2.2907, + "step": 2846 + }, + { + "epoch": 0.86, + "grad_norm": 15.497085571289062, + "learning_rate": 1.4294878219905783e-05, + "loss": 2.4152, + "step": 2847 + }, + { + "epoch": 0.86, + "grad_norm": 16.131122589111328, + "learning_rate": 1.4292873609301395e-05, + "loss": 1.7781, + "step": 2848 + }, + { + "epoch": 0.86, + "grad_norm": 20.631099700927734, + "learning_rate": 1.4290868998697005e-05, + "loss": 2.6782, + "step": 2849 + }, + { + "epoch": 0.86, + "grad_norm": 28.31133460998535, + "learning_rate": 1.4288864388092613e-05, + "loss": 2.7064, + "step": 2850 + }, + { + "epoch": 0.86, + "grad_norm": 22.287992477416992, + "learning_rate": 1.4286859777488225e-05, + "loss": 1.5085, + "step": 2851 + }, + { + "epoch": 0.86, + "grad_norm": 14.099394798278809, + "learning_rate": 1.4284855166883834e-05, + "loss": 2.1791, + "step": 2852 + }, + { + "epoch": 0.86, + "grad_norm": 23.74030876159668, + "learning_rate": 1.4282850556279444e-05, + "loss": 1.5725, + "step": 2853 + }, + { + "epoch": 0.86, + "grad_norm": 18.97103500366211, + "learning_rate": 1.4280845945675055e-05, + "loss": 2.8069, + "step": 2854 + }, + { + "epoch": 0.86, + "grad_norm": 15.047412872314453, + "learning_rate": 1.4278841335070664e-05, + "loss": 2.3392, + "step": 2855 + }, + { + "epoch": 0.86, + "grad_norm": 11.085140228271484, + "learning_rate": 1.4276836724466274e-05, + "loss": 1.8397, + "step": 2856 + }, + { + "epoch": 0.86, + "grad_norm": 25.895097732543945, + "learning_rate": 1.4274832113861882e-05, + "loss": 2.0397, + "step": 2857 + }, + { + "epoch": 0.86, + "grad_norm": 15.016706466674805, + "learning_rate": 1.4272827503257494e-05, + "loss": 2.7215, + "step": 2858 + }, + { + "epoch": 0.86, + "grad_norm": 30.092390060424805, + "learning_rate": 1.4270822892653102e-05, + "loss": 2.8657, + "step": 2859 + }, + { + "epoch": 0.86, + "grad_norm": 16.425628662109375, + "learning_rate": 1.4268818282048713e-05, + "loss": 1.7927, + "step": 2860 + }, + { + "epoch": 0.86, + "grad_norm": 39.504478454589844, + "learning_rate": 1.4266813671444324e-05, + "loss": 2.4321, + "step": 2861 + }, + { + "epoch": 0.86, + "grad_norm": 15.988898277282715, + "learning_rate": 1.4264809060839933e-05, + "loss": 3.2789, + "step": 2862 + }, + { + "epoch": 0.86, + "grad_norm": 25.57854461669922, + "learning_rate": 1.4262804450235543e-05, + "loss": 2.2735, + "step": 2863 + }, + { + "epoch": 0.86, + "grad_norm": 24.288047790527344, + "learning_rate": 1.4260799839631151e-05, + "loss": 2.4726, + "step": 2864 + }, + { + "epoch": 0.86, + "grad_norm": 9.727755546569824, + "learning_rate": 1.4258795229026763e-05, + "loss": 1.2214, + "step": 2865 + }, + { + "epoch": 0.86, + "grad_norm": 11.615317344665527, + "learning_rate": 1.4256790618422373e-05, + "loss": 1.5758, + "step": 2866 + }, + { + "epoch": 0.86, + "grad_norm": 18.63646697998047, + "learning_rate": 1.4254786007817981e-05, + "loss": 3.0492, + "step": 2867 + }, + { + "epoch": 0.86, + "grad_norm": 18.21438217163086, + "learning_rate": 1.4252781397213593e-05, + "loss": 2.2775, + "step": 2868 + }, + { + "epoch": 0.86, + "grad_norm": 11.930097579956055, + "learning_rate": 1.4250776786609202e-05, + "loss": 1.8362, + "step": 2869 + }, + { + "epoch": 0.86, + "grad_norm": 18.075613021850586, + "learning_rate": 1.4248772176004812e-05, + "loss": 3.2771, + "step": 2870 + }, + { + "epoch": 0.86, + "grad_norm": 17.12186050415039, + "learning_rate": 1.4246767565400422e-05, + "loss": 1.9388, + "step": 2871 + }, + { + "epoch": 0.86, + "grad_norm": 20.949424743652344, + "learning_rate": 1.4244762954796032e-05, + "loss": 2.5683, + "step": 2872 + }, + { + "epoch": 0.86, + "grad_norm": 36.75067901611328, + "learning_rate": 1.4242758344191644e-05, + "loss": 1.8602, + "step": 2873 + }, + { + "epoch": 0.86, + "grad_norm": 12.314196586608887, + "learning_rate": 1.4240753733587252e-05, + "loss": 1.7533, + "step": 2874 + }, + { + "epoch": 0.86, + "grad_norm": 9.40181827545166, + "learning_rate": 1.4238749122982862e-05, + "loss": 1.0538, + "step": 2875 + }, + { + "epoch": 0.86, + "grad_norm": 18.72924041748047, + "learning_rate": 1.423674451237847e-05, + "loss": 1.741, + "step": 2876 + }, + { + "epoch": 0.87, + "grad_norm": 86.12344360351562, + "learning_rate": 1.4234739901774082e-05, + "loss": 1.8543, + "step": 2877 + }, + { + "epoch": 0.87, + "grad_norm": 11.64871883392334, + "learning_rate": 1.4232735291169692e-05, + "loss": 1.8377, + "step": 2878 + }, + { + "epoch": 0.87, + "grad_norm": 20.858604431152344, + "learning_rate": 1.42307306805653e-05, + "loss": 2.407, + "step": 2879 + }, + { + "epoch": 0.87, + "grad_norm": 16.45133399963379, + "learning_rate": 1.4228726069960913e-05, + "loss": 2.1642, + "step": 2880 + }, + { + "epoch": 0.87, + "eval_loss": 0.27140170335769653, + "eval_runtime": 43.4249, + "eval_samples_per_second": 34.059, + "eval_steps_per_second": 34.059, + "step": 2880 + }, + { + "epoch": 0.87, + "grad_norm": 16.983951568603516, + "learning_rate": 1.4226721459356521e-05, + "loss": 2.1911, + "step": 2881 + }, + { + "epoch": 0.87, + "grad_norm": 12.091941833496094, + "learning_rate": 1.4224716848752131e-05, + "loss": 1.6567, + "step": 2882 + }, + { + "epoch": 0.87, + "grad_norm": 22.655092239379883, + "learning_rate": 1.422271223814774e-05, + "loss": 1.7178, + "step": 2883 + }, + { + "epoch": 0.87, + "grad_norm": 13.33409595489502, + "learning_rate": 1.4220707627543351e-05, + "loss": 1.42, + "step": 2884 + }, + { + "epoch": 0.87, + "grad_norm": 41.79966735839844, + "learning_rate": 1.4218703016938961e-05, + "loss": 2.5635, + "step": 2885 + }, + { + "epoch": 0.87, + "grad_norm": 9.73848819732666, + "learning_rate": 1.421669840633457e-05, + "loss": 1.5657, + "step": 2886 + }, + { + "epoch": 0.87, + "grad_norm": 13.572857856750488, + "learning_rate": 1.4214693795730181e-05, + "loss": 1.8109, + "step": 2887 + }, + { + "epoch": 0.87, + "grad_norm": 19.471956253051758, + "learning_rate": 1.421268918512579e-05, + "loss": 1.7335, + "step": 2888 + }, + { + "epoch": 0.87, + "grad_norm": 16.479246139526367, + "learning_rate": 1.42106845745214e-05, + "loss": 2.1827, + "step": 2889 + }, + { + "epoch": 0.87, + "grad_norm": 42.60647964477539, + "learning_rate": 1.420867996391701e-05, + "loss": 4.2229, + "step": 2890 + }, + { + "epoch": 0.87, + "grad_norm": 29.324525833129883, + "learning_rate": 1.420667535331262e-05, + "loss": 2.2616, + "step": 2891 + }, + { + "epoch": 0.87, + "grad_norm": 17.365251541137695, + "learning_rate": 1.420467074270823e-05, + "loss": 1.9783, + "step": 2892 + }, + { + "epoch": 0.87, + "grad_norm": 14.282939910888672, + "learning_rate": 1.420266613210384e-05, + "loss": 1.9463, + "step": 2893 + }, + { + "epoch": 0.87, + "grad_norm": 18.807802200317383, + "learning_rate": 1.420066152149945e-05, + "loss": 1.9243, + "step": 2894 + }, + { + "epoch": 0.87, + "grad_norm": 12.65291976928711, + "learning_rate": 1.4198656910895059e-05, + "loss": 1.5824, + "step": 2895 + }, + { + "epoch": 0.87, + "grad_norm": 24.225215911865234, + "learning_rate": 1.419665230029067e-05, + "loss": 1.4894, + "step": 2896 + }, + { + "epoch": 0.87, + "grad_norm": 19.319774627685547, + "learning_rate": 1.419464768968628e-05, + "loss": 2.8796, + "step": 2897 + }, + { + "epoch": 0.87, + "grad_norm": 14.177779197692871, + "learning_rate": 1.4192643079081889e-05, + "loss": 1.7554, + "step": 2898 + }, + { + "epoch": 0.87, + "grad_norm": 17.083988189697266, + "learning_rate": 1.41906384684775e-05, + "loss": 2.0681, + "step": 2899 + }, + { + "epoch": 0.87, + "grad_norm": 40.30973434448242, + "learning_rate": 1.418863385787311e-05, + "loss": 2.4183, + "step": 2900 + }, + { + "epoch": 0.87, + "grad_norm": 9.512898445129395, + "learning_rate": 1.418662924726872e-05, + "loss": 1.5707, + "step": 2901 + }, + { + "epoch": 0.87, + "grad_norm": 52.70304870605469, + "learning_rate": 1.4184624636664328e-05, + "loss": 2.9803, + "step": 2902 + }, + { + "epoch": 0.87, + "grad_norm": 16.734039306640625, + "learning_rate": 1.418262002605994e-05, + "loss": 2.037, + "step": 2903 + }, + { + "epoch": 0.87, + "grad_norm": 19.740087509155273, + "learning_rate": 1.418061541545555e-05, + "loss": 1.8485, + "step": 2904 + }, + { + "epoch": 0.87, + "grad_norm": 14.386117935180664, + "learning_rate": 1.4178610804851158e-05, + "loss": 1.0815, + "step": 2905 + }, + { + "epoch": 0.87, + "grad_norm": 16.817588806152344, + "learning_rate": 1.417660619424677e-05, + "loss": 1.6195, + "step": 2906 + }, + { + "epoch": 0.87, + "grad_norm": 27.766712188720703, + "learning_rate": 1.4174601583642378e-05, + "loss": 1.9603, + "step": 2907 + }, + { + "epoch": 0.87, + "grad_norm": 33.24577713012695, + "learning_rate": 1.4172596973037988e-05, + "loss": 1.6909, + "step": 2908 + }, + { + "epoch": 0.87, + "grad_norm": 11.028932571411133, + "learning_rate": 1.41705923624336e-05, + "loss": 1.3208, + "step": 2909 + }, + { + "epoch": 0.87, + "grad_norm": 12.61074447631836, + "learning_rate": 1.4168587751829208e-05, + "loss": 1.9588, + "step": 2910 + }, + { + "epoch": 0.88, + "grad_norm": 11.678028106689453, + "learning_rate": 1.4166583141224818e-05, + "loss": 0.8482, + "step": 2911 + }, + { + "epoch": 0.88, + "grad_norm": 12.001538276672363, + "learning_rate": 1.4164578530620427e-05, + "loss": 1.9327, + "step": 2912 + }, + { + "epoch": 0.88, + "grad_norm": 16.76682472229004, + "learning_rate": 1.4162573920016039e-05, + "loss": 2.0711, + "step": 2913 + }, + { + "epoch": 0.88, + "grad_norm": 27.799612045288086, + "learning_rate": 1.4160569309411647e-05, + "loss": 2.1891, + "step": 2914 + }, + { + "epoch": 0.88, + "grad_norm": 24.618627548217773, + "learning_rate": 1.4158564698807257e-05, + "loss": 1.8592, + "step": 2915 + }, + { + "epoch": 0.88, + "grad_norm": 24.141916275024414, + "learning_rate": 1.4156560088202869e-05, + "loss": 2.2455, + "step": 2916 + }, + { + "epoch": 0.88, + "grad_norm": 10.506229400634766, + "learning_rate": 1.4154555477598477e-05, + "loss": 1.8823, + "step": 2917 + }, + { + "epoch": 0.88, + "grad_norm": 14.979269981384277, + "learning_rate": 1.4152550866994087e-05, + "loss": 2.0007, + "step": 2918 + }, + { + "epoch": 0.88, + "grad_norm": 34.168128967285156, + "learning_rate": 1.4150546256389697e-05, + "loss": 2.2599, + "step": 2919 + }, + { + "epoch": 0.88, + "grad_norm": 17.805347442626953, + "learning_rate": 1.4148541645785307e-05, + "loss": 2.8172, + "step": 2920 + }, + { + "epoch": 0.88, + "grad_norm": 18.875598907470703, + "learning_rate": 1.4146537035180918e-05, + "loss": 1.7194, + "step": 2921 + }, + { + "epoch": 0.88, + "grad_norm": 44.858428955078125, + "learning_rate": 1.4144532424576528e-05, + "loss": 1.6975, + "step": 2922 + }, + { + "epoch": 0.88, + "grad_norm": 10.500758171081543, + "learning_rate": 1.4142527813972138e-05, + "loss": 1.8214, + "step": 2923 + }, + { + "epoch": 0.88, + "grad_norm": 19.17152214050293, + "learning_rate": 1.4140523203367746e-05, + "loss": 1.9509, + "step": 2924 + }, + { + "epoch": 0.88, + "grad_norm": 19.903955459594727, + "learning_rate": 1.4138518592763358e-05, + "loss": 2.9421, + "step": 2925 + }, + { + "epoch": 0.88, + "grad_norm": 17.388938903808594, + "learning_rate": 1.4136513982158966e-05, + "loss": 1.8245, + "step": 2926 + }, + { + "epoch": 0.88, + "grad_norm": 15.076804161071777, + "learning_rate": 1.4134509371554576e-05, + "loss": 1.5359, + "step": 2927 + }, + { + "epoch": 0.88, + "grad_norm": 11.603935241699219, + "learning_rate": 1.4132504760950188e-05, + "loss": 1.5563, + "step": 2928 + }, + { + "epoch": 0.88, + "grad_norm": 25.9620418548584, + "learning_rate": 1.4130500150345797e-05, + "loss": 3.2783, + "step": 2929 + }, + { + "epoch": 0.88, + "grad_norm": 16.795143127441406, + "learning_rate": 1.4128495539741407e-05, + "loss": 1.5015, + "step": 2930 + }, + { + "epoch": 0.88, + "grad_norm": 11.010591506958008, + "learning_rate": 1.4126490929137015e-05, + "loss": 1.8603, + "step": 2931 + }, + { + "epoch": 0.88, + "grad_norm": 12.306628227233887, + "learning_rate": 1.4124486318532627e-05, + "loss": 1.4446, + "step": 2932 + }, + { + "epoch": 0.88, + "grad_norm": 14.734245300292969, + "learning_rate": 1.4122481707928235e-05, + "loss": 2.5827, + "step": 2933 + }, + { + "epoch": 0.88, + "grad_norm": 23.01751708984375, + "learning_rate": 1.4120477097323845e-05, + "loss": 2.9274, + "step": 2934 + }, + { + "epoch": 0.88, + "grad_norm": 20.132675170898438, + "learning_rate": 1.4118472486719457e-05, + "loss": 2.0589, + "step": 2935 + }, + { + "epoch": 0.88, + "grad_norm": 17.275482177734375, + "learning_rate": 1.4116467876115065e-05, + "loss": 2.1773, + "step": 2936 + }, + { + "epoch": 0.88, + "grad_norm": 40.08796310424805, + "learning_rate": 1.4114463265510675e-05, + "loss": 2.3652, + "step": 2937 + }, + { + "epoch": 0.88, + "grad_norm": 22.47896957397461, + "learning_rate": 1.4112458654906286e-05, + "loss": 2.069, + "step": 2938 + }, + { + "epoch": 0.88, + "grad_norm": 21.368188858032227, + "learning_rate": 1.4110454044301896e-05, + "loss": 2.5254, + "step": 2939 + }, + { + "epoch": 0.88, + "grad_norm": 8.838847160339355, + "learning_rate": 1.4108449433697506e-05, + "loss": 1.4187, + "step": 2940 + }, + { + "epoch": 0.88, + "grad_norm": 19.025794982910156, + "learning_rate": 1.4106444823093116e-05, + "loss": 1.7403, + "step": 2941 + }, + { + "epoch": 0.88, + "grad_norm": 26.310054779052734, + "learning_rate": 1.4104440212488726e-05, + "loss": 2.0741, + "step": 2942 + }, + { + "epoch": 0.88, + "grad_norm": 14.412237167358398, + "learning_rate": 1.4102435601884334e-05, + "loss": 1.6891, + "step": 2943 + }, + { + "epoch": 0.89, + "grad_norm": 15.92914867401123, + "learning_rate": 1.4100430991279946e-05, + "loss": 2.542, + "step": 2944 + }, + { + "epoch": 0.89, + "grad_norm": 26.77935028076172, + "learning_rate": 1.4098426380675554e-05, + "loss": 3.2923, + "step": 2945 + }, + { + "epoch": 0.89, + "grad_norm": 61.19276428222656, + "learning_rate": 1.4096421770071165e-05, + "loss": 2.9397, + "step": 2946 + }, + { + "epoch": 0.89, + "grad_norm": 15.064374923706055, + "learning_rate": 1.4094417159466776e-05, + "loss": 1.757, + "step": 2947 + }, + { + "epoch": 0.89, + "grad_norm": 14.561102867126465, + "learning_rate": 1.4092412548862385e-05, + "loss": 1.7057, + "step": 2948 + }, + { + "epoch": 0.89, + "grad_norm": 17.088661193847656, + "learning_rate": 1.4090407938257995e-05, + "loss": 2.2723, + "step": 2949 + }, + { + "epoch": 0.89, + "grad_norm": 7.7882256507873535, + "learning_rate": 1.4088403327653603e-05, + "loss": 1.8397, + "step": 2950 + }, + { + "epoch": 0.89, + "grad_norm": 11.358787536621094, + "learning_rate": 1.4086398717049215e-05, + "loss": 2.2978, + "step": 2951 + }, + { + "epoch": 0.89, + "grad_norm": 28.74677276611328, + "learning_rate": 1.4084394106444825e-05, + "loss": 2.7206, + "step": 2952 + }, + { + "epoch": 0.89, + "grad_norm": 16.747051239013672, + "learning_rate": 1.4082389495840433e-05, + "loss": 1.9734, + "step": 2953 + }, + { + "epoch": 0.89, + "grad_norm": 12.262884140014648, + "learning_rate": 1.4080384885236045e-05, + "loss": 2.0553, + "step": 2954 + }, + { + "epoch": 0.89, + "grad_norm": 20.407411575317383, + "learning_rate": 1.4078380274631654e-05, + "loss": 1.3774, + "step": 2955 + }, + { + "epoch": 0.89, + "grad_norm": 25.87983512878418, + "learning_rate": 1.4076375664027264e-05, + "loss": 2.7685, + "step": 2956 + }, + { + "epoch": 0.89, + "grad_norm": 14.667790412902832, + "learning_rate": 1.4074371053422872e-05, + "loss": 2.1229, + "step": 2957 + }, + { + "epoch": 0.89, + "grad_norm": 10.724139213562012, + "learning_rate": 1.4072366442818484e-05, + "loss": 1.6101, + "step": 2958 + }, + { + "epoch": 0.89, + "grad_norm": 25.648866653442383, + "learning_rate": 1.4070361832214094e-05, + "loss": 2.4417, + "step": 2959 + }, + { + "epoch": 0.89, + "grad_norm": 24.464935302734375, + "learning_rate": 1.4068357221609702e-05, + "loss": 1.071, + "step": 2960 + }, + { + "epoch": 0.89, + "grad_norm": 22.294677734375, + "learning_rate": 1.4066352611005314e-05, + "loss": 2.0512, + "step": 2961 + }, + { + "epoch": 0.89, + "grad_norm": 20.227296829223633, + "learning_rate": 1.4064348000400923e-05, + "loss": 1.4506, + "step": 2962 + }, + { + "epoch": 0.89, + "grad_norm": 35.15935134887695, + "learning_rate": 1.4062343389796533e-05, + "loss": 2.224, + "step": 2963 + }, + { + "epoch": 0.89, + "grad_norm": 15.452341079711914, + "learning_rate": 1.4060338779192144e-05, + "loss": 3.0408, + "step": 2964 + }, + { + "epoch": 0.89, + "grad_norm": 13.116132736206055, + "learning_rate": 1.4058334168587753e-05, + "loss": 2.6705, + "step": 2965 + }, + { + "epoch": 0.89, + "grad_norm": 22.434358596801758, + "learning_rate": 1.4056329557983363e-05, + "loss": 2.6559, + "step": 2966 + }, + { + "epoch": 0.89, + "grad_norm": 27.88434600830078, + "learning_rate": 1.4054324947378973e-05, + "loss": 2.2654, + "step": 2967 + }, + { + "epoch": 0.89, + "grad_norm": 29.59733772277832, + "learning_rate": 1.4052320336774583e-05, + "loss": 1.7697, + "step": 2968 + }, + { + "epoch": 0.89, + "grad_norm": 27.069503784179688, + "learning_rate": 1.4050315726170191e-05, + "loss": 2.1905, + "step": 2969 + }, + { + "epoch": 0.89, + "grad_norm": 29.36873435974121, + "learning_rate": 1.4048311115565803e-05, + "loss": 2.3816, + "step": 2970 + }, + { + "epoch": 0.89, + "grad_norm": 10.980319023132324, + "learning_rate": 1.4046306504961413e-05, + "loss": 1.6791, + "step": 2971 + }, + { + "epoch": 0.89, + "grad_norm": 13.023073196411133, + "learning_rate": 1.4044301894357022e-05, + "loss": 1.9126, + "step": 2972 + }, + { + "epoch": 0.89, + "grad_norm": 38.787940979003906, + "learning_rate": 1.4042297283752633e-05, + "loss": 2.6036, + "step": 2973 + }, + { + "epoch": 0.89, + "grad_norm": 29.560956954956055, + "learning_rate": 1.4040292673148242e-05, + "loss": 2.8441, + "step": 2974 + }, + { + "epoch": 0.89, + "grad_norm": 8.834044456481934, + "learning_rate": 1.4038288062543852e-05, + "loss": 1.4787, + "step": 2975 + }, + { + "epoch": 0.89, + "grad_norm": 11.226600646972656, + "learning_rate": 1.403628345193946e-05, + "loss": 1.5387, + "step": 2976 + }, + { + "epoch": 0.9, + "grad_norm": 16.720735549926758, + "learning_rate": 1.4034278841335072e-05, + "loss": 2.2926, + "step": 2977 + }, + { + "epoch": 0.9, + "grad_norm": 20.5881290435791, + "learning_rate": 1.4032274230730682e-05, + "loss": 2.5574, + "step": 2978 + }, + { + "epoch": 0.9, + "grad_norm": 31.438135147094727, + "learning_rate": 1.403026962012629e-05, + "loss": 1.8076, + "step": 2979 + }, + { + "epoch": 0.9, + "grad_norm": 12.068266868591309, + "learning_rate": 1.4028265009521902e-05, + "loss": 1.2816, + "step": 2980 + }, + { + "epoch": 0.9, + "grad_norm": 25.184410095214844, + "learning_rate": 1.402626039891751e-05, + "loss": 2.4355, + "step": 2981 + }, + { + "epoch": 0.9, + "grad_norm": 11.62043285369873, + "learning_rate": 1.402425578831312e-05, + "loss": 1.7278, + "step": 2982 + }, + { + "epoch": 0.9, + "grad_norm": 10.73161506652832, + "learning_rate": 1.4022251177708733e-05, + "loss": 1.5009, + "step": 2983 + }, + { + "epoch": 0.9, + "grad_norm": 18.191015243530273, + "learning_rate": 1.4020246567104341e-05, + "loss": 1.7434, + "step": 2984 + }, + { + "epoch": 0.9, + "grad_norm": 35.12432861328125, + "learning_rate": 1.4018241956499951e-05, + "loss": 1.6909, + "step": 2985 + }, + { + "epoch": 0.9, + "grad_norm": 11.526291847229004, + "learning_rate": 1.401623734589556e-05, + "loss": 1.4617, + "step": 2986 + }, + { + "epoch": 0.9, + "grad_norm": 61.24599075317383, + "learning_rate": 1.4014232735291171e-05, + "loss": 1.9207, + "step": 2987 + }, + { + "epoch": 0.9, + "grad_norm": 21.56028175354004, + "learning_rate": 1.401222812468678e-05, + "loss": 2.4295, + "step": 2988 + }, + { + "epoch": 0.9, + "grad_norm": 15.971736907958984, + "learning_rate": 1.4010223514082391e-05, + "loss": 1.9247, + "step": 2989 + }, + { + "epoch": 0.9, + "grad_norm": 18.692066192626953, + "learning_rate": 1.4008218903478001e-05, + "loss": 2.1351, + "step": 2990 + }, + { + "epoch": 0.9, + "grad_norm": 19.446582794189453, + "learning_rate": 1.400621429287361e-05, + "loss": 2.2271, + "step": 2991 + }, + { + "epoch": 0.9, + "grad_norm": 14.894234657287598, + "learning_rate": 1.4004209682269222e-05, + "loss": 2.0351, + "step": 2992 + }, + { + "epoch": 0.9, + "grad_norm": 10.384711265563965, + "learning_rate": 1.400220507166483e-05, + "loss": 1.3777, + "step": 2993 + }, + { + "epoch": 0.9, + "grad_norm": 15.35462760925293, + "learning_rate": 1.400020046106044e-05, + "loss": 1.4528, + "step": 2994 + }, + { + "epoch": 0.9, + "grad_norm": 9.53518009185791, + "learning_rate": 1.3998195850456052e-05, + "loss": 2.0396, + "step": 2995 + }, + { + "epoch": 0.9, + "grad_norm": 12.702194213867188, + "learning_rate": 1.399619123985166e-05, + "loss": 3.5201, + "step": 2996 + }, + { + "epoch": 0.9, + "grad_norm": 16.689231872558594, + "learning_rate": 1.399418662924727e-05, + "loss": 2.5204, + "step": 2997 + }, + { + "epoch": 0.9, + "grad_norm": 16.812484741210938, + "learning_rate": 1.3992182018642879e-05, + "loss": 1.823, + "step": 2998 + }, + { + "epoch": 0.9, + "grad_norm": 27.859556198120117, + "learning_rate": 1.399017740803849e-05, + "loss": 2.3145, + "step": 2999 + }, + { + "epoch": 0.9, + "grad_norm": 40.26885223388672, + "learning_rate": 1.3988172797434099e-05, + "loss": 1.981, + "step": 3000 + }, + { + "epoch": 0.9, + "eval_loss": 0.290047824382782, + "eval_runtime": 43.5045, + "eval_samples_per_second": 33.996, + "eval_steps_per_second": 33.996, + "step": 3000 + }, + { + "epoch": 0.9, + "grad_norm": 16.576269149780273, + "learning_rate": 1.3986168186829709e-05, + "loss": 1.7576, + "step": 3001 + }, + { + "epoch": 0.9, + "grad_norm": 11.136350631713867, + "learning_rate": 1.398416357622532e-05, + "loss": 1.1301, + "step": 3002 + }, + { + "epoch": 0.9, + "grad_norm": 7.8271684646606445, + "learning_rate": 1.398215896562093e-05, + "loss": 1.6797, + "step": 3003 + }, + { + "epoch": 0.9, + "grad_norm": 18.726409912109375, + "learning_rate": 1.398015435501654e-05, + "loss": 2.3383, + "step": 3004 + }, + { + "epoch": 0.9, + "grad_norm": 13.888818740844727, + "learning_rate": 1.3978149744412148e-05, + "loss": 1.7, + "step": 3005 + }, + { + "epoch": 0.9, + "grad_norm": 17.36934471130371, + "learning_rate": 1.397614513380776e-05, + "loss": 1.9836, + "step": 3006 + }, + { + "epoch": 0.9, + "grad_norm": 34.03765869140625, + "learning_rate": 1.3974140523203368e-05, + "loss": 2.0497, + "step": 3007 + }, + { + "epoch": 0.9, + "grad_norm": 24.520832061767578, + "learning_rate": 1.3972135912598978e-05, + "loss": 3.1885, + "step": 3008 + }, + { + "epoch": 0.9, + "grad_norm": 13.99000072479248, + "learning_rate": 1.397013130199459e-05, + "loss": 2.3549, + "step": 3009 + }, + { + "epoch": 0.9, + "grad_norm": 14.597472190856934, + "learning_rate": 1.3968126691390198e-05, + "loss": 2.2443, + "step": 3010 + }, + { + "epoch": 0.91, + "grad_norm": 10.168909072875977, + "learning_rate": 1.3966122080785808e-05, + "loss": 2.1224, + "step": 3011 + }, + { + "epoch": 0.91, + "grad_norm": 21.07402229309082, + "learning_rate": 1.3964117470181418e-05, + "loss": 2.2544, + "step": 3012 + }, + { + "epoch": 0.91, + "grad_norm": 35.29275894165039, + "learning_rate": 1.3962112859577028e-05, + "loss": 4.147, + "step": 3013 + }, + { + "epoch": 0.91, + "grad_norm": 19.924930572509766, + "learning_rate": 1.3960108248972638e-05, + "loss": 2.3426, + "step": 3014 + }, + { + "epoch": 0.91, + "grad_norm": 18.175373077392578, + "learning_rate": 1.3958103638368249e-05, + "loss": 1.6238, + "step": 3015 + }, + { + "epoch": 0.91, + "grad_norm": 16.749662399291992, + "learning_rate": 1.3956099027763859e-05, + "loss": 1.9444, + "step": 3016 + }, + { + "epoch": 0.91, + "grad_norm": 32.66911697387695, + "learning_rate": 1.3954094417159467e-05, + "loss": 2.5848, + "step": 3017 + }, + { + "epoch": 0.91, + "grad_norm": 10.309561729431152, + "learning_rate": 1.3952089806555079e-05, + "loss": 1.2661, + "step": 3018 + }, + { + "epoch": 0.91, + "grad_norm": 32.622840881347656, + "learning_rate": 1.3950085195950687e-05, + "loss": 3.1314, + "step": 3019 + }, + { + "epoch": 0.91, + "grad_norm": 51.04716491699219, + "learning_rate": 1.3948080585346297e-05, + "loss": 2.5912, + "step": 3020 + }, + { + "epoch": 0.91, + "grad_norm": 9.933348655700684, + "learning_rate": 1.3946075974741909e-05, + "loss": 1.6906, + "step": 3021 + }, + { + "epoch": 0.91, + "grad_norm": 11.241128921508789, + "learning_rate": 1.3944071364137517e-05, + "loss": 2.0141, + "step": 3022 + }, + { + "epoch": 0.91, + "grad_norm": 20.985105514526367, + "learning_rate": 1.3942066753533127e-05, + "loss": 2.3962, + "step": 3023 + }, + { + "epoch": 0.91, + "grad_norm": 14.399240493774414, + "learning_rate": 1.3940062142928736e-05, + "loss": 2.4056, + "step": 3024 + }, + { + "epoch": 0.91, + "grad_norm": 23.836753845214844, + "learning_rate": 1.3938057532324348e-05, + "loss": 1.492, + "step": 3025 + }, + { + "epoch": 0.91, + "grad_norm": 18.37982940673828, + "learning_rate": 1.3936052921719958e-05, + "loss": 1.8183, + "step": 3026 + }, + { + "epoch": 0.91, + "grad_norm": 13.537169456481934, + "learning_rate": 1.3934048311115566e-05, + "loss": 2.1759, + "step": 3027 + }, + { + "epoch": 0.91, + "grad_norm": 23.467910766601562, + "learning_rate": 1.3932043700511178e-05, + "loss": 2.6208, + "step": 3028 + }, + { + "epoch": 0.91, + "grad_norm": 17.59671401977539, + "learning_rate": 1.3930039089906786e-05, + "loss": 2.3887, + "step": 3029 + }, + { + "epoch": 0.91, + "grad_norm": 12.384592056274414, + "learning_rate": 1.3928034479302396e-05, + "loss": 2.0166, + "step": 3030 + }, + { + "epoch": 0.91, + "grad_norm": 17.471967697143555, + "learning_rate": 1.3926029868698005e-05, + "loss": 1.875, + "step": 3031 + }, + { + "epoch": 0.91, + "grad_norm": 12.046953201293945, + "learning_rate": 1.3924025258093617e-05, + "loss": 1.3814, + "step": 3032 + }, + { + "epoch": 0.91, + "grad_norm": 15.179215431213379, + "learning_rate": 1.3922020647489227e-05, + "loss": 1.6944, + "step": 3033 + }, + { + "epoch": 0.91, + "grad_norm": 10.748503684997559, + "learning_rate": 1.3920016036884835e-05, + "loss": 2.1539, + "step": 3034 + }, + { + "epoch": 0.91, + "grad_norm": 20.441787719726562, + "learning_rate": 1.3918011426280447e-05, + "loss": 2.9749, + "step": 3035 + }, + { + "epoch": 0.91, + "grad_norm": 23.096572875976562, + "learning_rate": 1.3916006815676055e-05, + "loss": 1.9606, + "step": 3036 + }, + { + "epoch": 0.91, + "grad_norm": 27.173370361328125, + "learning_rate": 1.3914002205071665e-05, + "loss": 2.5916, + "step": 3037 + }, + { + "epoch": 0.91, + "grad_norm": 11.456189155578613, + "learning_rate": 1.3911997594467277e-05, + "loss": 1.7153, + "step": 3038 + }, + { + "epoch": 0.91, + "grad_norm": 36.13789367675781, + "learning_rate": 1.3909992983862885e-05, + "loss": 2.2955, + "step": 3039 + }, + { + "epoch": 0.91, + "grad_norm": 25.46799659729004, + "learning_rate": 1.3907988373258496e-05, + "loss": 2.2377, + "step": 3040 + }, + { + "epoch": 0.91, + "grad_norm": 18.01449203491211, + "learning_rate": 1.3905983762654106e-05, + "loss": 2.5831, + "step": 3041 + }, + { + "epoch": 0.91, + "grad_norm": 25.396291732788086, + "learning_rate": 1.3903979152049716e-05, + "loss": 2.2237, + "step": 3042 + }, + { + "epoch": 0.91, + "grad_norm": 15.773884773254395, + "learning_rate": 1.3901974541445324e-05, + "loss": 2.6421, + "step": 3043 + }, + { + "epoch": 0.92, + "grad_norm": 16.961776733398438, + "learning_rate": 1.3899969930840936e-05, + "loss": 1.8037, + "step": 3044 + }, + { + "epoch": 0.92, + "grad_norm": 46.810585021972656, + "learning_rate": 1.3897965320236546e-05, + "loss": 1.4078, + "step": 3045 + }, + { + "epoch": 0.92, + "grad_norm": 19.385047912597656, + "learning_rate": 1.3895960709632154e-05, + "loss": 1.9391, + "step": 3046 + }, + { + "epoch": 0.92, + "grad_norm": 14.272333145141602, + "learning_rate": 1.3893956099027766e-05, + "loss": 1.7094, + "step": 3047 + }, + { + "epoch": 0.92, + "grad_norm": 12.611112594604492, + "learning_rate": 1.3891951488423375e-05, + "loss": 2.0959, + "step": 3048 + }, + { + "epoch": 0.92, + "grad_norm": 17.307199478149414, + "learning_rate": 1.3889946877818985e-05, + "loss": 2.1561, + "step": 3049 + }, + { + "epoch": 0.92, + "grad_norm": 20.245845794677734, + "learning_rate": 1.3887942267214593e-05, + "loss": 2.8128, + "step": 3050 + }, + { + "epoch": 0.92, + "grad_norm": 17.531375885009766, + "learning_rate": 1.3885937656610205e-05, + "loss": 1.6429, + "step": 3051 + }, + { + "epoch": 0.92, + "grad_norm": 12.451351165771484, + "learning_rate": 1.3883933046005815e-05, + "loss": 2.0644, + "step": 3052 + }, + { + "epoch": 0.92, + "grad_norm": 16.824329376220703, + "learning_rate": 1.3881928435401423e-05, + "loss": 2.1751, + "step": 3053 + }, + { + "epoch": 0.92, + "grad_norm": 19.493457794189453, + "learning_rate": 1.3879923824797035e-05, + "loss": 1.8751, + "step": 3054 + }, + { + "epoch": 0.92, + "grad_norm": 16.869348526000977, + "learning_rate": 1.3877919214192643e-05, + "loss": 1.9817, + "step": 3055 + }, + { + "epoch": 0.92, + "grad_norm": 29.737041473388672, + "learning_rate": 1.3875914603588254e-05, + "loss": 2.4454, + "step": 3056 + }, + { + "epoch": 0.92, + "grad_norm": 22.09736442565918, + "learning_rate": 1.3873909992983865e-05, + "loss": 1.7898, + "step": 3057 + }, + { + "epoch": 0.92, + "grad_norm": 17.416990280151367, + "learning_rate": 1.3871905382379474e-05, + "loss": 2.9486, + "step": 3058 + }, + { + "epoch": 0.92, + "grad_norm": 13.20331859588623, + "learning_rate": 1.3869900771775084e-05, + "loss": 2.2757, + "step": 3059 + }, + { + "epoch": 0.92, + "grad_norm": 7.922008514404297, + "learning_rate": 1.3867896161170694e-05, + "loss": 2.0063, + "step": 3060 + }, + { + "epoch": 0.92, + "grad_norm": 14.735453605651855, + "learning_rate": 1.3865891550566304e-05, + "loss": 2.4153, + "step": 3061 + }, + { + "epoch": 0.92, + "grad_norm": 13.794514656066895, + "learning_rate": 1.3863886939961912e-05, + "loss": 1.8875, + "step": 3062 + }, + { + "epoch": 0.92, + "grad_norm": 11.291130065917969, + "learning_rate": 1.3861882329357524e-05, + "loss": 1.3319, + "step": 3063 + }, + { + "epoch": 0.92, + "grad_norm": 19.629484176635742, + "learning_rate": 1.3859877718753134e-05, + "loss": 1.4707, + "step": 3064 + }, + { + "epoch": 0.92, + "grad_norm": 15.42160415649414, + "learning_rate": 1.3857873108148743e-05, + "loss": 1.5667, + "step": 3065 + }, + { + "epoch": 0.92, + "grad_norm": 14.26404094696045, + "learning_rate": 1.3855868497544354e-05, + "loss": 2.0599, + "step": 3066 + }, + { + "epoch": 0.92, + "grad_norm": 8.3535795211792, + "learning_rate": 1.3853863886939963e-05, + "loss": 1.3769, + "step": 3067 + }, + { + "epoch": 0.92, + "grad_norm": 13.800082206726074, + "learning_rate": 1.3851859276335573e-05, + "loss": 1.5491, + "step": 3068 + }, + { + "epoch": 0.92, + "grad_norm": 16.2960262298584, + "learning_rate": 1.3849854665731185e-05, + "loss": 1.8461, + "step": 3069 + }, + { + "epoch": 0.92, + "grad_norm": 12.411531448364258, + "learning_rate": 1.3847850055126793e-05, + "loss": 2.1916, + "step": 3070 + }, + { + "epoch": 0.92, + "grad_norm": 10.797046661376953, + "learning_rate": 1.3845845444522403e-05, + "loss": 1.2897, + "step": 3071 + }, + { + "epoch": 0.92, + "grad_norm": 17.1668643951416, + "learning_rate": 1.3843840833918011e-05, + "loss": 2.0636, + "step": 3072 + }, + { + "epoch": 0.92, + "grad_norm": 12.581513404846191, + "learning_rate": 1.3841836223313623e-05, + "loss": 2.0915, + "step": 3073 + }, + { + "epoch": 0.92, + "grad_norm": 18.048904418945312, + "learning_rate": 1.3839831612709232e-05, + "loss": 1.8037, + "step": 3074 + }, + { + "epoch": 0.92, + "grad_norm": 25.745256423950195, + "learning_rate": 1.3837827002104842e-05, + "loss": 2.4617, + "step": 3075 + }, + { + "epoch": 0.92, + "grad_norm": 23.290611267089844, + "learning_rate": 1.3835822391500453e-05, + "loss": 2.3306, + "step": 3076 + }, + { + "epoch": 0.93, + "grad_norm": 18.817665100097656, + "learning_rate": 1.3833817780896062e-05, + "loss": 2.0089, + "step": 3077 + }, + { + "epoch": 0.93, + "grad_norm": 16.671045303344727, + "learning_rate": 1.3831813170291672e-05, + "loss": 2.0306, + "step": 3078 + }, + { + "epoch": 0.93, + "grad_norm": 17.590496063232422, + "learning_rate": 1.382980855968728e-05, + "loss": 2.5062, + "step": 3079 + }, + { + "epoch": 0.93, + "grad_norm": 15.332666397094727, + "learning_rate": 1.3827803949082892e-05, + "loss": 1.5195, + "step": 3080 + }, + { + "epoch": 0.93, + "grad_norm": 28.62594985961914, + "learning_rate": 1.3825799338478502e-05, + "loss": 2.3762, + "step": 3081 + }, + { + "epoch": 0.93, + "grad_norm": 15.981874465942383, + "learning_rate": 1.382379472787411e-05, + "loss": 2.0642, + "step": 3082 + }, + { + "epoch": 0.93, + "grad_norm": 16.672204971313477, + "learning_rate": 1.3821790117269722e-05, + "loss": 2.0906, + "step": 3083 + }, + { + "epoch": 0.93, + "grad_norm": 15.016446113586426, + "learning_rate": 1.381978550666533e-05, + "loss": 1.7059, + "step": 3084 + }, + { + "epoch": 0.93, + "grad_norm": 9.689031600952148, + "learning_rate": 1.3817780896060941e-05, + "loss": 1.625, + "step": 3085 + }, + { + "epoch": 0.93, + "grad_norm": 8.496919631958008, + "learning_rate": 1.3815776285456551e-05, + "loss": 0.9668, + "step": 3086 + }, + { + "epoch": 0.93, + "grad_norm": 20.545669555664062, + "learning_rate": 1.3813771674852161e-05, + "loss": 1.827, + "step": 3087 + }, + { + "epoch": 0.93, + "grad_norm": 14.81933307647705, + "learning_rate": 1.3811767064247771e-05, + "loss": 2.3411, + "step": 3088 + }, + { + "epoch": 0.93, + "grad_norm": 11.812262535095215, + "learning_rate": 1.3809762453643381e-05, + "loss": 3.2352, + "step": 3089 + }, + { + "epoch": 0.93, + "grad_norm": 50.63083267211914, + "learning_rate": 1.3807757843038991e-05, + "loss": 1.491, + "step": 3090 + }, + { + "epoch": 0.93, + "grad_norm": 17.194198608398438, + "learning_rate": 1.38057532324346e-05, + "loss": 1.45, + "step": 3091 + }, + { + "epoch": 0.93, + "grad_norm": 25.95490074157715, + "learning_rate": 1.3803748621830211e-05, + "loss": 3.7476, + "step": 3092 + }, + { + "epoch": 0.93, + "grad_norm": 12.458473205566406, + "learning_rate": 1.380174401122582e-05, + "loss": 2.0199, + "step": 3093 + }, + { + "epoch": 0.93, + "grad_norm": 17.711647033691406, + "learning_rate": 1.379973940062143e-05, + "loss": 2.1909, + "step": 3094 + }, + { + "epoch": 0.93, + "grad_norm": 21.15656852722168, + "learning_rate": 1.3797734790017042e-05, + "loss": 2.6897, + "step": 3095 + }, + { + "epoch": 0.93, + "grad_norm": 14.197000503540039, + "learning_rate": 1.379573017941265e-05, + "loss": 2.5904, + "step": 3096 + }, + { + "epoch": 0.93, + "grad_norm": 24.11342430114746, + "learning_rate": 1.379372556880826e-05, + "loss": 2.4864, + "step": 3097 + }, + { + "epoch": 0.93, + "grad_norm": 18.00290298461914, + "learning_rate": 1.3791720958203869e-05, + "loss": 2.6676, + "step": 3098 + }, + { + "epoch": 0.93, + "grad_norm": 10.474432945251465, + "learning_rate": 1.378971634759948e-05, + "loss": 1.9275, + "step": 3099 + }, + { + "epoch": 0.93, + "grad_norm": 15.15256118774414, + "learning_rate": 1.378771173699509e-05, + "loss": 2.0671, + "step": 3100 + }, + { + "epoch": 0.93, + "grad_norm": 16.475624084472656, + "learning_rate": 1.3785707126390699e-05, + "loss": 2.206, + "step": 3101 + }, + { + "epoch": 0.93, + "grad_norm": 32.260719299316406, + "learning_rate": 1.378370251578631e-05, + "loss": 1.7881, + "step": 3102 + }, + { + "epoch": 0.93, + "grad_norm": 15.899197578430176, + "learning_rate": 1.3781697905181919e-05, + "loss": 1.865, + "step": 3103 + }, + { + "epoch": 0.93, + "grad_norm": 16.820289611816406, + "learning_rate": 1.3779693294577529e-05, + "loss": 1.9125, + "step": 3104 + }, + { + "epoch": 0.93, + "grad_norm": 24.38895606994629, + "learning_rate": 1.3777688683973137e-05, + "loss": 1.5985, + "step": 3105 + }, + { + "epoch": 0.93, + "grad_norm": 34.085914611816406, + "learning_rate": 1.377568407336875e-05, + "loss": 2.2492, + "step": 3106 + }, + { + "epoch": 0.93, + "grad_norm": 14.358930587768555, + "learning_rate": 1.377367946276436e-05, + "loss": 1.9382, + "step": 3107 + }, + { + "epoch": 0.93, + "grad_norm": 25.895401000976562, + "learning_rate": 1.377167485215997e-05, + "loss": 1.1291, + "step": 3108 + }, + { + "epoch": 0.93, + "grad_norm": 17.190685272216797, + "learning_rate": 1.376967024155558e-05, + "loss": 1.4162, + "step": 3109 + }, + { + "epoch": 0.94, + "grad_norm": 13.299299240112305, + "learning_rate": 1.3767665630951188e-05, + "loss": 1.6711, + "step": 3110 + }, + { + "epoch": 0.94, + "grad_norm": 14.426549911499023, + "learning_rate": 1.37656610203468e-05, + "loss": 1.9699, + "step": 3111 + }, + { + "epoch": 0.94, + "grad_norm": 25.35927963256836, + "learning_rate": 1.376365640974241e-05, + "loss": 2.6309, + "step": 3112 + }, + { + "epoch": 0.94, + "grad_norm": 9.705031394958496, + "learning_rate": 1.3761651799138018e-05, + "loss": 2.4938, + "step": 3113 + }, + { + "epoch": 0.94, + "grad_norm": 12.233379364013672, + "learning_rate": 1.375964718853363e-05, + "loss": 1.8221, + "step": 3114 + }, + { + "epoch": 0.94, + "grad_norm": 15.394095420837402, + "learning_rate": 1.3757642577929238e-05, + "loss": 1.7775, + "step": 3115 + }, + { + "epoch": 0.94, + "grad_norm": 13.07430362701416, + "learning_rate": 1.3755637967324848e-05, + "loss": 1.0702, + "step": 3116 + }, + { + "epoch": 0.94, + "grad_norm": 12.21866226196289, + "learning_rate": 1.3753633356720457e-05, + "loss": 1.6868, + "step": 3117 + }, + { + "epoch": 0.94, + "grad_norm": 10.357650756835938, + "learning_rate": 1.3751628746116069e-05, + "loss": 2.2551, + "step": 3118 + }, + { + "epoch": 0.94, + "grad_norm": 18.206022262573242, + "learning_rate": 1.3749624135511679e-05, + "loss": 1.8297, + "step": 3119 + }, + { + "epoch": 0.94, + "grad_norm": 15.580822944641113, + "learning_rate": 1.3747619524907287e-05, + "loss": 1.9925, + "step": 3120 + }, + { + "epoch": 0.94, + "eval_loss": 0.2486942857503891, + "eval_runtime": 43.5686, + "eval_samples_per_second": 33.946, + "eval_steps_per_second": 33.946, + "step": 3120 + }, + { + "epoch": 0.94, + "grad_norm": 10.770172119140625, + "learning_rate": 1.3745614914302899e-05, + "loss": 2.3822, + "step": 3121 + }, + { + "epoch": 0.94, + "grad_norm": 25.76287078857422, + "learning_rate": 1.3743610303698507e-05, + "loss": 2.8554, + "step": 3122 + }, + { + "epoch": 0.94, + "grad_norm": 11.431665420532227, + "learning_rate": 1.3741605693094117e-05, + "loss": 2.0408, + "step": 3123 + }, + { + "epoch": 0.94, + "grad_norm": 12.141244888305664, + "learning_rate": 1.3739601082489729e-05, + "loss": 1.5182, + "step": 3124 + }, + { + "epoch": 0.94, + "grad_norm": 12.981203079223633, + "learning_rate": 1.3737596471885337e-05, + "loss": 1.3628, + "step": 3125 + }, + { + "epoch": 0.94, + "grad_norm": 12.477705955505371, + "learning_rate": 1.3735591861280948e-05, + "loss": 1.9388, + "step": 3126 + }, + { + "epoch": 0.94, + "grad_norm": 21.606971740722656, + "learning_rate": 1.3733587250676556e-05, + "loss": 1.7253, + "step": 3127 + }, + { + "epoch": 0.94, + "grad_norm": 15.517189025878906, + "learning_rate": 1.3731582640072168e-05, + "loss": 2.0146, + "step": 3128 + }, + { + "epoch": 0.94, + "grad_norm": 30.338869094848633, + "learning_rate": 1.3729578029467776e-05, + "loss": 1.8816, + "step": 3129 + }, + { + "epoch": 0.94, + "grad_norm": 34.77288818359375, + "learning_rate": 1.3727573418863386e-05, + "loss": 3.4102, + "step": 3130 + }, + { + "epoch": 0.94, + "grad_norm": 29.494319915771484, + "learning_rate": 1.3725568808258998e-05, + "loss": 1.3876, + "step": 3131 + }, + { + "epoch": 0.94, + "grad_norm": 15.238880157470703, + "learning_rate": 1.3723564197654606e-05, + "loss": 2.0381, + "step": 3132 + }, + { + "epoch": 0.94, + "grad_norm": 13.510786056518555, + "learning_rate": 1.3721559587050216e-05, + "loss": 2.5507, + "step": 3133 + }, + { + "epoch": 0.94, + "grad_norm": 24.444250106811523, + "learning_rate": 1.3719554976445827e-05, + "loss": 1.7889, + "step": 3134 + }, + { + "epoch": 0.94, + "grad_norm": 12.126347541809082, + "learning_rate": 1.3717550365841437e-05, + "loss": 1.2326, + "step": 3135 + }, + { + "epoch": 0.94, + "grad_norm": 10.488152503967285, + "learning_rate": 1.3715545755237045e-05, + "loss": 1.2664, + "step": 3136 + }, + { + "epoch": 0.94, + "grad_norm": 30.025480270385742, + "learning_rate": 1.3713541144632657e-05, + "loss": 1.5859, + "step": 3137 + }, + { + "epoch": 0.94, + "grad_norm": 13.164788246154785, + "learning_rate": 1.3711536534028267e-05, + "loss": 2.1662, + "step": 3138 + }, + { + "epoch": 0.94, + "grad_norm": 18.30267333984375, + "learning_rate": 1.3709531923423875e-05, + "loss": 2.1693, + "step": 3139 + }, + { + "epoch": 0.94, + "grad_norm": 16.636390686035156, + "learning_rate": 1.3707527312819487e-05, + "loss": 2.3369, + "step": 3140 + }, + { + "epoch": 0.94, + "grad_norm": 13.153757095336914, + "learning_rate": 1.3705522702215095e-05, + "loss": 2.3091, + "step": 3141 + }, + { + "epoch": 0.94, + "grad_norm": 30.390827178955078, + "learning_rate": 1.3703518091610706e-05, + "loss": 1.786, + "step": 3142 + }, + { + "epoch": 0.94, + "grad_norm": 41.70283889770508, + "learning_rate": 1.3701513481006317e-05, + "loss": 3.5995, + "step": 3143 + }, + { + "epoch": 0.95, + "grad_norm": 32.86235809326172, + "learning_rate": 1.3699508870401926e-05, + "loss": 2.4132, + "step": 3144 + }, + { + "epoch": 0.95, + "grad_norm": 14.464203834533691, + "learning_rate": 1.3697504259797536e-05, + "loss": 2.0061, + "step": 3145 + }, + { + "epoch": 0.95, + "grad_norm": 11.750015258789062, + "learning_rate": 1.3695499649193144e-05, + "loss": 2.1382, + "step": 3146 + }, + { + "epoch": 0.95, + "grad_norm": 13.602632522583008, + "learning_rate": 1.3693495038588756e-05, + "loss": 2.4135, + "step": 3147 + }, + { + "epoch": 0.95, + "grad_norm": 15.727476119995117, + "learning_rate": 1.3691490427984364e-05, + "loss": 2.9052, + "step": 3148 + }, + { + "epoch": 0.95, + "grad_norm": 17.43680763244629, + "learning_rate": 1.3689485817379974e-05, + "loss": 1.3355, + "step": 3149 + }, + { + "epoch": 0.95, + "grad_norm": 14.329429626464844, + "learning_rate": 1.3687481206775586e-05, + "loss": 3.1391, + "step": 3150 + }, + { + "epoch": 0.95, + "grad_norm": 25.729965209960938, + "learning_rate": 1.3685476596171195e-05, + "loss": 1.6923, + "step": 3151 + }, + { + "epoch": 0.95, + "grad_norm": 95.43783569335938, + "learning_rate": 1.3683471985566805e-05, + "loss": 2.0273, + "step": 3152 + }, + { + "epoch": 0.95, + "grad_norm": 17.75636100769043, + "learning_rate": 1.3681467374962413e-05, + "loss": 1.5341, + "step": 3153 + }, + { + "epoch": 0.95, + "grad_norm": 19.18214988708496, + "learning_rate": 1.3679462764358025e-05, + "loss": 1.8777, + "step": 3154 + }, + { + "epoch": 0.95, + "grad_norm": 15.892659187316895, + "learning_rate": 1.3677458153753635e-05, + "loss": 1.9435, + "step": 3155 + }, + { + "epoch": 0.95, + "grad_norm": 20.043859481811523, + "learning_rate": 1.3675453543149243e-05, + "loss": 2.1202, + "step": 3156 + }, + { + "epoch": 0.95, + "grad_norm": 13.109265327453613, + "learning_rate": 1.3673448932544855e-05, + "loss": 2.429, + "step": 3157 + }, + { + "epoch": 0.95, + "grad_norm": 15.493414878845215, + "learning_rate": 1.3671444321940463e-05, + "loss": 1.768, + "step": 3158 + }, + { + "epoch": 0.95, + "grad_norm": 36.695735931396484, + "learning_rate": 1.3669439711336074e-05, + "loss": 2.0781, + "step": 3159 + }, + { + "epoch": 0.95, + "grad_norm": 14.086923599243164, + "learning_rate": 1.3667435100731684e-05, + "loss": 0.8197, + "step": 3160 + }, + { + "epoch": 0.95, + "grad_norm": 13.209424018859863, + "learning_rate": 1.3665430490127294e-05, + "loss": 1.9733, + "step": 3161 + }, + { + "epoch": 0.95, + "grad_norm": 17.077037811279297, + "learning_rate": 1.3663425879522905e-05, + "loss": 2.5923, + "step": 3162 + }, + { + "epoch": 0.95, + "grad_norm": 49.46474075317383, + "learning_rate": 1.3661421268918514e-05, + "loss": 2.0692, + "step": 3163 + }, + { + "epoch": 0.95, + "grad_norm": 14.314860343933105, + "learning_rate": 1.3659416658314124e-05, + "loss": 2.8229, + "step": 3164 + }, + { + "epoch": 0.95, + "grad_norm": 8.489893913269043, + "learning_rate": 1.3657412047709732e-05, + "loss": 1.3168, + "step": 3165 + }, + { + "epoch": 0.95, + "grad_norm": 32.04702377319336, + "learning_rate": 1.3655407437105344e-05, + "loss": 1.6409, + "step": 3166 + }, + { + "epoch": 0.95, + "grad_norm": 16.088096618652344, + "learning_rate": 1.3653402826500954e-05, + "loss": 1.1645, + "step": 3167 + }, + { + "epoch": 0.95, + "grad_norm": 12.829047203063965, + "learning_rate": 1.3651398215896563e-05, + "loss": 1.3659, + "step": 3168 + }, + { + "epoch": 0.95, + "grad_norm": 19.811403274536133, + "learning_rate": 1.3649393605292174e-05, + "loss": 2.6096, + "step": 3169 + }, + { + "epoch": 0.95, + "grad_norm": 15.218836784362793, + "learning_rate": 1.3647388994687783e-05, + "loss": 1.4327, + "step": 3170 + }, + { + "epoch": 0.95, + "grad_norm": 11.305002212524414, + "learning_rate": 1.3645384384083393e-05, + "loss": 1.1141, + "step": 3171 + }, + { + "epoch": 0.95, + "grad_norm": 12.872888565063477, + "learning_rate": 1.3643379773479001e-05, + "loss": 1.6382, + "step": 3172 + }, + { + "epoch": 0.95, + "grad_norm": 18.41777801513672, + "learning_rate": 1.3641375162874613e-05, + "loss": 2.1711, + "step": 3173 + }, + { + "epoch": 0.95, + "grad_norm": 14.97694206237793, + "learning_rate": 1.3639370552270223e-05, + "loss": 2.1335, + "step": 3174 + }, + { + "epoch": 0.95, + "grad_norm": 17.031770706176758, + "learning_rate": 1.3637365941665832e-05, + "loss": 2.1181, + "step": 3175 + }, + { + "epoch": 0.95, + "grad_norm": 15.743291854858398, + "learning_rate": 1.3635361331061443e-05, + "loss": 2.4674, + "step": 3176 + }, + { + "epoch": 0.96, + "grad_norm": 9.414546966552734, + "learning_rate": 1.3633356720457052e-05, + "loss": 2.1942, + "step": 3177 + }, + { + "epoch": 0.96, + "grad_norm": 10.637990951538086, + "learning_rate": 1.3631352109852662e-05, + "loss": 1.5859, + "step": 3178 + }, + { + "epoch": 0.96, + "grad_norm": 13.900732040405273, + "learning_rate": 1.3629347499248272e-05, + "loss": 1.6751, + "step": 3179 + }, + { + "epoch": 0.96, + "grad_norm": 20.666290283203125, + "learning_rate": 1.3627342888643882e-05, + "loss": 2.6414, + "step": 3180 + }, + { + "epoch": 0.96, + "grad_norm": 35.015647888183594, + "learning_rate": 1.3625338278039492e-05, + "loss": 2.7838, + "step": 3181 + }, + { + "epoch": 0.96, + "grad_norm": 15.191834449768066, + "learning_rate": 1.3623333667435102e-05, + "loss": 1.831, + "step": 3182 + }, + { + "epoch": 0.96, + "grad_norm": 34.37846755981445, + "learning_rate": 1.3621329056830712e-05, + "loss": 2.7531, + "step": 3183 + }, + { + "epoch": 0.96, + "grad_norm": 12.335997581481934, + "learning_rate": 1.361932444622632e-05, + "loss": 1.0529, + "step": 3184 + }, + { + "epoch": 0.96, + "grad_norm": 25.120954513549805, + "learning_rate": 1.3617319835621932e-05, + "loss": 2.2635, + "step": 3185 + }, + { + "epoch": 0.96, + "grad_norm": 11.803996086120605, + "learning_rate": 1.3615315225017542e-05, + "loss": 1.8602, + "step": 3186 + }, + { + "epoch": 0.96, + "grad_norm": 14.749687194824219, + "learning_rate": 1.361331061441315e-05, + "loss": 1.0415, + "step": 3187 + }, + { + "epoch": 0.96, + "grad_norm": 15.90971851348877, + "learning_rate": 1.3611306003808763e-05, + "loss": 1.9429, + "step": 3188 + }, + { + "epoch": 0.96, + "grad_norm": 42.85434341430664, + "learning_rate": 1.3609301393204371e-05, + "loss": 2.2651, + "step": 3189 + }, + { + "epoch": 0.96, + "grad_norm": 22.296979904174805, + "learning_rate": 1.3607296782599981e-05, + "loss": 2.2514, + "step": 3190 + }, + { + "epoch": 0.96, + "grad_norm": 16.979684829711914, + "learning_rate": 1.360529217199559e-05, + "loss": 1.9144, + "step": 3191 + }, + { + "epoch": 0.96, + "grad_norm": 51.25554656982422, + "learning_rate": 1.3603287561391201e-05, + "loss": 1.4266, + "step": 3192 + }, + { + "epoch": 0.96, + "grad_norm": 32.474002838134766, + "learning_rate": 1.3601282950786811e-05, + "loss": 2.3617, + "step": 3193 + }, + { + "epoch": 0.96, + "grad_norm": 12.489779472351074, + "learning_rate": 1.359927834018242e-05, + "loss": 1.4284, + "step": 3194 + }, + { + "epoch": 0.96, + "grad_norm": 14.795775413513184, + "learning_rate": 1.3597273729578032e-05, + "loss": 0.9304, + "step": 3195 + }, + { + "epoch": 0.96, + "grad_norm": 35.45854568481445, + "learning_rate": 1.359526911897364e-05, + "loss": 3.7488, + "step": 3196 + }, + { + "epoch": 0.96, + "grad_norm": 6.524545669555664, + "learning_rate": 1.359326450836925e-05, + "loss": 1.08, + "step": 3197 + }, + { + "epoch": 0.96, + "grad_norm": 13.13879680633545, + "learning_rate": 1.3591259897764862e-05, + "loss": 2.1342, + "step": 3198 + }, + { + "epoch": 0.96, + "grad_norm": 8.252395629882812, + "learning_rate": 1.358925528716047e-05, + "loss": 1.6381, + "step": 3199 + }, + { + "epoch": 0.96, + "grad_norm": 12.134532928466797, + "learning_rate": 1.358725067655608e-05, + "loss": 1.7298, + "step": 3200 + }, + { + "epoch": 0.96, + "grad_norm": 17.264564514160156, + "learning_rate": 1.3585246065951689e-05, + "loss": 1.6213, + "step": 3201 + }, + { + "epoch": 0.96, + "grad_norm": 17.299379348754883, + "learning_rate": 1.35832414553473e-05, + "loss": 1.9645, + "step": 3202 + }, + { + "epoch": 0.96, + "grad_norm": 9.386181831359863, + "learning_rate": 1.3581236844742909e-05, + "loss": 1.0434, + "step": 3203 + }, + { + "epoch": 0.96, + "grad_norm": 10.785475730895996, + "learning_rate": 1.3579232234138519e-05, + "loss": 1.387, + "step": 3204 + }, + { + "epoch": 0.96, + "grad_norm": 18.141578674316406, + "learning_rate": 1.357722762353413e-05, + "loss": 1.816, + "step": 3205 + }, + { + "epoch": 0.96, + "grad_norm": 10.100055694580078, + "learning_rate": 1.3575223012929739e-05, + "loss": 2.0753, + "step": 3206 + }, + { + "epoch": 0.96, + "grad_norm": 13.844127655029297, + "learning_rate": 1.3573218402325349e-05, + "loss": 1.4975, + "step": 3207 + }, + { + "epoch": 0.96, + "grad_norm": 13.211896896362305, + "learning_rate": 1.357121379172096e-05, + "loss": 2.011, + "step": 3208 + }, + { + "epoch": 0.96, + "grad_norm": 16.434125900268555, + "learning_rate": 1.356920918111657e-05, + "loss": 1.3007, + "step": 3209 + }, + { + "epoch": 0.97, + "grad_norm": 25.20293426513672, + "learning_rate": 1.356720457051218e-05, + "loss": 1.4808, + "step": 3210 + }, + { + "epoch": 0.97, + "grad_norm": 17.422204971313477, + "learning_rate": 1.356519995990779e-05, + "loss": 2.0575, + "step": 3211 + }, + { + "epoch": 0.97, + "grad_norm": 22.5146484375, + "learning_rate": 1.35631953493034e-05, + "loss": 2.0943, + "step": 3212 + }, + { + "epoch": 0.97, + "grad_norm": 46.014678955078125, + "learning_rate": 1.3561190738699008e-05, + "loss": 2.7204, + "step": 3213 + }, + { + "epoch": 0.97, + "grad_norm": 16.03790283203125, + "learning_rate": 1.355918612809462e-05, + "loss": 2.5307, + "step": 3214 + }, + { + "epoch": 0.97, + "grad_norm": 17.003280639648438, + "learning_rate": 1.3557181517490228e-05, + "loss": 2.766, + "step": 3215 + }, + { + "epoch": 0.97, + "grad_norm": 21.383115768432617, + "learning_rate": 1.3555176906885838e-05, + "loss": 1.8888, + "step": 3216 + }, + { + "epoch": 0.97, + "grad_norm": 29.028945922851562, + "learning_rate": 1.355317229628145e-05, + "loss": 1.2596, + "step": 3217 + }, + { + "epoch": 0.97, + "grad_norm": 14.80164623260498, + "learning_rate": 1.3551167685677058e-05, + "loss": 2.1869, + "step": 3218 + }, + { + "epoch": 0.97, + "grad_norm": 35.633094787597656, + "learning_rate": 1.3549163075072668e-05, + "loss": 3.2196, + "step": 3219 + }, + { + "epoch": 0.97, + "grad_norm": 17.463098526000977, + "learning_rate": 1.3547158464468277e-05, + "loss": 2.077, + "step": 3220 + }, + { + "epoch": 0.97, + "grad_norm": 15.604183197021484, + "learning_rate": 1.3545153853863889e-05, + "loss": 1.6204, + "step": 3221 + }, + { + "epoch": 0.97, + "grad_norm": 28.1143798828125, + "learning_rate": 1.3543149243259497e-05, + "loss": 2.9363, + "step": 3222 + }, + { + "epoch": 0.97, + "grad_norm": 18.119604110717773, + "learning_rate": 1.3541144632655107e-05, + "loss": 1.3793, + "step": 3223 + }, + { + "epoch": 0.97, + "grad_norm": 16.877845764160156, + "learning_rate": 1.3539140022050719e-05, + "loss": 2.0204, + "step": 3224 + }, + { + "epoch": 0.97, + "grad_norm": 20.121807098388672, + "learning_rate": 1.3537135411446327e-05, + "loss": 2.4456, + "step": 3225 + }, + { + "epoch": 0.97, + "grad_norm": 15.562324523925781, + "learning_rate": 1.3535130800841937e-05, + "loss": 1.8663, + "step": 3226 + }, + { + "epoch": 0.97, + "grad_norm": 16.265233993530273, + "learning_rate": 1.3533126190237547e-05, + "loss": 1.7686, + "step": 3227 + }, + { + "epoch": 0.97, + "grad_norm": 24.303163528442383, + "learning_rate": 1.3531121579633158e-05, + "loss": 2.5792, + "step": 3228 + }, + { + "epoch": 0.97, + "grad_norm": 19.46893310546875, + "learning_rate": 1.3529116969028768e-05, + "loss": 1.8132, + "step": 3229 + }, + { + "epoch": 0.97, + "grad_norm": 39.980438232421875, + "learning_rate": 1.3527112358424378e-05, + "loss": 2.3918, + "step": 3230 + }, + { + "epoch": 0.97, + "grad_norm": 18.492206573486328, + "learning_rate": 1.3525107747819988e-05, + "loss": 1.6302, + "step": 3231 + }, + { + "epoch": 0.97, + "grad_norm": 11.828949928283691, + "learning_rate": 1.3523103137215596e-05, + "loss": 1.7631, + "step": 3232 + }, + { + "epoch": 0.97, + "grad_norm": 13.994562149047852, + "learning_rate": 1.3521098526611208e-05, + "loss": 1.9207, + "step": 3233 + }, + { + "epoch": 0.97, + "grad_norm": 22.90715789794922, + "learning_rate": 1.3519093916006816e-05, + "loss": 3.6763, + "step": 3234 + }, + { + "epoch": 0.97, + "grad_norm": 45.03823471069336, + "learning_rate": 1.3517089305402426e-05, + "loss": 2.1425, + "step": 3235 + }, + { + "epoch": 0.97, + "grad_norm": 12.193424224853516, + "learning_rate": 1.3515084694798038e-05, + "loss": 1.4887, + "step": 3236 + }, + { + "epoch": 0.97, + "grad_norm": 21.530868530273438, + "learning_rate": 1.3513080084193647e-05, + "loss": 2.9298, + "step": 3237 + }, + { + "epoch": 0.97, + "grad_norm": 30.308269500732422, + "learning_rate": 1.3511075473589257e-05, + "loss": 2.4928, + "step": 3238 + }, + { + "epoch": 0.97, + "grad_norm": 11.13912582397461, + "learning_rate": 1.3509070862984865e-05, + "loss": 1.7033, + "step": 3239 + }, + { + "epoch": 0.97, + "grad_norm": 11.577181816101074, + "learning_rate": 1.3507066252380477e-05, + "loss": 1.4173, + "step": 3240 + }, + { + "epoch": 0.97, + "eval_loss": 0.22535113990306854, + "eval_runtime": 43.4738, + "eval_samples_per_second": 34.021, + "eval_steps_per_second": 34.021, + "step": 3240 + }, + { + "epoch": 0.97, + "grad_norm": 27.40611457824707, + "learning_rate": 1.3505061641776087e-05, + "loss": 1.9135, + "step": 3241 + }, + { + "epoch": 0.97, + "grad_norm": 19.149837493896484, + "learning_rate": 1.3503057031171695e-05, + "loss": 1.7921, + "step": 3242 + }, + { + "epoch": 0.98, + "grad_norm": 15.974568367004395, + "learning_rate": 1.3501052420567307e-05, + "loss": 2.5266, + "step": 3243 + }, + { + "epoch": 0.98, + "grad_norm": 19.187986373901367, + "learning_rate": 1.3499047809962915e-05, + "loss": 1.9754, + "step": 3244 + }, + { + "epoch": 0.98, + "grad_norm": 10.218606948852539, + "learning_rate": 1.3497043199358526e-05, + "loss": 0.9435, + "step": 3245 + }, + { + "epoch": 0.98, + "grad_norm": 16.653217315673828, + "learning_rate": 1.3495038588754134e-05, + "loss": 1.8804, + "step": 3246 + }, + { + "epoch": 0.98, + "grad_norm": 19.304424285888672, + "learning_rate": 1.3493033978149746e-05, + "loss": 2.038, + "step": 3247 + }, + { + "epoch": 0.98, + "grad_norm": 17.04471206665039, + "learning_rate": 1.3491029367545356e-05, + "loss": 1.9324, + "step": 3248 + }, + { + "epoch": 0.98, + "grad_norm": 16.119550704956055, + "learning_rate": 1.3489024756940964e-05, + "loss": 2.1206, + "step": 3249 + }, + { + "epoch": 0.98, + "grad_norm": 16.321861267089844, + "learning_rate": 1.3487020146336576e-05, + "loss": 2.3402, + "step": 3250 + }, + { + "epoch": 0.98, + "grad_norm": 30.2368221282959, + "learning_rate": 1.3485015535732184e-05, + "loss": 1.683, + "step": 3251 + }, + { + "epoch": 0.98, + "grad_norm": 18.461002349853516, + "learning_rate": 1.3483010925127794e-05, + "loss": 2.1962, + "step": 3252 + }, + { + "epoch": 0.98, + "grad_norm": 20.721721649169922, + "learning_rate": 1.3481006314523406e-05, + "loss": 2.1013, + "step": 3253 + }, + { + "epoch": 0.98, + "grad_norm": 31.358562469482422, + "learning_rate": 1.3479001703919015e-05, + "loss": 2.7242, + "step": 3254 + }, + { + "epoch": 0.98, + "grad_norm": 20.12774085998535, + "learning_rate": 1.3476997093314625e-05, + "loss": 1.5558, + "step": 3255 + }, + { + "epoch": 0.98, + "grad_norm": 28.31780242919922, + "learning_rate": 1.3474992482710235e-05, + "loss": 2.1186, + "step": 3256 + }, + { + "epoch": 0.98, + "grad_norm": 11.361303329467773, + "learning_rate": 1.3472987872105845e-05, + "loss": 1.4945, + "step": 3257 + }, + { + "epoch": 0.98, + "grad_norm": 112.40892028808594, + "learning_rate": 1.3470983261501453e-05, + "loss": 1.87, + "step": 3258 + }, + { + "epoch": 0.98, + "grad_norm": 48.792320251464844, + "learning_rate": 1.3468978650897065e-05, + "loss": 2.829, + "step": 3259 + }, + { + "epoch": 0.98, + "grad_norm": 15.994609832763672, + "learning_rate": 1.3466974040292675e-05, + "loss": 2.33, + "step": 3260 + }, + { + "epoch": 0.98, + "grad_norm": 14.680255889892578, + "learning_rate": 1.3464969429688284e-05, + "loss": 1.5227, + "step": 3261 + }, + { + "epoch": 0.98, + "grad_norm": 20.413484573364258, + "learning_rate": 1.3462964819083895e-05, + "loss": 1.3934, + "step": 3262 + }, + { + "epoch": 0.98, + "grad_norm": 10.785961151123047, + "learning_rate": 1.3460960208479504e-05, + "loss": 1.6991, + "step": 3263 + }, + { + "epoch": 0.98, + "grad_norm": 23.43621063232422, + "learning_rate": 1.3458955597875114e-05, + "loss": 2.1189, + "step": 3264 + }, + { + "epoch": 0.98, + "grad_norm": 18.55811882019043, + "learning_rate": 1.3456950987270722e-05, + "loss": 1.6837, + "step": 3265 + }, + { + "epoch": 0.98, + "grad_norm": 25.324520111083984, + "learning_rate": 1.3454946376666334e-05, + "loss": 2.5444, + "step": 3266 + }, + { + "epoch": 0.98, + "grad_norm": 29.4528751373291, + "learning_rate": 1.3452941766061944e-05, + "loss": 1.7111, + "step": 3267 + }, + { + "epoch": 0.98, + "grad_norm": 10.598390579223633, + "learning_rate": 1.3450937155457552e-05, + "loss": 1.1832, + "step": 3268 + }, + { + "epoch": 0.98, + "grad_norm": 16.23030662536621, + "learning_rate": 1.3448932544853164e-05, + "loss": 1.845, + "step": 3269 + }, + { + "epoch": 0.98, + "grad_norm": 16.992780685424805, + "learning_rate": 1.3446927934248773e-05, + "loss": 1.5488, + "step": 3270 + }, + { + "epoch": 0.98, + "grad_norm": 16.57977294921875, + "learning_rate": 1.3444923323644383e-05, + "loss": 2.4538, + "step": 3271 + }, + { + "epoch": 0.98, + "grad_norm": 17.394916534423828, + "learning_rate": 1.3442918713039994e-05, + "loss": 2.4952, + "step": 3272 + }, + { + "epoch": 0.98, + "grad_norm": 10.907772064208984, + "learning_rate": 1.3440914102435603e-05, + "loss": 2.2693, + "step": 3273 + }, + { + "epoch": 0.98, + "grad_norm": 15.381023406982422, + "learning_rate": 1.3438909491831213e-05, + "loss": 1.9541, + "step": 3274 + }, + { + "epoch": 0.98, + "grad_norm": 13.303698539733887, + "learning_rate": 1.3436904881226821e-05, + "loss": 1.7353, + "step": 3275 + }, + { + "epoch": 0.98, + "grad_norm": 22.471403121948242, + "learning_rate": 1.3434900270622433e-05, + "loss": 1.8707, + "step": 3276 + }, + { + "epoch": 0.99, + "grad_norm": 12.483773231506348, + "learning_rate": 1.3432895660018041e-05, + "loss": 1.862, + "step": 3277 + }, + { + "epoch": 0.99, + "grad_norm": 24.261032104492188, + "learning_rate": 1.3430891049413652e-05, + "loss": 3.0355, + "step": 3278 + }, + { + "epoch": 0.99, + "grad_norm": 27.064979553222656, + "learning_rate": 1.3428886438809263e-05, + "loss": 2.6927, + "step": 3279 + }, + { + "epoch": 0.99, + "grad_norm": 23.447118759155273, + "learning_rate": 1.3426881828204872e-05, + "loss": 2.2596, + "step": 3280 + }, + { + "epoch": 0.99, + "grad_norm": 19.78642463684082, + "learning_rate": 1.3424877217600484e-05, + "loss": 2.1217, + "step": 3281 + }, + { + "epoch": 0.99, + "grad_norm": 11.758055686950684, + "learning_rate": 1.3422872606996092e-05, + "loss": 1.0749, + "step": 3282 + }, + { + "epoch": 0.99, + "grad_norm": 15.738198280334473, + "learning_rate": 1.3420867996391702e-05, + "loss": 2.25, + "step": 3283 + }, + { + "epoch": 0.99, + "grad_norm": 21.461877822875977, + "learning_rate": 1.3418863385787314e-05, + "loss": 2.1378, + "step": 3284 + }, + { + "epoch": 0.99, + "grad_norm": 39.54304504394531, + "learning_rate": 1.3416858775182922e-05, + "loss": 3.1118, + "step": 3285 + }, + { + "epoch": 0.99, + "grad_norm": 14.904977798461914, + "learning_rate": 1.3414854164578532e-05, + "loss": 1.2001, + "step": 3286 + }, + { + "epoch": 0.99, + "grad_norm": 19.5516414642334, + "learning_rate": 1.341284955397414e-05, + "loss": 2.2941, + "step": 3287 + }, + { + "epoch": 0.99, + "grad_norm": 18.030540466308594, + "learning_rate": 1.3410844943369752e-05, + "loss": 2.1409, + "step": 3288 + }, + { + "epoch": 0.99, + "grad_norm": 17.5920352935791, + "learning_rate": 1.340884033276536e-05, + "loss": 3.0416, + "step": 3289 + }, + { + "epoch": 0.99, + "grad_norm": 18.530750274658203, + "learning_rate": 1.3406835722160971e-05, + "loss": 1.1762, + "step": 3290 + }, + { + "epoch": 0.99, + "grad_norm": 15.765362739562988, + "learning_rate": 1.3404831111556583e-05, + "loss": 2.0459, + "step": 3291 + }, + { + "epoch": 0.99, + "grad_norm": 27.75279998779297, + "learning_rate": 1.3402826500952191e-05, + "loss": 2.1783, + "step": 3292 + }, + { + "epoch": 0.99, + "grad_norm": 16.64308738708496, + "learning_rate": 1.3400821890347801e-05, + "loss": 2.4156, + "step": 3293 + }, + { + "epoch": 0.99, + "grad_norm": 19.890602111816406, + "learning_rate": 1.339881727974341e-05, + "loss": 1.0157, + "step": 3294 + }, + { + "epoch": 0.99, + "grad_norm": 15.657976150512695, + "learning_rate": 1.3396812669139021e-05, + "loss": 1.3915, + "step": 3295 + }, + { + "epoch": 0.99, + "grad_norm": 14.942137718200684, + "learning_rate": 1.3394808058534631e-05, + "loss": 2.0334, + "step": 3296 + }, + { + "epoch": 0.99, + "grad_norm": 10.705955505371094, + "learning_rate": 1.339280344793024e-05, + "loss": 1.7803, + "step": 3297 + }, + { + "epoch": 0.99, + "grad_norm": 11.542771339416504, + "learning_rate": 1.3390798837325852e-05, + "loss": 1.2149, + "step": 3298 + }, + { + "epoch": 0.99, + "grad_norm": 16.316530227661133, + "learning_rate": 1.338879422672146e-05, + "loss": 2.4369, + "step": 3299 + }, + { + "epoch": 0.99, + "grad_norm": 23.6247615814209, + "learning_rate": 1.338678961611707e-05, + "loss": 2.1207, + "step": 3300 + }, + { + "epoch": 0.99, + "grad_norm": 17.557518005371094, + "learning_rate": 1.338478500551268e-05, + "loss": 1.5633, + "step": 3301 + }, + { + "epoch": 0.99, + "grad_norm": 13.458683967590332, + "learning_rate": 1.338278039490829e-05, + "loss": 2.0549, + "step": 3302 + }, + { + "epoch": 0.99, + "grad_norm": 10.493450164794922, + "learning_rate": 1.33807757843039e-05, + "loss": 2.0356, + "step": 3303 + }, + { + "epoch": 0.99, + "grad_norm": 16.16809844970703, + "learning_rate": 1.337877117369951e-05, + "loss": 1.918, + "step": 3304 + }, + { + "epoch": 0.99, + "grad_norm": 15.41076374053955, + "learning_rate": 1.337676656309512e-05, + "loss": 2.8471, + "step": 3305 + }, + { + "epoch": 0.99, + "grad_norm": 18.323535919189453, + "learning_rate": 1.3374761952490729e-05, + "loss": 1.6805, + "step": 3306 + }, + { + "epoch": 0.99, + "grad_norm": 10.888114929199219, + "learning_rate": 1.337275734188634e-05, + "loss": 1.4534, + "step": 3307 + }, + { + "epoch": 0.99, + "grad_norm": 23.824207305908203, + "learning_rate": 1.3370752731281949e-05, + "loss": 2.1954, + "step": 3308 + }, + { + "epoch": 0.99, + "grad_norm": 12.056889533996582, + "learning_rate": 1.3368748120677559e-05, + "loss": 1.9769, + "step": 3309 + }, + { + "epoch": 1.0, + "grad_norm": 20.176801681518555, + "learning_rate": 1.3366743510073171e-05, + "loss": 2.3137, + "step": 3310 + }, + { + "epoch": 1.0, + "grad_norm": 12.740705490112305, + "learning_rate": 1.336473889946878e-05, + "loss": 2.1326, + "step": 3311 + }, + { + "epoch": 1.0, + "grad_norm": 13.5363130569458, + "learning_rate": 1.336273428886439e-05, + "loss": 1.5278, + "step": 3312 + }, + { + "epoch": 1.0, + "grad_norm": 11.068243026733398, + "learning_rate": 1.3360729678259998e-05, + "loss": 1.186, + "step": 3313 + }, + { + "epoch": 1.0, + "grad_norm": 16.938138961791992, + "learning_rate": 1.335872506765561e-05, + "loss": 1.0064, + "step": 3314 + }, + { + "epoch": 1.0, + "grad_norm": 13.62263298034668, + "learning_rate": 1.335672045705122e-05, + "loss": 1.909, + "step": 3315 + }, + { + "epoch": 1.0, + "grad_norm": 11.283188819885254, + "learning_rate": 1.3354715846446828e-05, + "loss": 1.6886, + "step": 3316 + }, + { + "epoch": 1.0, + "grad_norm": 14.703868865966797, + "learning_rate": 1.335271123584244e-05, + "loss": 1.9791, + "step": 3317 + }, + { + "epoch": 1.0, + "grad_norm": 13.806490898132324, + "learning_rate": 1.3350706625238048e-05, + "loss": 1.516, + "step": 3318 + }, + { + "epoch": 1.0, + "grad_norm": 14.787043571472168, + "learning_rate": 1.3348702014633658e-05, + "loss": 1.8667, + "step": 3319 + }, + { + "epoch": 1.0, + "grad_norm": 17.8718318939209, + "learning_rate": 1.3346697404029267e-05, + "loss": 2.0014, + "step": 3320 + }, + { + "epoch": 1.0, + "grad_norm": 23.68500518798828, + "learning_rate": 1.3344692793424878e-05, + "loss": 1.7045, + "step": 3321 + }, + { + "epoch": 1.0, + "grad_norm": 10.535503387451172, + "learning_rate": 1.3342688182820488e-05, + "loss": 1.183, + "step": 3322 + }, + { + "epoch": 1.0, + "grad_norm": 25.851089477539062, + "learning_rate": 1.3340683572216097e-05, + "loss": 2.7104, + "step": 3323 + }, + { + "epoch": 1.0, + "grad_norm": 10.395030975341797, + "learning_rate": 1.3338678961611709e-05, + "loss": 1.7227, + "step": 3324 + }, + { + "epoch": 1.0, + "grad_norm": 14.099095344543457, + "learning_rate": 1.3336674351007317e-05, + "loss": 1.7941, + "step": 3325 + }, + { + "epoch": 1.0, + "grad_norm": 15.447168350219727, + "learning_rate": 1.3334669740402927e-05, + "loss": 2.0854, + "step": 3326 + }, + { + "epoch": 1.0, + "grad_norm": 20.244766235351562, + "learning_rate": 1.3332665129798539e-05, + "loss": 3.0187, + "step": 3327 + }, + { + "epoch": 1.0, + "grad_norm": 56.30267333984375, + "learning_rate": 1.3330660519194147e-05, + "loss": 2.1849, + "step": 3328 + }, + { + "epoch": 1.0, + "grad_norm": 28.362627029418945, + "learning_rate": 1.3328655908589757e-05, + "loss": 1.7998, + "step": 3329 + }, + { + "epoch": 1.0, + "grad_norm": 12.778199195861816, + "learning_rate": 1.3326651297985367e-05, + "loss": 1.5023, + "step": 3330 + }, + { + "epoch": 1.0, + "grad_norm": 9.352411270141602, + "learning_rate": 1.3324646687380978e-05, + "loss": 1.3491, + "step": 3331 + }, + { + "epoch": 1.0, + "grad_norm": 32.7597770690918, + "learning_rate": 1.3322642076776586e-05, + "loss": 2.2412, + "step": 3332 + }, + { + "epoch": 1.0, + "grad_norm": 14.371981620788574, + "learning_rate": 1.3320637466172198e-05, + "loss": 2.2198, + "step": 3333 + }, + { + "epoch": 1.0, + "grad_norm": 15.051918029785156, + "learning_rate": 1.3318632855567808e-05, + "loss": 1.5761, + "step": 3334 + }, + { + "epoch": 1.0, + "grad_norm": 18.553064346313477, + "learning_rate": 1.3316628244963416e-05, + "loss": 2.2955, + "step": 3335 + }, + { + "epoch": 1.0, + "grad_norm": 13.871806144714355, + "learning_rate": 1.3314623634359028e-05, + "loss": 1.7798, + "step": 3336 + }, + { + "epoch": 1.0, + "grad_norm": 24.853288650512695, + "learning_rate": 1.3312619023754636e-05, + "loss": 2.5074, + "step": 3337 + }, + { + "epoch": 1.0, + "grad_norm": 7.72559928894043, + "learning_rate": 1.3310614413150246e-05, + "loss": 2.0241, + "step": 3338 + }, + { + "epoch": 1.0, + "grad_norm": 67.96630096435547, + "learning_rate": 1.3308609802545858e-05, + "loss": 2.9062, + "step": 3339 + }, + { + "epoch": 1.0, + "grad_norm": 15.404458045959473, + "learning_rate": 1.3306605191941467e-05, + "loss": 1.2167, + "step": 3340 + }, + { + "epoch": 1.0, + "grad_norm": 20.57508087158203, + "learning_rate": 1.3304600581337077e-05, + "loss": 2.3393, + "step": 3341 + }, + { + "epoch": 1.0, + "grad_norm": 13.644542694091797, + "learning_rate": 1.3302595970732685e-05, + "loss": 2.7504, + "step": 3342 + }, + { + "epoch": 1.01, + "grad_norm": 27.4990291595459, + "learning_rate": 1.3300591360128297e-05, + "loss": 2.4179, + "step": 3343 + }, + { + "epoch": 1.01, + "grad_norm": 18.17327308654785, + "learning_rate": 1.3298586749523905e-05, + "loss": 1.8711, + "step": 3344 + }, + { + "epoch": 1.01, + "grad_norm": 11.454172134399414, + "learning_rate": 1.3296582138919515e-05, + "loss": 2.2247, + "step": 3345 + }, + { + "epoch": 1.01, + "grad_norm": 14.537096977233887, + "learning_rate": 1.3294577528315127e-05, + "loss": 1.3509, + "step": 3346 + }, + { + "epoch": 1.01, + "grad_norm": 17.769039154052734, + "learning_rate": 1.3292572917710736e-05, + "loss": 1.6675, + "step": 3347 + }, + { + "epoch": 1.01, + "grad_norm": 20.219247817993164, + "learning_rate": 1.3290568307106346e-05, + "loss": 2.1718, + "step": 3348 + }, + { + "epoch": 1.01, + "grad_norm": 12.704767227172852, + "learning_rate": 1.3288563696501956e-05, + "loss": 1.2209, + "step": 3349 + }, + { + "epoch": 1.01, + "grad_norm": 15.621160507202148, + "learning_rate": 1.3286559085897566e-05, + "loss": 1.5298, + "step": 3350 + }, + { + "epoch": 1.01, + "grad_norm": 17.41335678100586, + "learning_rate": 1.3284554475293174e-05, + "loss": 2.7119, + "step": 3351 + }, + { + "epoch": 1.01, + "grad_norm": 27.469499588012695, + "learning_rate": 1.3282549864688786e-05, + "loss": 2.821, + "step": 3352 + }, + { + "epoch": 1.01, + "grad_norm": 47.88648986816406, + "learning_rate": 1.3280545254084396e-05, + "loss": 2.4446, + "step": 3353 + }, + { + "epoch": 1.01, + "grad_norm": 9.525798797607422, + "learning_rate": 1.3278540643480004e-05, + "loss": 0.9091, + "step": 3354 + }, + { + "epoch": 1.01, + "grad_norm": 22.738584518432617, + "learning_rate": 1.3276536032875616e-05, + "loss": 2.3079, + "step": 3355 + }, + { + "epoch": 1.01, + "grad_norm": 11.072502136230469, + "learning_rate": 1.3274531422271225e-05, + "loss": 2.7678, + "step": 3356 + }, + { + "epoch": 1.01, + "grad_norm": 12.42871379852295, + "learning_rate": 1.3272526811666835e-05, + "loss": 1.1835, + "step": 3357 + }, + { + "epoch": 1.01, + "grad_norm": 10.291549682617188, + "learning_rate": 1.3270522201062446e-05, + "loss": 1.3435, + "step": 3358 + }, + { + "epoch": 1.01, + "grad_norm": 18.563772201538086, + "learning_rate": 1.3268517590458055e-05, + "loss": 1.2068, + "step": 3359 + }, + { + "epoch": 1.01, + "grad_norm": 16.957143783569336, + "learning_rate": 1.3266512979853665e-05, + "loss": 2.197, + "step": 3360 + }, + { + "epoch": 1.01, + "eval_loss": 0.22327160835266113, + "eval_runtime": 43.5349, + "eval_samples_per_second": 33.973, + "eval_steps_per_second": 33.973, + "step": 3360 + }, + { + "epoch": 1.01, + "grad_norm": 12.542174339294434, + "learning_rate": 1.3264508369249273e-05, + "loss": 1.7032, + "step": 3361 + }, + { + "epoch": 1.01, + "grad_norm": 12.516151428222656, + "learning_rate": 1.3262503758644885e-05, + "loss": 1.4823, + "step": 3362 + }, + { + "epoch": 1.01, + "grad_norm": 13.449585914611816, + "learning_rate": 1.3260499148040493e-05, + "loss": 1.9671, + "step": 3363 + }, + { + "epoch": 1.01, + "grad_norm": 20.43147850036621, + "learning_rate": 1.3258494537436104e-05, + "loss": 1.5756, + "step": 3364 + }, + { + "epoch": 1.01, + "grad_norm": 9.373948097229004, + "learning_rate": 1.3256489926831715e-05, + "loss": 2.1166, + "step": 3365 + }, + { + "epoch": 1.01, + "grad_norm": 28.55181312561035, + "learning_rate": 1.3254485316227324e-05, + "loss": 2.7417, + "step": 3366 + }, + { + "epoch": 1.01, + "grad_norm": 23.936809539794922, + "learning_rate": 1.3252480705622934e-05, + "loss": 1.5005, + "step": 3367 + }, + { + "epoch": 1.01, + "grad_norm": 14.939528465270996, + "learning_rate": 1.3250476095018542e-05, + "loss": 2.3812, + "step": 3368 + }, + { + "epoch": 1.01, + "grad_norm": 23.387401580810547, + "learning_rate": 1.3248471484414154e-05, + "loss": 2.5629, + "step": 3369 + }, + { + "epoch": 1.01, + "grad_norm": 17.187917709350586, + "learning_rate": 1.3246466873809764e-05, + "loss": 1.8393, + "step": 3370 + }, + { + "epoch": 1.01, + "grad_norm": 15.523499488830566, + "learning_rate": 1.3244462263205372e-05, + "loss": 2.066, + "step": 3371 + }, + { + "epoch": 1.01, + "grad_norm": 8.489646911621094, + "learning_rate": 1.3242457652600984e-05, + "loss": 1.5775, + "step": 3372 + }, + { + "epoch": 1.01, + "grad_norm": 12.514199256896973, + "learning_rate": 1.3240453041996593e-05, + "loss": 1.3532, + "step": 3373 + }, + { + "epoch": 1.01, + "grad_norm": 39.9799919128418, + "learning_rate": 1.3238448431392203e-05, + "loss": 1.7734, + "step": 3374 + }, + { + "epoch": 1.01, + "grad_norm": 12.430253028869629, + "learning_rate": 1.3236443820787813e-05, + "loss": 2.0699, + "step": 3375 + }, + { + "epoch": 1.02, + "grad_norm": 32.21778106689453, + "learning_rate": 1.3234439210183423e-05, + "loss": 2.4161, + "step": 3376 + }, + { + "epoch": 1.02, + "grad_norm": 10.491954803466797, + "learning_rate": 1.3232434599579033e-05, + "loss": 2.0041, + "step": 3377 + }, + { + "epoch": 1.02, + "grad_norm": 24.133899688720703, + "learning_rate": 1.3230429988974643e-05, + "loss": 2.884, + "step": 3378 + }, + { + "epoch": 1.02, + "grad_norm": 13.717731475830078, + "learning_rate": 1.3228425378370253e-05, + "loss": 2.4387, + "step": 3379 + }, + { + "epoch": 1.02, + "grad_norm": 20.04202651977539, + "learning_rate": 1.3226420767765862e-05, + "loss": 1.764, + "step": 3380 + }, + { + "epoch": 1.02, + "grad_norm": 16.394018173217773, + "learning_rate": 1.3224416157161473e-05, + "loss": 1.6656, + "step": 3381 + }, + { + "epoch": 1.02, + "grad_norm": 6.424349308013916, + "learning_rate": 1.3222411546557083e-05, + "loss": 1.2659, + "step": 3382 + }, + { + "epoch": 1.02, + "grad_norm": 12.825157165527344, + "learning_rate": 1.3220406935952692e-05, + "loss": 1.5172, + "step": 3383 + }, + { + "epoch": 1.02, + "grad_norm": 32.225006103515625, + "learning_rate": 1.3218402325348304e-05, + "loss": 2.4318, + "step": 3384 + }, + { + "epoch": 1.02, + "grad_norm": 10.880355834960938, + "learning_rate": 1.3216397714743912e-05, + "loss": 1.5311, + "step": 3385 + }, + { + "epoch": 1.02, + "grad_norm": 23.808944702148438, + "learning_rate": 1.3214393104139522e-05, + "loss": 2.809, + "step": 3386 + }, + { + "epoch": 1.02, + "grad_norm": 18.621950149536133, + "learning_rate": 1.321238849353513e-05, + "loss": 1.5185, + "step": 3387 + }, + { + "epoch": 1.02, + "grad_norm": 13.235946655273438, + "learning_rate": 1.3210383882930742e-05, + "loss": 1.5264, + "step": 3388 + }, + { + "epoch": 1.02, + "grad_norm": 12.991135597229004, + "learning_rate": 1.3208379272326352e-05, + "loss": 1.8659, + "step": 3389 + }, + { + "epoch": 1.02, + "grad_norm": 38.41598129272461, + "learning_rate": 1.320637466172196e-05, + "loss": 2.2308, + "step": 3390 + }, + { + "epoch": 1.02, + "grad_norm": 16.32402801513672, + "learning_rate": 1.3204370051117572e-05, + "loss": 1.7613, + "step": 3391 + }, + { + "epoch": 1.02, + "grad_norm": 18.9952335357666, + "learning_rate": 1.320236544051318e-05, + "loss": 1.6083, + "step": 3392 + }, + { + "epoch": 1.02, + "grad_norm": 20.658859252929688, + "learning_rate": 1.3200360829908791e-05, + "loss": 1.7974, + "step": 3393 + }, + { + "epoch": 1.02, + "grad_norm": 15.612691879272461, + "learning_rate": 1.31983562193044e-05, + "loss": 2.5692, + "step": 3394 + }, + { + "epoch": 1.02, + "grad_norm": 49.652915954589844, + "learning_rate": 1.3196351608700011e-05, + "loss": 2.6861, + "step": 3395 + }, + { + "epoch": 1.02, + "grad_norm": 10.916786193847656, + "learning_rate": 1.3194346998095621e-05, + "loss": 1.4085, + "step": 3396 + }, + { + "epoch": 1.02, + "grad_norm": 21.12449073791504, + "learning_rate": 1.3192342387491231e-05, + "loss": 2.6789, + "step": 3397 + }, + { + "epoch": 1.02, + "grad_norm": 18.018606185913086, + "learning_rate": 1.3190337776886841e-05, + "loss": 1.6874, + "step": 3398 + }, + { + "epoch": 1.02, + "grad_norm": 22.249736785888672, + "learning_rate": 1.318833316628245e-05, + "loss": 2.0898, + "step": 3399 + }, + { + "epoch": 1.02, + "grad_norm": 14.536703109741211, + "learning_rate": 1.3186328555678062e-05, + "loss": 2.2679, + "step": 3400 + }, + { + "epoch": 1.02, + "grad_norm": 16.38414764404297, + "learning_rate": 1.3184323945073672e-05, + "loss": 1.3831, + "step": 3401 + }, + { + "epoch": 1.02, + "grad_norm": 16.283510208129883, + "learning_rate": 1.318231933446928e-05, + "loss": 1.622, + "step": 3402 + }, + { + "epoch": 1.02, + "grad_norm": 12.426440238952637, + "learning_rate": 1.3180314723864892e-05, + "loss": 1.574, + "step": 3403 + }, + { + "epoch": 1.02, + "grad_norm": 10.712419509887695, + "learning_rate": 1.31783101132605e-05, + "loss": 1.8414, + "step": 3404 + }, + { + "epoch": 1.02, + "grad_norm": 12.0217924118042, + "learning_rate": 1.317630550265611e-05, + "loss": 1.817, + "step": 3405 + }, + { + "epoch": 1.02, + "grad_norm": 10.142142295837402, + "learning_rate": 1.3174300892051719e-05, + "loss": 1.7129, + "step": 3406 + }, + { + "epoch": 1.02, + "grad_norm": 17.107044219970703, + "learning_rate": 1.317229628144733e-05, + "loss": 2.2242, + "step": 3407 + }, + { + "epoch": 1.02, + "grad_norm": 30.633485794067383, + "learning_rate": 1.317029167084294e-05, + "loss": 2.3054, + "step": 3408 + }, + { + "epoch": 1.02, + "grad_norm": 10.093416213989258, + "learning_rate": 1.3168287060238549e-05, + "loss": 1.6337, + "step": 3409 + }, + { + "epoch": 1.03, + "grad_norm": 16.132856369018555, + "learning_rate": 1.316628244963416e-05, + "loss": 1.8572, + "step": 3410 + }, + { + "epoch": 1.03, + "grad_norm": 26.83838653564453, + "learning_rate": 1.3164277839029769e-05, + "loss": 1.6212, + "step": 3411 + }, + { + "epoch": 1.03, + "grad_norm": 18.7099609375, + "learning_rate": 1.3162273228425379e-05, + "loss": 2.4694, + "step": 3412 + }, + { + "epoch": 1.03, + "grad_norm": 15.113785743713379, + "learning_rate": 1.3160268617820991e-05, + "loss": 1.8307, + "step": 3413 + }, + { + "epoch": 1.03, + "grad_norm": 13.659159660339355, + "learning_rate": 1.31582640072166e-05, + "loss": 2.0593, + "step": 3414 + }, + { + "epoch": 1.03, + "grad_norm": 25.388036727905273, + "learning_rate": 1.315625939661221e-05, + "loss": 2.2341, + "step": 3415 + }, + { + "epoch": 1.03, + "grad_norm": 7.588253021240234, + "learning_rate": 1.3154254786007818e-05, + "loss": 1.8049, + "step": 3416 + }, + { + "epoch": 1.03, + "grad_norm": 18.153966903686523, + "learning_rate": 1.315225017540343e-05, + "loss": 2.1183, + "step": 3417 + }, + { + "epoch": 1.03, + "grad_norm": 12.177376747131348, + "learning_rate": 1.3150245564799038e-05, + "loss": 1.611, + "step": 3418 + }, + { + "epoch": 1.03, + "grad_norm": 15.74305534362793, + "learning_rate": 1.3148240954194648e-05, + "loss": 2.5822, + "step": 3419 + }, + { + "epoch": 1.03, + "grad_norm": 13.683473587036133, + "learning_rate": 1.314623634359026e-05, + "loss": 1.4373, + "step": 3420 + }, + { + "epoch": 1.03, + "grad_norm": 17.658405303955078, + "learning_rate": 1.3144231732985868e-05, + "loss": 1.9642, + "step": 3421 + }, + { + "epoch": 1.03, + "grad_norm": 10.713934898376465, + "learning_rate": 1.3142227122381478e-05, + "loss": 1.7652, + "step": 3422 + }, + { + "epoch": 1.03, + "grad_norm": 10.191012382507324, + "learning_rate": 1.3140222511777088e-05, + "loss": 1.7921, + "step": 3423 + }, + { + "epoch": 1.03, + "grad_norm": 9.874408721923828, + "learning_rate": 1.3138217901172698e-05, + "loss": 0.9375, + "step": 3424 + }, + { + "epoch": 1.03, + "grad_norm": 18.47734260559082, + "learning_rate": 1.3136213290568307e-05, + "loss": 2.3192, + "step": 3425 + }, + { + "epoch": 1.03, + "grad_norm": 16.030399322509766, + "learning_rate": 1.3134208679963919e-05, + "loss": 1.957, + "step": 3426 + }, + { + "epoch": 1.03, + "grad_norm": 8.997057914733887, + "learning_rate": 1.3132204069359529e-05, + "loss": 1.5734, + "step": 3427 + }, + { + "epoch": 1.03, + "grad_norm": 174.50979614257812, + "learning_rate": 1.3130199458755137e-05, + "loss": 1.8404, + "step": 3428 + }, + { + "epoch": 1.03, + "grad_norm": 15.005576133728027, + "learning_rate": 1.3128194848150749e-05, + "loss": 1.4347, + "step": 3429 + }, + { + "epoch": 1.03, + "grad_norm": 13.528023719787598, + "learning_rate": 1.3126190237546357e-05, + "loss": 2.0983, + "step": 3430 + }, + { + "epoch": 1.03, + "grad_norm": 61.08424377441406, + "learning_rate": 1.3124185626941967e-05, + "loss": 1.3849, + "step": 3431 + }, + { + "epoch": 1.03, + "grad_norm": 19.432321548461914, + "learning_rate": 1.3122181016337579e-05, + "loss": 2.5755, + "step": 3432 + }, + { + "epoch": 1.03, + "grad_norm": 10.8148775100708, + "learning_rate": 1.3120176405733188e-05, + "loss": 2.1681, + "step": 3433 + }, + { + "epoch": 1.03, + "grad_norm": 19.197267532348633, + "learning_rate": 1.3118171795128798e-05, + "loss": 1.7698, + "step": 3434 + }, + { + "epoch": 1.03, + "grad_norm": 16.004776000976562, + "learning_rate": 1.3116167184524406e-05, + "loss": 2.1169, + "step": 3435 + }, + { + "epoch": 1.03, + "grad_norm": 36.55406188964844, + "learning_rate": 1.3114162573920018e-05, + "loss": 2.0371, + "step": 3436 + }, + { + "epoch": 1.03, + "grad_norm": 20.354867935180664, + "learning_rate": 1.3112157963315626e-05, + "loss": 1.5596, + "step": 3437 + }, + { + "epoch": 1.03, + "grad_norm": 13.395331382751465, + "learning_rate": 1.3110153352711236e-05, + "loss": 1.3172, + "step": 3438 + }, + { + "epoch": 1.03, + "grad_norm": 17.100326538085938, + "learning_rate": 1.3108148742106848e-05, + "loss": 1.9774, + "step": 3439 + }, + { + "epoch": 1.03, + "grad_norm": 13.835416793823242, + "learning_rate": 1.3106144131502456e-05, + "loss": 1.8643, + "step": 3440 + }, + { + "epoch": 1.03, + "grad_norm": 15.755050659179688, + "learning_rate": 1.3104139520898067e-05, + "loss": 3.0466, + "step": 3441 + }, + { + "epoch": 1.03, + "grad_norm": 19.994091033935547, + "learning_rate": 1.3102134910293675e-05, + "loss": 1.9489, + "step": 3442 + }, + { + "epoch": 1.04, + "grad_norm": 10.072383880615234, + "learning_rate": 1.3100130299689287e-05, + "loss": 2.0764, + "step": 3443 + }, + { + "epoch": 1.04, + "grad_norm": 10.777514457702637, + "learning_rate": 1.3098125689084897e-05, + "loss": 1.2584, + "step": 3444 + }, + { + "epoch": 1.04, + "grad_norm": 24.992101669311523, + "learning_rate": 1.3096121078480505e-05, + "loss": 2.3589, + "step": 3445 + }, + { + "epoch": 1.04, + "grad_norm": 13.744710922241211, + "learning_rate": 1.3094116467876117e-05, + "loss": 1.6164, + "step": 3446 + }, + { + "epoch": 1.04, + "grad_norm": 9.5302095413208, + "learning_rate": 1.3092111857271725e-05, + "loss": 1.4418, + "step": 3447 + }, + { + "epoch": 1.04, + "grad_norm": 16.148855209350586, + "learning_rate": 1.3090107246667335e-05, + "loss": 1.4431, + "step": 3448 + }, + { + "epoch": 1.04, + "grad_norm": 31.971755981445312, + "learning_rate": 1.3088102636062945e-05, + "loss": 1.506, + "step": 3449 + }, + { + "epoch": 1.04, + "grad_norm": 18.346967697143555, + "learning_rate": 1.3086098025458556e-05, + "loss": 1.6131, + "step": 3450 + }, + { + "epoch": 1.04, + "grad_norm": 29.8282527923584, + "learning_rate": 1.3084093414854167e-05, + "loss": 2.3438, + "step": 3451 + }, + { + "epoch": 1.04, + "grad_norm": 19.195152282714844, + "learning_rate": 1.3082088804249776e-05, + "loss": 1.9727, + "step": 3452 + }, + { + "epoch": 1.04, + "grad_norm": 17.298931121826172, + "learning_rate": 1.3080084193645386e-05, + "loss": 1.5604, + "step": 3453 + }, + { + "epoch": 1.04, + "grad_norm": 10.650527954101562, + "learning_rate": 1.3078079583040994e-05, + "loss": 1.0794, + "step": 3454 + }, + { + "epoch": 1.04, + "grad_norm": 16.008625030517578, + "learning_rate": 1.3076074972436606e-05, + "loss": 1.7114, + "step": 3455 + }, + { + "epoch": 1.04, + "grad_norm": 15.615730285644531, + "learning_rate": 1.3074070361832216e-05, + "loss": 2.0135, + "step": 3456 + }, + { + "epoch": 1.04, + "grad_norm": 17.91425132751465, + "learning_rate": 1.3072065751227824e-05, + "loss": 1.8132, + "step": 3457 + }, + { + "epoch": 1.04, + "grad_norm": 16.860435485839844, + "learning_rate": 1.3070061140623436e-05, + "loss": 1.9834, + "step": 3458 + }, + { + "epoch": 1.04, + "grad_norm": 12.117476463317871, + "learning_rate": 1.3068056530019045e-05, + "loss": 1.62, + "step": 3459 + }, + { + "epoch": 1.04, + "grad_norm": 18.71920394897461, + "learning_rate": 1.3066051919414655e-05, + "loss": 2.2109, + "step": 3460 + }, + { + "epoch": 1.04, + "grad_norm": 18.306224822998047, + "learning_rate": 1.3064047308810263e-05, + "loss": 2.2506, + "step": 3461 + }, + { + "epoch": 1.04, + "grad_norm": 27.221975326538086, + "learning_rate": 1.3062042698205875e-05, + "loss": 2.4873, + "step": 3462 + }, + { + "epoch": 1.04, + "grad_norm": 31.973318099975586, + "learning_rate": 1.3060038087601485e-05, + "loss": 2.6493, + "step": 3463 + }, + { + "epoch": 1.04, + "grad_norm": 10.039262771606445, + "learning_rate": 1.3058033476997093e-05, + "loss": 1.6217, + "step": 3464 + }, + { + "epoch": 1.04, + "grad_norm": 17.775131225585938, + "learning_rate": 1.3056028866392705e-05, + "loss": 1.6111, + "step": 3465 + }, + { + "epoch": 1.04, + "grad_norm": 41.91878890991211, + "learning_rate": 1.3054024255788314e-05, + "loss": 2.4916, + "step": 3466 + }, + { + "epoch": 1.04, + "grad_norm": 19.899276733398438, + "learning_rate": 1.3052019645183924e-05, + "loss": 2.2337, + "step": 3467 + }, + { + "epoch": 1.04, + "grad_norm": 12.127092361450195, + "learning_rate": 1.3050015034579534e-05, + "loss": 1.9265, + "step": 3468 + }, + { + "epoch": 1.04, + "grad_norm": 28.322265625, + "learning_rate": 1.3048010423975144e-05, + "loss": 1.8237, + "step": 3469 + }, + { + "epoch": 1.04, + "grad_norm": 11.918156623840332, + "learning_rate": 1.3046005813370754e-05, + "loss": 2.1019, + "step": 3470 + }, + { + "epoch": 1.04, + "grad_norm": 10.887943267822266, + "learning_rate": 1.3044001202766364e-05, + "loss": 2.118, + "step": 3471 + }, + { + "epoch": 1.04, + "grad_norm": 14.735898971557617, + "learning_rate": 1.3041996592161974e-05, + "loss": 1.5392, + "step": 3472 + }, + { + "epoch": 1.04, + "grad_norm": 22.728878021240234, + "learning_rate": 1.3039991981557582e-05, + "loss": 1.9459, + "step": 3473 + }, + { + "epoch": 1.04, + "grad_norm": 9.619507789611816, + "learning_rate": 1.3037987370953194e-05, + "loss": 1.1765, + "step": 3474 + }, + { + "epoch": 1.04, + "grad_norm": 13.704413414001465, + "learning_rate": 1.3035982760348804e-05, + "loss": 1.5395, + "step": 3475 + }, + { + "epoch": 1.05, + "grad_norm": 13.936915397644043, + "learning_rate": 1.3033978149744413e-05, + "loss": 1.2055, + "step": 3476 + }, + { + "epoch": 1.05, + "grad_norm": 23.567424774169922, + "learning_rate": 1.3031973539140024e-05, + "loss": 1.7821, + "step": 3477 + }, + { + "epoch": 1.05, + "grad_norm": 17.09320640563965, + "learning_rate": 1.3029968928535633e-05, + "loss": 2.4582, + "step": 3478 + }, + { + "epoch": 1.05, + "grad_norm": 12.296806335449219, + "learning_rate": 1.3027964317931243e-05, + "loss": 1.4382, + "step": 3479 + }, + { + "epoch": 1.05, + "grad_norm": 12.535650253295898, + "learning_rate": 1.3025959707326851e-05, + "loss": 1.2141, + "step": 3480 + }, + { + "epoch": 1.05, + "eval_loss": 0.22333772480487823, + "eval_runtime": 43.6637, + "eval_samples_per_second": 33.873, + "eval_steps_per_second": 33.873, + "step": 3480 + }, + { + "epoch": 1.05, + "grad_norm": 8.423393249511719, + "learning_rate": 1.3023955096722463e-05, + "loss": 1.2467, + "step": 3481 + }, + { + "epoch": 1.05, + "grad_norm": 13.054819107055664, + "learning_rate": 1.3021950486118073e-05, + "loss": 1.2692, + "step": 3482 + }, + { + "epoch": 1.05, + "grad_norm": 67.5218734741211, + "learning_rate": 1.3019945875513682e-05, + "loss": 2.0009, + "step": 3483 + }, + { + "epoch": 1.05, + "grad_norm": 51.58919143676758, + "learning_rate": 1.3017941264909293e-05, + "loss": 3.2618, + "step": 3484 + }, + { + "epoch": 1.05, + "grad_norm": 24.077619552612305, + "learning_rate": 1.3015936654304902e-05, + "loss": 2.4753, + "step": 3485 + }, + { + "epoch": 1.05, + "grad_norm": 25.09623146057129, + "learning_rate": 1.3013932043700512e-05, + "loss": 1.9661, + "step": 3486 + }, + { + "epoch": 1.05, + "grad_norm": 33.02434539794922, + "learning_rate": 1.3011927433096124e-05, + "loss": 2.3949, + "step": 3487 + }, + { + "epoch": 1.05, + "grad_norm": 16.488039016723633, + "learning_rate": 1.3009922822491732e-05, + "loss": 2.4064, + "step": 3488 + }, + { + "epoch": 1.05, + "grad_norm": 24.7977237701416, + "learning_rate": 1.3007918211887342e-05, + "loss": 1.2102, + "step": 3489 + }, + { + "epoch": 1.05, + "grad_norm": 16.36675262451172, + "learning_rate": 1.300591360128295e-05, + "loss": 1.4976, + "step": 3490 + }, + { + "epoch": 1.05, + "grad_norm": 16.940837860107422, + "learning_rate": 1.3003908990678562e-05, + "loss": 1.9573, + "step": 3491 + }, + { + "epoch": 1.05, + "grad_norm": 17.5327091217041, + "learning_rate": 1.300190438007417e-05, + "loss": 1.9867, + "step": 3492 + }, + { + "epoch": 1.05, + "grad_norm": 14.204058647155762, + "learning_rate": 1.299989976946978e-05, + "loss": 1.8624, + "step": 3493 + }, + { + "epoch": 1.05, + "grad_norm": 41.586021423339844, + "learning_rate": 1.2997895158865392e-05, + "loss": 3.3605, + "step": 3494 + }, + { + "epoch": 1.05, + "grad_norm": 18.94367218017578, + "learning_rate": 1.2995890548261001e-05, + "loss": 1.3739, + "step": 3495 + }, + { + "epoch": 1.05, + "grad_norm": 23.99241828918457, + "learning_rate": 1.2993885937656611e-05, + "loss": 2.0589, + "step": 3496 + }, + { + "epoch": 1.05, + "grad_norm": 11.752195358276367, + "learning_rate": 1.2991881327052221e-05, + "loss": 1.3307, + "step": 3497 + }, + { + "epoch": 1.05, + "grad_norm": 16.153244018554688, + "learning_rate": 1.2989876716447831e-05, + "loss": 1.6504, + "step": 3498 + }, + { + "epoch": 1.05, + "grad_norm": 15.57225513458252, + "learning_rate": 1.2987872105843441e-05, + "loss": 2.247, + "step": 3499 + }, + { + "epoch": 1.05, + "grad_norm": 14.485071182250977, + "learning_rate": 1.2985867495239051e-05, + "loss": 1.7557, + "step": 3500 + }, + { + "epoch": 1.05, + "grad_norm": 8.054557800292969, + "learning_rate": 1.2983862884634661e-05, + "loss": 1.3367, + "step": 3501 + }, + { + "epoch": 1.05, + "grad_norm": 19.42696189880371, + "learning_rate": 1.298185827403027e-05, + "loss": 1.7359, + "step": 3502 + }, + { + "epoch": 1.05, + "grad_norm": 21.340444564819336, + "learning_rate": 1.2979853663425882e-05, + "loss": 2.8674, + "step": 3503 + }, + { + "epoch": 1.05, + "grad_norm": 16.97898292541504, + "learning_rate": 1.297784905282149e-05, + "loss": 2.4501, + "step": 3504 + }, + { + "epoch": 1.05, + "grad_norm": 17.556812286376953, + "learning_rate": 1.29758444422171e-05, + "loss": 1.421, + "step": 3505 + }, + { + "epoch": 1.05, + "grad_norm": 12.702077865600586, + "learning_rate": 1.2973839831612712e-05, + "loss": 2.1779, + "step": 3506 + }, + { + "epoch": 1.05, + "grad_norm": 17.422500610351562, + "learning_rate": 1.297183522100832e-05, + "loss": 2.1388, + "step": 3507 + }, + { + "epoch": 1.05, + "grad_norm": 27.59862518310547, + "learning_rate": 1.296983061040393e-05, + "loss": 3.23, + "step": 3508 + }, + { + "epoch": 1.06, + "grad_norm": 11.814319610595703, + "learning_rate": 1.2967825999799539e-05, + "loss": 1.5021, + "step": 3509 + }, + { + "epoch": 1.06, + "grad_norm": 10.947737693786621, + "learning_rate": 1.296582138919515e-05, + "loss": 1.298, + "step": 3510 + }, + { + "epoch": 1.06, + "grad_norm": 14.834168434143066, + "learning_rate": 1.2963816778590759e-05, + "loss": 1.6569, + "step": 3511 + }, + { + "epoch": 1.06, + "grad_norm": 27.23177719116211, + "learning_rate": 1.2961812167986369e-05, + "loss": 1.6755, + "step": 3512 + }, + { + "epoch": 1.06, + "grad_norm": 16.08486557006836, + "learning_rate": 1.295980755738198e-05, + "loss": 1.8572, + "step": 3513 + }, + { + "epoch": 1.06, + "grad_norm": 23.723173141479492, + "learning_rate": 1.2957802946777589e-05, + "loss": 2.7123, + "step": 3514 + }, + { + "epoch": 1.06, + "grad_norm": 15.125565528869629, + "learning_rate": 1.29557983361732e-05, + "loss": 2.5594, + "step": 3515 + }, + { + "epoch": 1.06, + "grad_norm": 25.76398468017578, + "learning_rate": 1.295379372556881e-05, + "loss": 1.7984, + "step": 3516 + }, + { + "epoch": 1.06, + "grad_norm": 18.840059280395508, + "learning_rate": 1.295178911496442e-05, + "loss": 2.232, + "step": 3517 + }, + { + "epoch": 1.06, + "grad_norm": 35.28987503051758, + "learning_rate": 1.294978450436003e-05, + "loss": 2.267, + "step": 3518 + }, + { + "epoch": 1.06, + "grad_norm": 7.588108062744141, + "learning_rate": 1.294777989375564e-05, + "loss": 1.2528, + "step": 3519 + }, + { + "epoch": 1.06, + "grad_norm": 13.926599502563477, + "learning_rate": 1.294577528315125e-05, + "loss": 1.7358, + "step": 3520 + }, + { + "epoch": 1.06, + "grad_norm": 16.25490951538086, + "learning_rate": 1.2943770672546858e-05, + "loss": 2.2304, + "step": 3521 + }, + { + "epoch": 1.06, + "grad_norm": 30.820024490356445, + "learning_rate": 1.294176606194247e-05, + "loss": 2.0343, + "step": 3522 + }, + { + "epoch": 1.06, + "grad_norm": 15.902755737304688, + "learning_rate": 1.2939761451338078e-05, + "loss": 1.5815, + "step": 3523 + }, + { + "epoch": 1.06, + "grad_norm": 22.500484466552734, + "learning_rate": 1.2937756840733688e-05, + "loss": 2.0932, + "step": 3524 + }, + { + "epoch": 1.06, + "grad_norm": 11.80160903930664, + "learning_rate": 1.29357522301293e-05, + "loss": 1.8902, + "step": 3525 + }, + { + "epoch": 1.06, + "grad_norm": 24.187225341796875, + "learning_rate": 1.2933747619524908e-05, + "loss": 1.9731, + "step": 3526 + }, + { + "epoch": 1.06, + "grad_norm": 13.519028663635254, + "learning_rate": 1.2931743008920519e-05, + "loss": 1.9801, + "step": 3527 + }, + { + "epoch": 1.06, + "grad_norm": 54.62118911743164, + "learning_rate": 1.2929738398316127e-05, + "loss": 3.4219, + "step": 3528 + }, + { + "epoch": 1.06, + "grad_norm": 12.734724044799805, + "learning_rate": 1.2927733787711739e-05, + "loss": 2.3175, + "step": 3529 + }, + { + "epoch": 1.06, + "grad_norm": 15.261761665344238, + "learning_rate": 1.2925729177107349e-05, + "loss": 2.6037, + "step": 3530 + }, + { + "epoch": 1.06, + "grad_norm": 19.809280395507812, + "learning_rate": 1.2923724566502957e-05, + "loss": 1.9901, + "step": 3531 + }, + { + "epoch": 1.06, + "grad_norm": 18.573335647583008, + "learning_rate": 1.2921719955898569e-05, + "loss": 2.4825, + "step": 3532 + }, + { + "epoch": 1.06, + "grad_norm": 17.13604736328125, + "learning_rate": 1.2919715345294177e-05, + "loss": 1.3552, + "step": 3533 + }, + { + "epoch": 1.06, + "grad_norm": 14.92817497253418, + "learning_rate": 1.2917710734689787e-05, + "loss": 2.1547, + "step": 3534 + }, + { + "epoch": 1.06, + "grad_norm": 26.906400680541992, + "learning_rate": 1.2915706124085396e-05, + "loss": 1.741, + "step": 3535 + }, + { + "epoch": 1.06, + "grad_norm": 17.950481414794922, + "learning_rate": 1.2913701513481008e-05, + "loss": 1.7533, + "step": 3536 + }, + { + "epoch": 1.06, + "grad_norm": 18.598140716552734, + "learning_rate": 1.2911696902876618e-05, + "loss": 1.9817, + "step": 3537 + }, + { + "epoch": 1.06, + "grad_norm": 20.901409149169922, + "learning_rate": 1.2909692292272226e-05, + "loss": 1.8785, + "step": 3538 + }, + { + "epoch": 1.06, + "grad_norm": 13.569160461425781, + "learning_rate": 1.2907687681667838e-05, + "loss": 2.0731, + "step": 3539 + }, + { + "epoch": 1.06, + "grad_norm": 16.909326553344727, + "learning_rate": 1.2905683071063446e-05, + "loss": 1.912, + "step": 3540 + }, + { + "epoch": 1.06, + "grad_norm": 15.05936050415039, + "learning_rate": 1.2903678460459056e-05, + "loss": 3.6215, + "step": 3541 + }, + { + "epoch": 1.06, + "grad_norm": 12.6626615524292, + "learning_rate": 1.2901673849854668e-05, + "loss": 1.6078, + "step": 3542 + }, + { + "epoch": 1.07, + "grad_norm": 12.250699043273926, + "learning_rate": 1.2899669239250276e-05, + "loss": 1.9456, + "step": 3543 + }, + { + "epoch": 1.07, + "grad_norm": 28.121313095092773, + "learning_rate": 1.2897664628645887e-05, + "loss": 2.6878, + "step": 3544 + }, + { + "epoch": 1.07, + "grad_norm": 21.93025779724121, + "learning_rate": 1.2895660018041497e-05, + "loss": 2.364, + "step": 3545 + }, + { + "epoch": 1.07, + "grad_norm": 11.087515830993652, + "learning_rate": 1.2893655407437107e-05, + "loss": 2.0794, + "step": 3546 + }, + { + "epoch": 1.07, + "grad_norm": 38.5289421081543, + "learning_rate": 1.2891650796832715e-05, + "loss": 2.4577, + "step": 3547 + }, + { + "epoch": 1.07, + "grad_norm": 29.22125816345215, + "learning_rate": 1.2889646186228327e-05, + "loss": 2.4881, + "step": 3548 + }, + { + "epoch": 1.07, + "grad_norm": 25.916982650756836, + "learning_rate": 1.2887641575623937e-05, + "loss": 1.4721, + "step": 3549 + }, + { + "epoch": 1.07, + "grad_norm": 20.641904830932617, + "learning_rate": 1.2885636965019545e-05, + "loss": 1.6015, + "step": 3550 + }, + { + "epoch": 1.07, + "grad_norm": 14.579924583435059, + "learning_rate": 1.2883632354415157e-05, + "loss": 1.638, + "step": 3551 + }, + { + "epoch": 1.07, + "grad_norm": 36.379661560058594, + "learning_rate": 1.2881627743810766e-05, + "loss": 2.2669, + "step": 3552 + }, + { + "epoch": 1.07, + "grad_norm": 16.51966094970703, + "learning_rate": 1.2879623133206376e-05, + "loss": 1.7893, + "step": 3553 + }, + { + "epoch": 1.07, + "grad_norm": 23.127784729003906, + "learning_rate": 1.2877618522601984e-05, + "loss": 2.266, + "step": 3554 + }, + { + "epoch": 1.07, + "grad_norm": 17.028398513793945, + "learning_rate": 1.2875613911997596e-05, + "loss": 1.6085, + "step": 3555 + }, + { + "epoch": 1.07, + "grad_norm": 9.327510833740234, + "learning_rate": 1.2873609301393206e-05, + "loss": 1.155, + "step": 3556 + }, + { + "epoch": 1.07, + "grad_norm": 9.496240615844727, + "learning_rate": 1.2871604690788814e-05, + "loss": 1.3495, + "step": 3557 + }, + { + "epoch": 1.07, + "grad_norm": 10.282184600830078, + "learning_rate": 1.2869600080184426e-05, + "loss": 1.4117, + "step": 3558 + }, + { + "epoch": 1.07, + "grad_norm": 15.490508079528809, + "learning_rate": 1.2867595469580034e-05, + "loss": 1.5863, + "step": 3559 + }, + { + "epoch": 1.07, + "grad_norm": 17.8187198638916, + "learning_rate": 1.2865590858975645e-05, + "loss": 1.2819, + "step": 3560 + }, + { + "epoch": 1.07, + "grad_norm": 17.21719741821289, + "learning_rate": 1.2863586248371256e-05, + "loss": 2.0228, + "step": 3561 + }, + { + "epoch": 1.07, + "grad_norm": 23.23194694519043, + "learning_rate": 1.2861581637766865e-05, + "loss": 1.858, + "step": 3562 + }, + { + "epoch": 1.07, + "grad_norm": 25.972341537475586, + "learning_rate": 1.2859577027162475e-05, + "loss": 1.3268, + "step": 3563 + }, + { + "epoch": 1.07, + "grad_norm": 27.290193557739258, + "learning_rate": 1.2857572416558083e-05, + "loss": 2.174, + "step": 3564 + }, + { + "epoch": 1.07, + "grad_norm": 14.838099479675293, + "learning_rate": 1.2855567805953695e-05, + "loss": 1.9506, + "step": 3565 + }, + { + "epoch": 1.07, + "grad_norm": 8.05460262298584, + "learning_rate": 1.2853563195349303e-05, + "loss": 1.2423, + "step": 3566 + }, + { + "epoch": 1.07, + "grad_norm": 10.019596099853516, + "learning_rate": 1.2851558584744913e-05, + "loss": 1.405, + "step": 3567 + }, + { + "epoch": 1.07, + "grad_norm": 16.532649993896484, + "learning_rate": 1.2849553974140525e-05, + "loss": 1.5799, + "step": 3568 + }, + { + "epoch": 1.07, + "grad_norm": 14.281306266784668, + "learning_rate": 1.2847549363536134e-05, + "loss": 1.1582, + "step": 3569 + }, + { + "epoch": 1.07, + "grad_norm": 9.972299575805664, + "learning_rate": 1.2845544752931745e-05, + "loss": 1.2031, + "step": 3570 + }, + { + "epoch": 1.07, + "grad_norm": 14.451814651489258, + "learning_rate": 1.2843540142327354e-05, + "loss": 1.5313, + "step": 3571 + }, + { + "epoch": 1.07, + "grad_norm": 15.139348030090332, + "learning_rate": 1.2841535531722964e-05, + "loss": 1.9977, + "step": 3572 + }, + { + "epoch": 1.07, + "grad_norm": 25.56646156311035, + "learning_rate": 1.2839530921118576e-05, + "loss": 2.2934, + "step": 3573 + }, + { + "epoch": 1.07, + "grad_norm": 11.112027168273926, + "learning_rate": 1.2837526310514184e-05, + "loss": 1.5678, + "step": 3574 + }, + { + "epoch": 1.07, + "grad_norm": 14.32687759399414, + "learning_rate": 1.2835521699909794e-05, + "loss": 1.1919, + "step": 3575 + }, + { + "epoch": 1.08, + "grad_norm": 11.444255828857422, + "learning_rate": 1.2833517089305402e-05, + "loss": 2.2952, + "step": 3576 + }, + { + "epoch": 1.08, + "grad_norm": 34.31683349609375, + "learning_rate": 1.2831512478701014e-05, + "loss": 1.8054, + "step": 3577 + }, + { + "epoch": 1.08, + "grad_norm": 16.4061336517334, + "learning_rate": 1.2829507868096623e-05, + "loss": 1.1196, + "step": 3578 + }, + { + "epoch": 1.08, + "grad_norm": 12.305190086364746, + "learning_rate": 1.2827503257492233e-05, + "loss": 2.2021, + "step": 3579 + }, + { + "epoch": 1.08, + "grad_norm": 16.25429916381836, + "learning_rate": 1.2825498646887845e-05, + "loss": 1.629, + "step": 3580 + }, + { + "epoch": 1.08, + "grad_norm": 25.61264419555664, + "learning_rate": 1.2823494036283453e-05, + "loss": 2.7876, + "step": 3581 + }, + { + "epoch": 1.08, + "grad_norm": 20.26491928100586, + "learning_rate": 1.2821489425679063e-05, + "loss": 1.1916, + "step": 3582 + }, + { + "epoch": 1.08, + "grad_norm": 10.56248950958252, + "learning_rate": 1.2819484815074671e-05, + "loss": 1.302, + "step": 3583 + }, + { + "epoch": 1.08, + "grad_norm": 15.701934814453125, + "learning_rate": 1.2817480204470283e-05, + "loss": 1.5957, + "step": 3584 + }, + { + "epoch": 1.08, + "grad_norm": 17.989017486572266, + "learning_rate": 1.2815475593865893e-05, + "loss": 1.2969, + "step": 3585 + }, + { + "epoch": 1.08, + "grad_norm": 22.148818969726562, + "learning_rate": 1.2813470983261502e-05, + "loss": 2.5261, + "step": 3586 + }, + { + "epoch": 1.08, + "grad_norm": 26.945255279541016, + "learning_rate": 1.2811466372657113e-05, + "loss": 1.8008, + "step": 3587 + }, + { + "epoch": 1.08, + "grad_norm": 13.003181457519531, + "learning_rate": 1.2809461762052722e-05, + "loss": 1.4448, + "step": 3588 + }, + { + "epoch": 1.08, + "grad_norm": 13.033709526062012, + "learning_rate": 1.2807457151448332e-05, + "loss": 2.1353, + "step": 3589 + }, + { + "epoch": 1.08, + "grad_norm": 16.16973876953125, + "learning_rate": 1.2805452540843942e-05, + "loss": 1.847, + "step": 3590 + }, + { + "epoch": 1.08, + "grad_norm": 14.109622955322266, + "learning_rate": 1.2803447930239552e-05, + "loss": 2.0083, + "step": 3591 + }, + { + "epoch": 1.08, + "grad_norm": 13.897767066955566, + "learning_rate": 1.2801443319635162e-05, + "loss": 1.2688, + "step": 3592 + }, + { + "epoch": 1.08, + "grad_norm": 18.122596740722656, + "learning_rate": 1.2799438709030772e-05, + "loss": 1.9293, + "step": 3593 + }, + { + "epoch": 1.08, + "grad_norm": 41.87525939941406, + "learning_rate": 1.2797434098426382e-05, + "loss": 3.1812, + "step": 3594 + }, + { + "epoch": 1.08, + "grad_norm": 22.537267684936523, + "learning_rate": 1.279542948782199e-05, + "loss": 1.9805, + "step": 3595 + }, + { + "epoch": 1.08, + "grad_norm": 16.393918991088867, + "learning_rate": 1.2793424877217602e-05, + "loss": 1.6751, + "step": 3596 + }, + { + "epoch": 1.08, + "grad_norm": 15.888949394226074, + "learning_rate": 1.2791420266613211e-05, + "loss": 2.2047, + "step": 3597 + }, + { + "epoch": 1.08, + "grad_norm": 13.120323181152344, + "learning_rate": 1.2789415656008821e-05, + "loss": 1.527, + "step": 3598 + }, + { + "epoch": 1.08, + "grad_norm": 14.314374923706055, + "learning_rate": 1.2787411045404433e-05, + "loss": 2.3968, + "step": 3599 + }, + { + "epoch": 1.08, + "grad_norm": 20.696285247802734, + "learning_rate": 1.2785406434800041e-05, + "loss": 1.8987, + "step": 3600 + }, + { + "epoch": 1.08, + "eval_loss": 0.23791223764419556, + "eval_runtime": 43.6058, + "eval_samples_per_second": 33.917, + "eval_steps_per_second": 33.917, + "step": 3600 + }, + { + "epoch": 1.08, + "grad_norm": 24.901500701904297, + "learning_rate": 1.2783401824195651e-05, + "loss": 1.9859, + "step": 3601 + }, + { + "epoch": 1.08, + "grad_norm": 15.03045654296875, + "learning_rate": 1.278139721359126e-05, + "loss": 1.6967, + "step": 3602 + }, + { + "epoch": 1.08, + "grad_norm": 16.57175064086914, + "learning_rate": 1.2779392602986871e-05, + "loss": 1.8149, + "step": 3603 + }, + { + "epoch": 1.08, + "grad_norm": 15.594796180725098, + "learning_rate": 1.2777387992382481e-05, + "loss": 1.4157, + "step": 3604 + }, + { + "epoch": 1.08, + "grad_norm": 14.430771827697754, + "learning_rate": 1.277538338177809e-05, + "loss": 2.3501, + "step": 3605 + }, + { + "epoch": 1.08, + "grad_norm": 38.617862701416016, + "learning_rate": 1.2773378771173702e-05, + "loss": 1.98, + "step": 3606 + }, + { + "epoch": 1.08, + "grad_norm": 19.550251007080078, + "learning_rate": 1.277137416056931e-05, + "loss": 1.2882, + "step": 3607 + }, + { + "epoch": 1.08, + "grad_norm": 19.823959350585938, + "learning_rate": 1.276936954996492e-05, + "loss": 2.7407, + "step": 3608 + }, + { + "epoch": 1.09, + "grad_norm": 46.88768005371094, + "learning_rate": 1.2767364939360528e-05, + "loss": 1.9532, + "step": 3609 + }, + { + "epoch": 1.09, + "grad_norm": 15.034628868103027, + "learning_rate": 1.276536032875614e-05, + "loss": 1.2125, + "step": 3610 + }, + { + "epoch": 1.09, + "grad_norm": 24.965927124023438, + "learning_rate": 1.276335571815175e-05, + "loss": 2.4233, + "step": 3611 + }, + { + "epoch": 1.09, + "grad_norm": 26.463823318481445, + "learning_rate": 1.2761351107547359e-05, + "loss": 2.3175, + "step": 3612 + }, + { + "epoch": 1.09, + "grad_norm": 15.110565185546875, + "learning_rate": 1.275934649694297e-05, + "loss": 2.4443, + "step": 3613 + }, + { + "epoch": 1.09, + "grad_norm": 18.711685180664062, + "learning_rate": 1.2757341886338579e-05, + "loss": 1.5244, + "step": 3614 + }, + { + "epoch": 1.09, + "grad_norm": 21.489322662353516, + "learning_rate": 1.2755337275734189e-05, + "loss": 1.9058, + "step": 3615 + }, + { + "epoch": 1.09, + "grad_norm": 15.04526424407959, + "learning_rate": 1.27533326651298e-05, + "loss": 1.4516, + "step": 3616 + }, + { + "epoch": 1.09, + "grad_norm": 12.51637077331543, + "learning_rate": 1.2751328054525409e-05, + "loss": 1.7847, + "step": 3617 + }, + { + "epoch": 1.09, + "grad_norm": 29.387882232666016, + "learning_rate": 1.274932344392102e-05, + "loss": 2.5068, + "step": 3618 + }, + { + "epoch": 1.09, + "grad_norm": 18.071352005004883, + "learning_rate": 1.274731883331663e-05, + "loss": 2.6933, + "step": 3619 + }, + { + "epoch": 1.09, + "grad_norm": 20.221973419189453, + "learning_rate": 1.274531422271224e-05, + "loss": 1.3898, + "step": 3620 + }, + { + "epoch": 1.09, + "grad_norm": 8.028789520263672, + "learning_rate": 1.2743309612107848e-05, + "loss": 1.2193, + "step": 3621 + }, + { + "epoch": 1.09, + "grad_norm": 14.530386924743652, + "learning_rate": 1.274130500150346e-05, + "loss": 2.0765, + "step": 3622 + }, + { + "epoch": 1.09, + "grad_norm": 16.220090866088867, + "learning_rate": 1.273930039089907e-05, + "loss": 1.7681, + "step": 3623 + }, + { + "epoch": 1.09, + "grad_norm": 21.331134796142578, + "learning_rate": 1.2737295780294678e-05, + "loss": 2.7775, + "step": 3624 + }, + { + "epoch": 1.09, + "grad_norm": 11.270216941833496, + "learning_rate": 1.273529116969029e-05, + "loss": 2.3103, + "step": 3625 + }, + { + "epoch": 1.09, + "grad_norm": 16.944839477539062, + "learning_rate": 1.2733286559085898e-05, + "loss": 1.7782, + "step": 3626 + }, + { + "epoch": 1.09, + "grad_norm": 8.570146560668945, + "learning_rate": 1.2731281948481508e-05, + "loss": 1.6687, + "step": 3627 + }, + { + "epoch": 1.09, + "grad_norm": 53.58586502075195, + "learning_rate": 1.272927733787712e-05, + "loss": 3.6285, + "step": 3628 + }, + { + "epoch": 1.09, + "grad_norm": 24.34233856201172, + "learning_rate": 1.2727272727272728e-05, + "loss": 2.7752, + "step": 3629 + }, + { + "epoch": 1.09, + "grad_norm": 14.564997673034668, + "learning_rate": 1.2725268116668339e-05, + "loss": 2.5383, + "step": 3630 + }, + { + "epoch": 1.09, + "grad_norm": 6.960912227630615, + "learning_rate": 1.2723263506063947e-05, + "loss": 0.6939, + "step": 3631 + }, + { + "epoch": 1.09, + "grad_norm": 17.01753044128418, + "learning_rate": 1.2721258895459559e-05, + "loss": 1.6664, + "step": 3632 + }, + { + "epoch": 1.09, + "grad_norm": 11.095540046691895, + "learning_rate": 1.2719254284855167e-05, + "loss": 1.6678, + "step": 3633 + }, + { + "epoch": 1.09, + "grad_norm": 11.197050094604492, + "learning_rate": 1.2717249674250777e-05, + "loss": 1.9901, + "step": 3634 + }, + { + "epoch": 1.09, + "grad_norm": 11.85187816619873, + "learning_rate": 1.2715245063646389e-05, + "loss": 0.7637, + "step": 3635 + }, + { + "epoch": 1.09, + "grad_norm": 46.356109619140625, + "learning_rate": 1.2713240453041997e-05, + "loss": 1.686, + "step": 3636 + }, + { + "epoch": 1.09, + "grad_norm": 72.00715637207031, + "learning_rate": 1.2711235842437607e-05, + "loss": 2.1146, + "step": 3637 + }, + { + "epoch": 1.09, + "grad_norm": 23.22759437561035, + "learning_rate": 1.2709231231833218e-05, + "loss": 1.1876, + "step": 3638 + }, + { + "epoch": 1.09, + "grad_norm": 16.499618530273438, + "learning_rate": 1.2707226621228828e-05, + "loss": 1.4633, + "step": 3639 + }, + { + "epoch": 1.09, + "grad_norm": 35.80891418457031, + "learning_rate": 1.2705222010624436e-05, + "loss": 1.7426, + "step": 3640 + }, + { + "epoch": 1.09, + "grad_norm": 18.7283992767334, + "learning_rate": 1.2703217400020048e-05, + "loss": 1.8939, + "step": 3641 + }, + { + "epoch": 1.1, + "grad_norm": 14.170825958251953, + "learning_rate": 1.2701212789415658e-05, + "loss": 1.992, + "step": 3642 + }, + { + "epoch": 1.1, + "grad_norm": 11.59808349609375, + "learning_rate": 1.2699208178811266e-05, + "loss": 1.9479, + "step": 3643 + }, + { + "epoch": 1.1, + "grad_norm": 27.232282638549805, + "learning_rate": 1.2697203568206878e-05, + "loss": 1.9665, + "step": 3644 + }, + { + "epoch": 1.1, + "grad_norm": 13.292805671691895, + "learning_rate": 1.2695198957602486e-05, + "loss": 1.4353, + "step": 3645 + }, + { + "epoch": 1.1, + "grad_norm": 8.74724292755127, + "learning_rate": 1.2693194346998097e-05, + "loss": 1.3016, + "step": 3646 + }, + { + "epoch": 1.1, + "grad_norm": 19.11276626586914, + "learning_rate": 1.2691189736393708e-05, + "loss": 2.3302, + "step": 3647 + }, + { + "epoch": 1.1, + "grad_norm": 46.45100021362305, + "learning_rate": 1.2689185125789317e-05, + "loss": 2.1202, + "step": 3648 + }, + { + "epoch": 1.1, + "grad_norm": 15.36929702758789, + "learning_rate": 1.2687180515184927e-05, + "loss": 1.7896, + "step": 3649 + }, + { + "epoch": 1.1, + "grad_norm": 20.07041358947754, + "learning_rate": 1.2685175904580535e-05, + "loss": 1.826, + "step": 3650 + }, + { + "epoch": 1.1, + "grad_norm": 20.192039489746094, + "learning_rate": 1.2683171293976147e-05, + "loss": 1.8944, + "step": 3651 + }, + { + "epoch": 1.1, + "grad_norm": 15.358658790588379, + "learning_rate": 1.2681166683371755e-05, + "loss": 2.1073, + "step": 3652 + }, + { + "epoch": 1.1, + "grad_norm": 11.35305404663086, + "learning_rate": 1.2679162072767365e-05, + "loss": 1.167, + "step": 3653 + }, + { + "epoch": 1.1, + "grad_norm": 14.477778434753418, + "learning_rate": 1.2677157462162977e-05, + "loss": 2.1406, + "step": 3654 + }, + { + "epoch": 1.1, + "grad_norm": 25.864349365234375, + "learning_rate": 1.2675152851558586e-05, + "loss": 1.9479, + "step": 3655 + }, + { + "epoch": 1.1, + "grad_norm": 14.707035064697266, + "learning_rate": 1.2673148240954196e-05, + "loss": 2.2159, + "step": 3656 + }, + { + "epoch": 1.1, + "grad_norm": 9.004791259765625, + "learning_rate": 1.2671143630349804e-05, + "loss": 0.9894, + "step": 3657 + }, + { + "epoch": 1.1, + "grad_norm": 16.240209579467773, + "learning_rate": 1.2669139019745416e-05, + "loss": 1.6444, + "step": 3658 + }, + { + "epoch": 1.1, + "grad_norm": 15.315890312194824, + "learning_rate": 1.2667134409141026e-05, + "loss": 2.046, + "step": 3659 + }, + { + "epoch": 1.1, + "grad_norm": 14.05276870727539, + "learning_rate": 1.2665129798536634e-05, + "loss": 2.0308, + "step": 3660 + }, + { + "epoch": 1.1, + "grad_norm": 12.463837623596191, + "learning_rate": 1.2663125187932246e-05, + "loss": 1.3266, + "step": 3661 + }, + { + "epoch": 1.1, + "grad_norm": 10.216002464294434, + "learning_rate": 1.2661120577327854e-05, + "loss": 1.4467, + "step": 3662 + }, + { + "epoch": 1.1, + "grad_norm": 52.47303771972656, + "learning_rate": 1.2659115966723465e-05, + "loss": 3.601, + "step": 3663 + }, + { + "epoch": 1.1, + "grad_norm": 19.366987228393555, + "learning_rate": 1.2657111356119075e-05, + "loss": 1.9766, + "step": 3664 + }, + { + "epoch": 1.1, + "grad_norm": 11.65963077545166, + "learning_rate": 1.2655106745514685e-05, + "loss": 1.6192, + "step": 3665 + }, + { + "epoch": 1.1, + "grad_norm": 14.95705509185791, + "learning_rate": 1.2653102134910295e-05, + "loss": 1.8748, + "step": 3666 + }, + { + "epoch": 1.1, + "grad_norm": 21.516704559326172, + "learning_rate": 1.2651097524305905e-05, + "loss": 2.2989, + "step": 3667 + }, + { + "epoch": 1.1, + "grad_norm": 12.6599760055542, + "learning_rate": 1.2649092913701515e-05, + "loss": 1.9577, + "step": 3668 + }, + { + "epoch": 1.1, + "grad_norm": 20.421907424926758, + "learning_rate": 1.2647088303097123e-05, + "loss": 1.8006, + "step": 3669 + }, + { + "epoch": 1.1, + "grad_norm": 21.49991226196289, + "learning_rate": 1.2645083692492735e-05, + "loss": 1.5208, + "step": 3670 + }, + { + "epoch": 1.1, + "grad_norm": 16.71851348876953, + "learning_rate": 1.2643079081888345e-05, + "loss": 1.3844, + "step": 3671 + }, + { + "epoch": 1.1, + "grad_norm": 12.691519737243652, + "learning_rate": 1.2641074471283954e-05, + "loss": 1.6805, + "step": 3672 + }, + { + "epoch": 1.1, + "grad_norm": 13.60329532623291, + "learning_rate": 1.2639069860679565e-05, + "loss": 1.3261, + "step": 3673 + }, + { + "epoch": 1.1, + "grad_norm": 12.163605690002441, + "learning_rate": 1.2637065250075174e-05, + "loss": 1.2984, + "step": 3674 + }, + { + "epoch": 1.1, + "grad_norm": 11.735462188720703, + "learning_rate": 1.2635060639470784e-05, + "loss": 1.3691, + "step": 3675 + }, + { + "epoch": 1.11, + "grad_norm": 10.237204551696777, + "learning_rate": 1.2633056028866392e-05, + "loss": 1.4796, + "step": 3676 + }, + { + "epoch": 1.11, + "grad_norm": 16.992961883544922, + "learning_rate": 1.2631051418262004e-05, + "loss": 1.5819, + "step": 3677 + }, + { + "epoch": 1.11, + "grad_norm": 25.052305221557617, + "learning_rate": 1.2629046807657614e-05, + "loss": 2.6452, + "step": 3678 + }, + { + "epoch": 1.11, + "grad_norm": 34.612735748291016, + "learning_rate": 1.2627042197053223e-05, + "loss": 1.888, + "step": 3679 + }, + { + "epoch": 1.11, + "grad_norm": 12.312444686889648, + "learning_rate": 1.2625037586448834e-05, + "loss": 1.6545, + "step": 3680 + }, + { + "epoch": 1.11, + "grad_norm": 17.487689971923828, + "learning_rate": 1.2623032975844443e-05, + "loss": 2.225, + "step": 3681 + }, + { + "epoch": 1.11, + "grad_norm": 12.337897300720215, + "learning_rate": 1.2621028365240053e-05, + "loss": 1.285, + "step": 3682 + }, + { + "epoch": 1.11, + "grad_norm": 12.9970121383667, + "learning_rate": 1.2619023754635661e-05, + "loss": 1.5476, + "step": 3683 + }, + { + "epoch": 1.11, + "grad_norm": 36.430641174316406, + "learning_rate": 1.2617019144031273e-05, + "loss": 2.409, + "step": 3684 + }, + { + "epoch": 1.11, + "grad_norm": 11.384819030761719, + "learning_rate": 1.2615014533426883e-05, + "loss": 1.2839, + "step": 3685 + }, + { + "epoch": 1.11, + "grad_norm": 14.997862815856934, + "learning_rate": 1.2613009922822491e-05, + "loss": 2.1861, + "step": 3686 + }, + { + "epoch": 1.11, + "grad_norm": 9.791463851928711, + "learning_rate": 1.2611005312218103e-05, + "loss": 1.6862, + "step": 3687 + }, + { + "epoch": 1.11, + "grad_norm": 20.38786506652832, + "learning_rate": 1.2609000701613712e-05, + "loss": 1.8915, + "step": 3688 + }, + { + "epoch": 1.11, + "grad_norm": 56.49119567871094, + "learning_rate": 1.2606996091009323e-05, + "loss": 1.4701, + "step": 3689 + }, + { + "epoch": 1.11, + "grad_norm": 9.169239044189453, + "learning_rate": 1.2604991480404933e-05, + "loss": 1.0245, + "step": 3690 + }, + { + "epoch": 1.11, + "grad_norm": 18.84854507446289, + "learning_rate": 1.2602986869800542e-05, + "loss": 1.6371, + "step": 3691 + }, + { + "epoch": 1.11, + "grad_norm": 15.238136291503906, + "learning_rate": 1.2600982259196154e-05, + "loss": 1.4109, + "step": 3692 + }, + { + "epoch": 1.11, + "grad_norm": 24.6544189453125, + "learning_rate": 1.2598977648591762e-05, + "loss": 1.8289, + "step": 3693 + }, + { + "epoch": 1.11, + "grad_norm": 15.802862167358398, + "learning_rate": 1.2596973037987372e-05, + "loss": 2.074, + "step": 3694 + }, + { + "epoch": 1.11, + "grad_norm": 21.196699142456055, + "learning_rate": 1.259496842738298e-05, + "loss": 2.4331, + "step": 3695 + }, + { + "epoch": 1.11, + "grad_norm": 14.442291259765625, + "learning_rate": 1.2592963816778592e-05, + "loss": 1.1208, + "step": 3696 + }, + { + "epoch": 1.11, + "grad_norm": 16.67438316345215, + "learning_rate": 1.2590959206174202e-05, + "loss": 1.9479, + "step": 3697 + }, + { + "epoch": 1.11, + "grad_norm": 30.792251586914062, + "learning_rate": 1.258895459556981e-05, + "loss": 1.0711, + "step": 3698 + }, + { + "epoch": 1.11, + "grad_norm": 21.835054397583008, + "learning_rate": 1.2586949984965423e-05, + "loss": 1.4139, + "step": 3699 + }, + { + "epoch": 1.11, + "grad_norm": 23.054439544677734, + "learning_rate": 1.2584945374361031e-05, + "loss": 2.3756, + "step": 3700 + }, + { + "epoch": 1.11, + "grad_norm": 15.562328338623047, + "learning_rate": 1.2582940763756641e-05, + "loss": 1.6298, + "step": 3701 + }, + { + "epoch": 1.11, + "grad_norm": 19.600475311279297, + "learning_rate": 1.2580936153152253e-05, + "loss": 2.0049, + "step": 3702 + }, + { + "epoch": 1.11, + "grad_norm": 15.934944152832031, + "learning_rate": 1.2578931542547861e-05, + "loss": 2.3699, + "step": 3703 + }, + { + "epoch": 1.11, + "grad_norm": 13.748173713684082, + "learning_rate": 1.2576926931943471e-05, + "loss": 2.802, + "step": 3704 + }, + { + "epoch": 1.11, + "grad_norm": 10.191062927246094, + "learning_rate": 1.257492232133908e-05, + "loss": 0.9145, + "step": 3705 + }, + { + "epoch": 1.11, + "grad_norm": 14.744169235229492, + "learning_rate": 1.2572917710734691e-05, + "loss": 1.7588, + "step": 3706 + }, + { + "epoch": 1.11, + "grad_norm": 17.905502319335938, + "learning_rate": 1.25709131001303e-05, + "loss": 1.4409, + "step": 3707 + }, + { + "epoch": 1.11, + "grad_norm": 22.17319679260254, + "learning_rate": 1.256890848952591e-05, + "loss": 1.6653, + "step": 3708 + }, + { + "epoch": 1.12, + "grad_norm": 14.994551658630371, + "learning_rate": 1.2566903878921522e-05, + "loss": 2.6141, + "step": 3709 + }, + { + "epoch": 1.12, + "grad_norm": 27.965063095092773, + "learning_rate": 1.256489926831713e-05, + "loss": 1.4174, + "step": 3710 + }, + { + "epoch": 1.12, + "grad_norm": 12.438477516174316, + "learning_rate": 1.256289465771274e-05, + "loss": 2.034, + "step": 3711 + }, + { + "epoch": 1.12, + "grad_norm": 18.60542106628418, + "learning_rate": 1.256089004710835e-05, + "loss": 2.1583, + "step": 3712 + }, + { + "epoch": 1.12, + "grad_norm": 11.205132484436035, + "learning_rate": 1.255888543650396e-05, + "loss": 2.2042, + "step": 3713 + }, + { + "epoch": 1.12, + "grad_norm": 21.031993865966797, + "learning_rate": 1.255688082589957e-05, + "loss": 1.3351, + "step": 3714 + }, + { + "epoch": 1.12, + "grad_norm": 22.70185661315918, + "learning_rate": 1.255487621529518e-05, + "loss": 2.8246, + "step": 3715 + }, + { + "epoch": 1.12, + "grad_norm": 19.754451751708984, + "learning_rate": 1.255287160469079e-05, + "loss": 1.9262, + "step": 3716 + }, + { + "epoch": 1.12, + "grad_norm": 14.8783597946167, + "learning_rate": 1.2550866994086399e-05, + "loss": 1.958, + "step": 3717 + }, + { + "epoch": 1.12, + "grad_norm": 13.51906681060791, + "learning_rate": 1.254886238348201e-05, + "loss": 2.0377, + "step": 3718 + }, + { + "epoch": 1.12, + "grad_norm": 13.766884803771973, + "learning_rate": 1.2546857772877619e-05, + "loss": 1.8277, + "step": 3719 + }, + { + "epoch": 1.12, + "grad_norm": 11.940971374511719, + "learning_rate": 1.254485316227323e-05, + "loss": 2.0253, + "step": 3720 + }, + { + "epoch": 1.12, + "eval_loss": 0.280333936214447, + "eval_runtime": 43.5973, + "eval_samples_per_second": 33.924, + "eval_steps_per_second": 33.924, + "step": 3720 + }, + { + "epoch": 1.12, + "grad_norm": 13.200506210327148, + "learning_rate": 1.2542848551668841e-05, + "loss": 1.8569, + "step": 3721 + }, + { + "epoch": 1.12, + "grad_norm": 11.850561141967773, + "learning_rate": 1.254084394106445e-05, + "loss": 1.2431, + "step": 3722 + }, + { + "epoch": 1.12, + "grad_norm": 14.307988166809082, + "learning_rate": 1.253883933046006e-05, + "loss": 1.2535, + "step": 3723 + }, + { + "epoch": 1.12, + "grad_norm": 33.0361328125, + "learning_rate": 1.2536834719855668e-05, + "loss": 2.3736, + "step": 3724 + }, + { + "epoch": 1.12, + "grad_norm": 14.496783256530762, + "learning_rate": 1.253483010925128e-05, + "loss": 1.5165, + "step": 3725 + }, + { + "epoch": 1.12, + "grad_norm": 20.50676155090332, + "learning_rate": 1.2532825498646888e-05, + "loss": 1.7852, + "step": 3726 + }, + { + "epoch": 1.12, + "grad_norm": 16.414701461791992, + "learning_rate": 1.2530820888042498e-05, + "loss": 1.5958, + "step": 3727 + }, + { + "epoch": 1.12, + "grad_norm": 9.845596313476562, + "learning_rate": 1.252881627743811e-05, + "loss": 1.4953, + "step": 3728 + }, + { + "epoch": 1.12, + "grad_norm": 34.321861267089844, + "learning_rate": 1.2526811666833718e-05, + "loss": 1.9955, + "step": 3729 + }, + { + "epoch": 1.12, + "grad_norm": 11.001937866210938, + "learning_rate": 1.2524807056229328e-05, + "loss": 1.7248, + "step": 3730 + }, + { + "epoch": 1.12, + "grad_norm": 14.16842269897461, + "learning_rate": 1.2522802445624937e-05, + "loss": 0.9204, + "step": 3731 + }, + { + "epoch": 1.12, + "grad_norm": 44.32683181762695, + "learning_rate": 1.2520797835020549e-05, + "loss": 1.5575, + "step": 3732 + }, + { + "epoch": 1.12, + "grad_norm": 21.35757064819336, + "learning_rate": 1.2518793224416159e-05, + "loss": 2.7646, + "step": 3733 + }, + { + "epoch": 1.12, + "grad_norm": 10.033564567565918, + "learning_rate": 1.2516788613811767e-05, + "loss": 1.054, + "step": 3734 + }, + { + "epoch": 1.12, + "grad_norm": 11.14671802520752, + "learning_rate": 1.2514784003207379e-05, + "loss": 1.4729, + "step": 3735 + }, + { + "epoch": 1.12, + "grad_norm": 57.583152770996094, + "learning_rate": 1.2512779392602987e-05, + "loss": 4.8937, + "step": 3736 + }, + { + "epoch": 1.12, + "grad_norm": 10.513443946838379, + "learning_rate": 1.2510774781998597e-05, + "loss": 1.1766, + "step": 3737 + }, + { + "epoch": 1.12, + "grad_norm": 20.811086654663086, + "learning_rate": 1.2508770171394207e-05, + "loss": 1.86, + "step": 3738 + }, + { + "epoch": 1.12, + "grad_norm": 15.33969497680664, + "learning_rate": 1.2506765560789817e-05, + "loss": 1.9234, + "step": 3739 + }, + { + "epoch": 1.12, + "grad_norm": 16.466957092285156, + "learning_rate": 1.250476095018543e-05, + "loss": 1.0944, + "step": 3740 + }, + { + "epoch": 1.12, + "grad_norm": 11.51232624053955, + "learning_rate": 1.2502756339581038e-05, + "loss": 1.9455, + "step": 3741 + }, + { + "epoch": 1.13, + "grad_norm": 16.893362045288086, + "learning_rate": 1.2500751728976648e-05, + "loss": 2.0934, + "step": 3742 + }, + { + "epoch": 1.13, + "grad_norm": 10.207427024841309, + "learning_rate": 1.2498747118372256e-05, + "loss": 1.5444, + "step": 3743 + }, + { + "epoch": 1.13, + "grad_norm": 14.303349494934082, + "learning_rate": 1.2496742507767868e-05, + "loss": 2.0237, + "step": 3744 + }, + { + "epoch": 1.13, + "grad_norm": 15.821151733398438, + "learning_rate": 1.2494737897163478e-05, + "loss": 1.2218, + "step": 3745 + }, + { + "epoch": 1.13, + "grad_norm": 19.704866409301758, + "learning_rate": 1.2492733286559086e-05, + "loss": 1.8249, + "step": 3746 + }, + { + "epoch": 1.13, + "grad_norm": 15.844225883483887, + "learning_rate": 1.2490728675954698e-05, + "loss": 1.4068, + "step": 3747 + }, + { + "epoch": 1.13, + "grad_norm": 26.448633193969727, + "learning_rate": 1.2488724065350306e-05, + "loss": 2.5801, + "step": 3748 + }, + { + "epoch": 1.13, + "grad_norm": 31.50472640991211, + "learning_rate": 1.2486719454745917e-05, + "loss": 2.1224, + "step": 3749 + }, + { + "epoch": 1.13, + "grad_norm": 11.211418151855469, + "learning_rate": 1.2484714844141525e-05, + "loss": 1.3734, + "step": 3750 + }, + { + "epoch": 1.13, + "grad_norm": 16.562360763549805, + "learning_rate": 1.2482710233537137e-05, + "loss": 2.8084, + "step": 3751 + }, + { + "epoch": 1.13, + "grad_norm": 14.236701965332031, + "learning_rate": 1.2480705622932747e-05, + "loss": 1.3424, + "step": 3752 + }, + { + "epoch": 1.13, + "grad_norm": 17.27580451965332, + "learning_rate": 1.2478701012328355e-05, + "loss": 2.5758, + "step": 3753 + }, + { + "epoch": 1.13, + "grad_norm": 12.679486274719238, + "learning_rate": 1.2476696401723967e-05, + "loss": 1.6972, + "step": 3754 + }, + { + "epoch": 1.13, + "grad_norm": 17.295305252075195, + "learning_rate": 1.2474691791119575e-05, + "loss": 1.4085, + "step": 3755 + }, + { + "epoch": 1.13, + "grad_norm": 14.256155014038086, + "learning_rate": 1.2472687180515185e-05, + "loss": 1.5598, + "step": 3756 + }, + { + "epoch": 1.13, + "grad_norm": 14.317242622375488, + "learning_rate": 1.2470682569910797e-05, + "loss": 1.98, + "step": 3757 + }, + { + "epoch": 1.13, + "grad_norm": 14.427403450012207, + "learning_rate": 1.2468677959306406e-05, + "loss": 1.4377, + "step": 3758 + }, + { + "epoch": 1.13, + "grad_norm": 19.83202362060547, + "learning_rate": 1.2466673348702016e-05, + "loss": 1.7858, + "step": 3759 + }, + { + "epoch": 1.13, + "grad_norm": 10.979972839355469, + "learning_rate": 1.2464668738097626e-05, + "loss": 0.8914, + "step": 3760 + }, + { + "epoch": 1.13, + "grad_norm": 25.953853607177734, + "learning_rate": 1.2462664127493236e-05, + "loss": 1.768, + "step": 3761 + }, + { + "epoch": 1.13, + "grad_norm": 17.417144775390625, + "learning_rate": 1.2460659516888844e-05, + "loss": 1.9067, + "step": 3762 + }, + { + "epoch": 1.13, + "grad_norm": 18.727876663208008, + "learning_rate": 1.2458654906284456e-05, + "loss": 1.8154, + "step": 3763 + }, + { + "epoch": 1.13, + "grad_norm": 28.132505416870117, + "learning_rate": 1.2456650295680066e-05, + "loss": 2.6384, + "step": 3764 + }, + { + "epoch": 1.13, + "grad_norm": 11.409829139709473, + "learning_rate": 1.2454645685075675e-05, + "loss": 1.3703, + "step": 3765 + }, + { + "epoch": 1.13, + "grad_norm": 12.39699935913086, + "learning_rate": 1.2452641074471286e-05, + "loss": 1.56, + "step": 3766 + }, + { + "epoch": 1.13, + "grad_norm": 12.608708381652832, + "learning_rate": 1.2450636463866895e-05, + "loss": 1.0913, + "step": 3767 + }, + { + "epoch": 1.13, + "grad_norm": 9.895720481872559, + "learning_rate": 1.2448631853262505e-05, + "loss": 1.6166, + "step": 3768 + }, + { + "epoch": 1.13, + "grad_norm": 15.616667747497559, + "learning_rate": 1.2446627242658113e-05, + "loss": 2.4802, + "step": 3769 + }, + { + "epoch": 1.13, + "grad_norm": 28.047149658203125, + "learning_rate": 1.2444622632053725e-05, + "loss": 2.0519, + "step": 3770 + }, + { + "epoch": 1.13, + "grad_norm": 15.117725372314453, + "learning_rate": 1.2442618021449335e-05, + "loss": 2.0924, + "step": 3771 + }, + { + "epoch": 1.13, + "grad_norm": 11.850608825683594, + "learning_rate": 1.2440613410844943e-05, + "loss": 1.4544, + "step": 3772 + }, + { + "epoch": 1.13, + "grad_norm": 13.68972396850586, + "learning_rate": 1.2438608800240555e-05, + "loss": 2.0332, + "step": 3773 + }, + { + "epoch": 1.13, + "grad_norm": 12.880284309387207, + "learning_rate": 1.2436604189636164e-05, + "loss": 2.0553, + "step": 3774 + }, + { + "epoch": 1.13, + "grad_norm": 14.321048736572266, + "learning_rate": 1.2434599579031774e-05, + "loss": 1.2826, + "step": 3775 + }, + { + "epoch": 1.14, + "grad_norm": 22.247779846191406, + "learning_rate": 1.2432594968427385e-05, + "loss": 1.7716, + "step": 3776 + }, + { + "epoch": 1.14, + "grad_norm": 10.209996223449707, + "learning_rate": 1.2430590357822994e-05, + "loss": 1.5767, + "step": 3777 + }, + { + "epoch": 1.14, + "grad_norm": 12.013636589050293, + "learning_rate": 1.2428585747218604e-05, + "loss": 1.114, + "step": 3778 + }, + { + "epoch": 1.14, + "grad_norm": 51.70674514770508, + "learning_rate": 1.2426581136614212e-05, + "loss": 2.013, + "step": 3779 + }, + { + "epoch": 1.14, + "grad_norm": 18.054279327392578, + "learning_rate": 1.2424576526009824e-05, + "loss": 1.4477, + "step": 3780 + }, + { + "epoch": 1.14, + "grad_norm": 19.923124313354492, + "learning_rate": 1.2422571915405432e-05, + "loss": 1.3768, + "step": 3781 + }, + { + "epoch": 1.14, + "grad_norm": 23.104625701904297, + "learning_rate": 1.2420567304801043e-05, + "loss": 2.1138, + "step": 3782 + }, + { + "epoch": 1.14, + "grad_norm": 8.69080638885498, + "learning_rate": 1.2418562694196654e-05, + "loss": 1.1123, + "step": 3783 + }, + { + "epoch": 1.14, + "grad_norm": 15.912267684936523, + "learning_rate": 1.2416558083592263e-05, + "loss": 2.4516, + "step": 3784 + }, + { + "epoch": 1.14, + "grad_norm": 12.12271785736084, + "learning_rate": 1.2414553472987873e-05, + "loss": 1.4647, + "step": 3785 + }, + { + "epoch": 1.14, + "grad_norm": 9.104388236999512, + "learning_rate": 1.2412548862383483e-05, + "loss": 1.8779, + "step": 3786 + }, + { + "epoch": 1.14, + "grad_norm": 17.152158737182617, + "learning_rate": 1.2410544251779093e-05, + "loss": 1.4711, + "step": 3787 + }, + { + "epoch": 1.14, + "grad_norm": 9.907875061035156, + "learning_rate": 1.2408539641174703e-05, + "loss": 1.1329, + "step": 3788 + }, + { + "epoch": 1.14, + "grad_norm": 20.072593688964844, + "learning_rate": 1.2406535030570313e-05, + "loss": 1.9876, + "step": 3789 + }, + { + "epoch": 1.14, + "grad_norm": 14.026973724365234, + "learning_rate": 1.2404530419965923e-05, + "loss": 0.7741, + "step": 3790 + }, + { + "epoch": 1.14, + "grad_norm": 15.969365119934082, + "learning_rate": 1.2402525809361532e-05, + "loss": 1.6111, + "step": 3791 + }, + { + "epoch": 1.14, + "grad_norm": 15.787557601928711, + "learning_rate": 1.2400521198757143e-05, + "loss": 1.5455, + "step": 3792 + }, + { + "epoch": 1.14, + "grad_norm": 14.39417839050293, + "learning_rate": 1.2398516588152752e-05, + "loss": 1.0369, + "step": 3793 + }, + { + "epoch": 1.14, + "grad_norm": 13.072067260742188, + "learning_rate": 1.2396511977548362e-05, + "loss": 2.3197, + "step": 3794 + }, + { + "epoch": 1.14, + "grad_norm": 14.899435997009277, + "learning_rate": 1.2394507366943974e-05, + "loss": 2.7447, + "step": 3795 + }, + { + "epoch": 1.14, + "grad_norm": 15.155303001403809, + "learning_rate": 1.2392502756339582e-05, + "loss": 1.4796, + "step": 3796 + }, + { + "epoch": 1.14, + "grad_norm": 28.191987991333008, + "learning_rate": 1.2390498145735192e-05, + "loss": 1.7832, + "step": 3797 + }, + { + "epoch": 1.14, + "grad_norm": 14.950464248657227, + "learning_rate": 1.23884935351308e-05, + "loss": 2.1336, + "step": 3798 + }, + { + "epoch": 1.14, + "grad_norm": 21.477602005004883, + "learning_rate": 1.2386488924526412e-05, + "loss": 1.7525, + "step": 3799 + }, + { + "epoch": 1.14, + "grad_norm": 25.496963500976562, + "learning_rate": 1.238448431392202e-05, + "loss": 1.6689, + "step": 3800 + }, + { + "epoch": 1.14, + "grad_norm": 12.373186111450195, + "learning_rate": 1.238247970331763e-05, + "loss": 2.1899, + "step": 3801 + }, + { + "epoch": 1.14, + "grad_norm": 20.684795379638672, + "learning_rate": 1.2380475092713243e-05, + "loss": 2.6932, + "step": 3802 + }, + { + "epoch": 1.14, + "grad_norm": 9.77327823638916, + "learning_rate": 1.2378470482108851e-05, + "loss": 1.5063, + "step": 3803 + }, + { + "epoch": 1.14, + "grad_norm": 28.525667190551758, + "learning_rate": 1.2376465871504461e-05, + "loss": 1.532, + "step": 3804 + }, + { + "epoch": 1.14, + "grad_norm": 14.671927452087402, + "learning_rate": 1.2374461260900071e-05, + "loss": 2.0253, + "step": 3805 + }, + { + "epoch": 1.14, + "grad_norm": 30.834918975830078, + "learning_rate": 1.2372456650295681e-05, + "loss": 2.39, + "step": 3806 + }, + { + "epoch": 1.14, + "grad_norm": 16.612564086914062, + "learning_rate": 1.2370452039691291e-05, + "loss": 1.8819, + "step": 3807 + }, + { + "epoch": 1.14, + "grad_norm": 13.142746925354004, + "learning_rate": 1.2368447429086901e-05, + "loss": 1.6061, + "step": 3808 + }, + { + "epoch": 1.15, + "grad_norm": 18.517765045166016, + "learning_rate": 1.2366442818482511e-05, + "loss": 1.6218, + "step": 3809 + }, + { + "epoch": 1.15, + "grad_norm": 19.177627563476562, + "learning_rate": 1.236443820787812e-05, + "loss": 2.3172, + "step": 3810 + }, + { + "epoch": 1.15, + "grad_norm": 21.176328659057617, + "learning_rate": 1.2362433597273732e-05, + "loss": 1.8578, + "step": 3811 + }, + { + "epoch": 1.15, + "grad_norm": 21.381567001342773, + "learning_rate": 1.236042898666934e-05, + "loss": 1.5194, + "step": 3812 + }, + { + "epoch": 1.15, + "grad_norm": 11.9020414352417, + "learning_rate": 1.235842437606495e-05, + "loss": 1.6908, + "step": 3813 + }, + { + "epoch": 1.15, + "grad_norm": 39.47920608520508, + "learning_rate": 1.2356419765460562e-05, + "loss": 1.8726, + "step": 3814 + }, + { + "epoch": 1.15, + "grad_norm": 10.872967720031738, + "learning_rate": 1.235441515485617e-05, + "loss": 1.5569, + "step": 3815 + }, + { + "epoch": 1.15, + "grad_norm": 14.20240592956543, + "learning_rate": 1.235241054425178e-05, + "loss": 1.9525, + "step": 3816 + }, + { + "epoch": 1.15, + "grad_norm": 16.04691505432129, + "learning_rate": 1.2350405933647389e-05, + "loss": 1.4398, + "step": 3817 + }, + { + "epoch": 1.15, + "grad_norm": 19.002132415771484, + "learning_rate": 1.2348401323043e-05, + "loss": 1.9253, + "step": 3818 + }, + { + "epoch": 1.15, + "grad_norm": 9.962849617004395, + "learning_rate": 1.234639671243861e-05, + "loss": 2.0496, + "step": 3819 + }, + { + "epoch": 1.15, + "grad_norm": 10.417901039123535, + "learning_rate": 1.2344392101834219e-05, + "loss": 1.9296, + "step": 3820 + }, + { + "epoch": 1.15, + "grad_norm": 16.502777099609375, + "learning_rate": 1.234238749122983e-05, + "loss": 1.6108, + "step": 3821 + }, + { + "epoch": 1.15, + "grad_norm": 18.25497817993164, + "learning_rate": 1.234038288062544e-05, + "loss": 1.7453, + "step": 3822 + }, + { + "epoch": 1.15, + "grad_norm": 20.428895950317383, + "learning_rate": 1.233837827002105e-05, + "loss": 2.3424, + "step": 3823 + }, + { + "epoch": 1.15, + "grad_norm": 13.153669357299805, + "learning_rate": 1.2336373659416658e-05, + "loss": 1.3317, + "step": 3824 + }, + { + "epoch": 1.15, + "grad_norm": 13.739114761352539, + "learning_rate": 1.233436904881227e-05, + "loss": 1.4802, + "step": 3825 + }, + { + "epoch": 1.15, + "grad_norm": 17.079265594482422, + "learning_rate": 1.233236443820788e-05, + "loss": 1.6825, + "step": 3826 + }, + { + "epoch": 1.15, + "grad_norm": 15.86989974975586, + "learning_rate": 1.2330359827603488e-05, + "loss": 1.7154, + "step": 3827 + }, + { + "epoch": 1.15, + "grad_norm": 14.82314682006836, + "learning_rate": 1.23283552169991e-05, + "loss": 1.3117, + "step": 3828 + }, + { + "epoch": 1.15, + "grad_norm": 21.10222625732422, + "learning_rate": 1.2326350606394708e-05, + "loss": 1.4309, + "step": 3829 + }, + { + "epoch": 1.15, + "grad_norm": 15.547961235046387, + "learning_rate": 1.2324345995790318e-05, + "loss": 1.3606, + "step": 3830 + }, + { + "epoch": 1.15, + "grad_norm": 13.123003005981445, + "learning_rate": 1.232234138518593e-05, + "loss": 2.1664, + "step": 3831 + }, + { + "epoch": 1.15, + "grad_norm": 14.233585357666016, + "learning_rate": 1.2320336774581538e-05, + "loss": 1.9827, + "step": 3832 + }, + { + "epoch": 1.15, + "grad_norm": 8.499350547790527, + "learning_rate": 1.2318332163977148e-05, + "loss": 0.8647, + "step": 3833 + }, + { + "epoch": 1.15, + "grad_norm": 24.76041603088379, + "learning_rate": 1.2316327553372758e-05, + "loss": 2.5669, + "step": 3834 + }, + { + "epoch": 1.15, + "grad_norm": 11.308053970336914, + "learning_rate": 1.2314322942768369e-05, + "loss": 1.5768, + "step": 3835 + }, + { + "epoch": 1.15, + "grad_norm": 25.03925132751465, + "learning_rate": 1.2312318332163977e-05, + "loss": 1.7572, + "step": 3836 + }, + { + "epoch": 1.15, + "grad_norm": 19.462387084960938, + "learning_rate": 1.2310313721559589e-05, + "loss": 2.0888, + "step": 3837 + }, + { + "epoch": 1.15, + "grad_norm": 12.561948776245117, + "learning_rate": 1.2308309110955199e-05, + "loss": 1.1124, + "step": 3838 + }, + { + "epoch": 1.15, + "grad_norm": 34.78009796142578, + "learning_rate": 1.2306304500350807e-05, + "loss": 1.5277, + "step": 3839 + }, + { + "epoch": 1.15, + "grad_norm": 26.024934768676758, + "learning_rate": 1.2304299889746419e-05, + "loss": 2.0164, + "step": 3840 + }, + { + "epoch": 1.15, + "eval_loss": 0.21982242166996002, + "eval_runtime": 43.5261, + "eval_samples_per_second": 33.98, + "eval_steps_per_second": 33.98, + "step": 3840 + }, + { + "epoch": 1.15, + "grad_norm": 6.806481838226318, + "learning_rate": 1.2302295279142027e-05, + "loss": 0.7476, + "step": 3841 + }, + { + "epoch": 1.16, + "grad_norm": 18.72743797302246, + "learning_rate": 1.2300290668537637e-05, + "loss": 1.4762, + "step": 3842 + }, + { + "epoch": 1.16, + "grad_norm": 55.82676315307617, + "learning_rate": 1.2298286057933246e-05, + "loss": 1.809, + "step": 3843 + }, + { + "epoch": 1.16, + "grad_norm": 13.187909126281738, + "learning_rate": 1.2296281447328858e-05, + "loss": 1.9105, + "step": 3844 + }, + { + "epoch": 1.16, + "grad_norm": 11.694255828857422, + "learning_rate": 1.2294276836724468e-05, + "loss": 1.6571, + "step": 3845 + }, + { + "epoch": 1.16, + "grad_norm": 20.324060440063477, + "learning_rate": 1.2292272226120076e-05, + "loss": 1.3941, + "step": 3846 + }, + { + "epoch": 1.16, + "grad_norm": 13.865283012390137, + "learning_rate": 1.2290267615515688e-05, + "loss": 1.9631, + "step": 3847 + }, + { + "epoch": 1.16, + "grad_norm": 18.18462562561035, + "learning_rate": 1.2288263004911296e-05, + "loss": 2.1097, + "step": 3848 + }, + { + "epoch": 1.16, + "grad_norm": 12.211462020874023, + "learning_rate": 1.2286258394306906e-05, + "loss": 1.8528, + "step": 3849 + }, + { + "epoch": 1.16, + "grad_norm": 12.799270629882812, + "learning_rate": 1.2284253783702518e-05, + "loss": 1.9839, + "step": 3850 + }, + { + "epoch": 1.16, + "grad_norm": 36.99559020996094, + "learning_rate": 1.2282249173098127e-05, + "loss": 1.7886, + "step": 3851 + }, + { + "epoch": 1.16, + "grad_norm": 15.477313995361328, + "learning_rate": 1.2280244562493737e-05, + "loss": 1.6013, + "step": 3852 + }, + { + "epoch": 1.16, + "grad_norm": 12.97021198272705, + "learning_rate": 1.2278239951889345e-05, + "loss": 2.1934, + "step": 3853 + }, + { + "epoch": 1.16, + "grad_norm": 9.071000099182129, + "learning_rate": 1.2276235341284957e-05, + "loss": 1.2502, + "step": 3854 + }, + { + "epoch": 1.16, + "grad_norm": 13.090424537658691, + "learning_rate": 1.2274230730680565e-05, + "loss": 1.4692, + "step": 3855 + }, + { + "epoch": 1.16, + "grad_norm": 15.393147468566895, + "learning_rate": 1.2272226120076175e-05, + "loss": 1.1286, + "step": 3856 + }, + { + "epoch": 1.16, + "grad_norm": 38.615726470947266, + "learning_rate": 1.2270221509471787e-05, + "loss": 2.5387, + "step": 3857 + }, + { + "epoch": 1.16, + "grad_norm": 28.15321922302246, + "learning_rate": 1.2268216898867395e-05, + "loss": 2.187, + "step": 3858 + }, + { + "epoch": 1.16, + "grad_norm": 12.39357852935791, + "learning_rate": 1.2266212288263007e-05, + "loss": 2.2882, + "step": 3859 + }, + { + "epoch": 1.16, + "grad_norm": 30.955167770385742, + "learning_rate": 1.2264207677658616e-05, + "loss": 1.8665, + "step": 3860 + }, + { + "epoch": 1.16, + "grad_norm": 16.133106231689453, + "learning_rate": 1.2262203067054226e-05, + "loss": 1.8293, + "step": 3861 + }, + { + "epoch": 1.16, + "grad_norm": 7.856118202209473, + "learning_rate": 1.2260198456449837e-05, + "loss": 0.9778, + "step": 3862 + }, + { + "epoch": 1.16, + "grad_norm": 12.470169067382812, + "learning_rate": 1.2258193845845446e-05, + "loss": 1.4214, + "step": 3863 + }, + { + "epoch": 1.16, + "grad_norm": 19.56519317626953, + "learning_rate": 1.2256189235241056e-05, + "loss": 2.2586, + "step": 3864 + }, + { + "epoch": 1.16, + "grad_norm": 23.616268157958984, + "learning_rate": 1.2254184624636664e-05, + "loss": 2.7421, + "step": 3865 + }, + { + "epoch": 1.16, + "grad_norm": 15.930326461791992, + "learning_rate": 1.2252180014032276e-05, + "loss": 2.1892, + "step": 3866 + }, + { + "epoch": 1.16, + "grad_norm": 24.9573974609375, + "learning_rate": 1.2250175403427884e-05, + "loss": 2.7173, + "step": 3867 + }, + { + "epoch": 1.16, + "grad_norm": 8.64990234375, + "learning_rate": 1.2248170792823495e-05, + "loss": 1.8587, + "step": 3868 + }, + { + "epoch": 1.16, + "grad_norm": 29.846378326416016, + "learning_rate": 1.2246166182219106e-05, + "loss": 1.8677, + "step": 3869 + }, + { + "epoch": 1.16, + "grad_norm": 11.002440452575684, + "learning_rate": 1.2244161571614715e-05, + "loss": 1.6396, + "step": 3870 + }, + { + "epoch": 1.16, + "grad_norm": 16.57766342163086, + "learning_rate": 1.2242156961010325e-05, + "loss": 1.381, + "step": 3871 + }, + { + "epoch": 1.16, + "grad_norm": 16.003095626831055, + "learning_rate": 1.2240152350405933e-05, + "loss": 1.7541, + "step": 3872 + }, + { + "epoch": 1.16, + "grad_norm": 16.007123947143555, + "learning_rate": 1.2238147739801545e-05, + "loss": 1.9818, + "step": 3873 + }, + { + "epoch": 1.16, + "grad_norm": 16.965221405029297, + "learning_rate": 1.2236143129197155e-05, + "loss": 2.3105, + "step": 3874 + }, + { + "epoch": 1.17, + "grad_norm": 27.36821174621582, + "learning_rate": 1.2234138518592763e-05, + "loss": 2.4866, + "step": 3875 + }, + { + "epoch": 1.17, + "grad_norm": 13.598553657531738, + "learning_rate": 1.2232133907988375e-05, + "loss": 1.654, + "step": 3876 + }, + { + "epoch": 1.17, + "grad_norm": 15.706884384155273, + "learning_rate": 1.2230129297383984e-05, + "loss": 2.687, + "step": 3877 + }, + { + "epoch": 1.17, + "grad_norm": 16.47588348388672, + "learning_rate": 1.2228124686779594e-05, + "loss": 1.6308, + "step": 3878 + }, + { + "epoch": 1.17, + "grad_norm": 12.876527786254883, + "learning_rate": 1.2226120076175204e-05, + "loss": 1.4511, + "step": 3879 + }, + { + "epoch": 1.17, + "grad_norm": 20.67323112487793, + "learning_rate": 1.2224115465570814e-05, + "loss": 1.4801, + "step": 3880 + }, + { + "epoch": 1.17, + "grad_norm": 29.967750549316406, + "learning_rate": 1.2222110854966424e-05, + "loss": 2.2102, + "step": 3881 + }, + { + "epoch": 1.17, + "grad_norm": 12.949405670166016, + "learning_rate": 1.2220106244362034e-05, + "loss": 1.8167, + "step": 3882 + }, + { + "epoch": 1.17, + "grad_norm": 26.90522575378418, + "learning_rate": 1.2218101633757644e-05, + "loss": 1.7836, + "step": 3883 + }, + { + "epoch": 1.17, + "grad_norm": 14.43256950378418, + "learning_rate": 1.2216097023153253e-05, + "loss": 1.5144, + "step": 3884 + }, + { + "epoch": 1.17, + "grad_norm": 10.095649719238281, + "learning_rate": 1.2214092412548864e-05, + "loss": 0.8749, + "step": 3885 + }, + { + "epoch": 1.17, + "grad_norm": 7.504431247711182, + "learning_rate": 1.2212087801944473e-05, + "loss": 1.5074, + "step": 3886 + }, + { + "epoch": 1.17, + "grad_norm": 19.273103713989258, + "learning_rate": 1.2210083191340083e-05, + "loss": 2.6758, + "step": 3887 + }, + { + "epoch": 1.17, + "grad_norm": 24.052509307861328, + "learning_rate": 1.2208078580735695e-05, + "loss": 1.6407, + "step": 3888 + }, + { + "epoch": 1.17, + "grad_norm": 19.1132869720459, + "learning_rate": 1.2206073970131303e-05, + "loss": 1.7163, + "step": 3889 + }, + { + "epoch": 1.17, + "grad_norm": 51.50567626953125, + "learning_rate": 1.2204069359526913e-05, + "loss": 2.9385, + "step": 3890 + }, + { + "epoch": 1.17, + "grad_norm": 18.24009132385254, + "learning_rate": 1.2202064748922521e-05, + "loss": 1.7915, + "step": 3891 + }, + { + "epoch": 1.17, + "grad_norm": 14.289188385009766, + "learning_rate": 1.2200060138318133e-05, + "loss": 1.8599, + "step": 3892 + }, + { + "epoch": 1.17, + "grad_norm": 35.022804260253906, + "learning_rate": 1.2198055527713743e-05, + "loss": 2.9202, + "step": 3893 + }, + { + "epoch": 1.17, + "grad_norm": 25.463573455810547, + "learning_rate": 1.2196050917109352e-05, + "loss": 1.622, + "step": 3894 + }, + { + "epoch": 1.17, + "grad_norm": 10.089998245239258, + "learning_rate": 1.2194046306504963e-05, + "loss": 1.1327, + "step": 3895 + }, + { + "epoch": 1.17, + "grad_norm": 23.384336471557617, + "learning_rate": 1.2192041695900572e-05, + "loss": 1.2661, + "step": 3896 + }, + { + "epoch": 1.17, + "grad_norm": 12.663224220275879, + "learning_rate": 1.2190037085296182e-05, + "loss": 2.6986, + "step": 3897 + }, + { + "epoch": 1.17, + "grad_norm": 16.6176700592041, + "learning_rate": 1.218803247469179e-05, + "loss": 1.894, + "step": 3898 + }, + { + "epoch": 1.17, + "grad_norm": 12.314738273620605, + "learning_rate": 1.2186027864087402e-05, + "loss": 1.2068, + "step": 3899 + }, + { + "epoch": 1.17, + "grad_norm": 23.991886138916016, + "learning_rate": 1.2184023253483012e-05, + "loss": 1.6332, + "step": 3900 + }, + { + "epoch": 1.17, + "grad_norm": 23.119836807250977, + "learning_rate": 1.218201864287862e-05, + "loss": 2.1423, + "step": 3901 + }, + { + "epoch": 1.17, + "grad_norm": 29.719497680664062, + "learning_rate": 1.2180014032274232e-05, + "loss": 1.4554, + "step": 3902 + }, + { + "epoch": 1.17, + "grad_norm": 13.610468864440918, + "learning_rate": 1.217800942166984e-05, + "loss": 1.6478, + "step": 3903 + }, + { + "epoch": 1.17, + "grad_norm": 12.19708251953125, + "learning_rate": 1.217600481106545e-05, + "loss": 1.4697, + "step": 3904 + }, + { + "epoch": 1.17, + "grad_norm": 14.37850570678711, + "learning_rate": 1.2174000200461063e-05, + "loss": 1.2426, + "step": 3905 + }, + { + "epoch": 1.17, + "grad_norm": 16.169696807861328, + "learning_rate": 1.2171995589856671e-05, + "loss": 1.7116, + "step": 3906 + }, + { + "epoch": 1.17, + "grad_norm": 14.644316673278809, + "learning_rate": 1.2169990979252281e-05, + "loss": 1.5071, + "step": 3907 + }, + { + "epoch": 1.17, + "grad_norm": 10.753741264343262, + "learning_rate": 1.2167986368647891e-05, + "loss": 1.8289, + "step": 3908 + }, + { + "epoch": 1.18, + "grad_norm": 17.671937942504883, + "learning_rate": 1.2165981758043501e-05, + "loss": 1.3769, + "step": 3909 + }, + { + "epoch": 1.18, + "grad_norm": 10.696758270263672, + "learning_rate": 1.216397714743911e-05, + "loss": 1.7544, + "step": 3910 + }, + { + "epoch": 1.18, + "grad_norm": 10.894145011901855, + "learning_rate": 1.2161972536834721e-05, + "loss": 1.3638, + "step": 3911 + }, + { + "epoch": 1.18, + "grad_norm": 14.590895652770996, + "learning_rate": 1.2159967926230332e-05, + "loss": 1.4625, + "step": 3912 + }, + { + "epoch": 1.18, + "grad_norm": 20.419675827026367, + "learning_rate": 1.215796331562594e-05, + "loss": 2.223, + "step": 3913 + }, + { + "epoch": 1.18, + "grad_norm": 29.404043197631836, + "learning_rate": 1.2155958705021552e-05, + "loss": 1.6396, + "step": 3914 + }, + { + "epoch": 1.18, + "grad_norm": 24.126564025878906, + "learning_rate": 1.215395409441716e-05, + "loss": 1.6138, + "step": 3915 + }, + { + "epoch": 1.18, + "grad_norm": 13.458683967590332, + "learning_rate": 1.215194948381277e-05, + "loss": 2.0191, + "step": 3916 + }, + { + "epoch": 1.18, + "grad_norm": 9.993988037109375, + "learning_rate": 1.2149944873208382e-05, + "loss": 1.3779, + "step": 3917 + }, + { + "epoch": 1.18, + "grad_norm": 13.048202514648438, + "learning_rate": 1.214794026260399e-05, + "loss": 1.8113, + "step": 3918 + }, + { + "epoch": 1.18, + "grad_norm": 11.234829902648926, + "learning_rate": 1.21459356519996e-05, + "loss": 1.4256, + "step": 3919 + }, + { + "epoch": 1.18, + "grad_norm": 25.56989860534668, + "learning_rate": 1.2143931041395209e-05, + "loss": 2.0883, + "step": 3920 + }, + { + "epoch": 1.18, + "grad_norm": 12.80074405670166, + "learning_rate": 1.214192643079082e-05, + "loss": 1.7903, + "step": 3921 + }, + { + "epoch": 1.18, + "grad_norm": 26.22979736328125, + "learning_rate": 1.2139921820186429e-05, + "loss": 2.3038, + "step": 3922 + }, + { + "epoch": 1.18, + "grad_norm": 15.948308944702148, + "learning_rate": 1.2137917209582039e-05, + "loss": 1.9847, + "step": 3923 + }, + { + "epoch": 1.18, + "grad_norm": 20.231281280517578, + "learning_rate": 1.213591259897765e-05, + "loss": 2.6807, + "step": 3924 + }, + { + "epoch": 1.18, + "grad_norm": 26.88908576965332, + "learning_rate": 1.213390798837326e-05, + "loss": 2.2021, + "step": 3925 + }, + { + "epoch": 1.18, + "grad_norm": 9.647951126098633, + "learning_rate": 1.213190337776887e-05, + "loss": 1.7718, + "step": 3926 + }, + { + "epoch": 1.18, + "grad_norm": 25.476966857910156, + "learning_rate": 1.212989876716448e-05, + "loss": 1.7512, + "step": 3927 + }, + { + "epoch": 1.18, + "grad_norm": 17.493688583374023, + "learning_rate": 1.212789415656009e-05, + "loss": 2.0739, + "step": 3928 + }, + { + "epoch": 1.18, + "grad_norm": 18.608076095581055, + "learning_rate": 1.2125889545955698e-05, + "loss": 1.8991, + "step": 3929 + }, + { + "epoch": 1.18, + "grad_norm": 26.32684326171875, + "learning_rate": 1.212388493535131e-05, + "loss": 1.3218, + "step": 3930 + }, + { + "epoch": 1.18, + "grad_norm": 14.290658950805664, + "learning_rate": 1.212188032474692e-05, + "loss": 1.6064, + "step": 3931 + }, + { + "epoch": 1.18, + "grad_norm": 8.176145553588867, + "learning_rate": 1.2119875714142528e-05, + "loss": 1.101, + "step": 3932 + }, + { + "epoch": 1.18, + "grad_norm": 77.52426147460938, + "learning_rate": 1.211787110353814e-05, + "loss": 1.8667, + "step": 3933 + }, + { + "epoch": 1.18, + "grad_norm": 13.218364715576172, + "learning_rate": 1.2115866492933748e-05, + "loss": 2.0845, + "step": 3934 + }, + { + "epoch": 1.18, + "grad_norm": 13.180416107177734, + "learning_rate": 1.2113861882329358e-05, + "loss": 1.7175, + "step": 3935 + }, + { + "epoch": 1.18, + "grad_norm": 27.077486038208008, + "learning_rate": 1.211185727172497e-05, + "loss": 1.5133, + "step": 3936 + }, + { + "epoch": 1.18, + "grad_norm": 14.540903091430664, + "learning_rate": 1.2109852661120579e-05, + "loss": 2.1343, + "step": 3937 + }, + { + "epoch": 1.18, + "grad_norm": 18.087007522583008, + "learning_rate": 1.2107848050516189e-05, + "loss": 1.6534, + "step": 3938 + }, + { + "epoch": 1.18, + "grad_norm": 18.4873046875, + "learning_rate": 1.2105843439911797e-05, + "loss": 1.8601, + "step": 3939 + }, + { + "epoch": 1.18, + "grad_norm": 11.12307071685791, + "learning_rate": 1.2103838829307409e-05, + "loss": 1.4969, + "step": 3940 + }, + { + "epoch": 1.18, + "grad_norm": 22.688817977905273, + "learning_rate": 1.2101834218703017e-05, + "loss": 2.0138, + "step": 3941 + }, + { + "epoch": 1.19, + "grad_norm": 32.38698196411133, + "learning_rate": 1.2099829608098627e-05, + "loss": 3.5612, + "step": 3942 + }, + { + "epoch": 1.19, + "grad_norm": 40.410884857177734, + "learning_rate": 1.2097824997494239e-05, + "loss": 2.6222, + "step": 3943 + }, + { + "epoch": 1.19, + "grad_norm": 11.373454093933105, + "learning_rate": 1.2095820386889847e-05, + "loss": 1.8325, + "step": 3944 + }, + { + "epoch": 1.19, + "grad_norm": 57.750003814697266, + "learning_rate": 1.2093815776285458e-05, + "loss": 2.7973, + "step": 3945 + }, + { + "epoch": 1.19, + "grad_norm": 18.441944122314453, + "learning_rate": 1.2091811165681066e-05, + "loss": 2.9912, + "step": 3946 + }, + { + "epoch": 1.19, + "grad_norm": 32.177513122558594, + "learning_rate": 1.2089806555076678e-05, + "loss": 1.9056, + "step": 3947 + }, + { + "epoch": 1.19, + "grad_norm": 22.477294921875, + "learning_rate": 1.2087801944472288e-05, + "loss": 2.2548, + "step": 3948 + }, + { + "epoch": 1.19, + "grad_norm": 20.485315322875977, + "learning_rate": 1.2085797333867896e-05, + "loss": 1.4721, + "step": 3949 + }, + { + "epoch": 1.19, + "grad_norm": 14.799079895019531, + "learning_rate": 1.2083792723263508e-05, + "loss": 1.8698, + "step": 3950 + }, + { + "epoch": 1.19, + "grad_norm": 13.152766227722168, + "learning_rate": 1.2081788112659116e-05, + "loss": 2.6729, + "step": 3951 + }, + { + "epoch": 1.19, + "grad_norm": 10.631035804748535, + "learning_rate": 1.2079783502054726e-05, + "loss": 1.7254, + "step": 3952 + }, + { + "epoch": 1.19, + "grad_norm": 15.786643981933594, + "learning_rate": 1.2077778891450336e-05, + "loss": 1.1798, + "step": 3953 + }, + { + "epoch": 1.19, + "grad_norm": 12.110862731933594, + "learning_rate": 1.2075774280845947e-05, + "loss": 1.4401, + "step": 3954 + }, + { + "epoch": 1.19, + "grad_norm": 15.332069396972656, + "learning_rate": 1.2073769670241557e-05, + "loss": 2.4265, + "step": 3955 + }, + { + "epoch": 1.19, + "grad_norm": 26.413911819458008, + "learning_rate": 1.2071765059637167e-05, + "loss": 1.7556, + "step": 3956 + }, + { + "epoch": 1.19, + "grad_norm": 10.954570770263672, + "learning_rate": 1.2069760449032777e-05, + "loss": 0.8213, + "step": 3957 + }, + { + "epoch": 1.19, + "grad_norm": 20.662534713745117, + "learning_rate": 1.2067755838428385e-05, + "loss": 1.6746, + "step": 3958 + }, + { + "epoch": 1.19, + "grad_norm": 9.891891479492188, + "learning_rate": 1.2065751227823997e-05, + "loss": 1.3557, + "step": 3959 + }, + { + "epoch": 1.19, + "grad_norm": 13.939960479736328, + "learning_rate": 1.2063746617219607e-05, + "loss": 1.6665, + "step": 3960 + }, + { + "epoch": 1.19, + "eval_loss": 0.22422289848327637, + "eval_runtime": 43.495, + "eval_samples_per_second": 34.004, + "eval_steps_per_second": 34.004, + "step": 3960 + }, + { + "epoch": 1.19, + "grad_norm": 50.82015609741211, + "learning_rate": 1.2061742006615215e-05, + "loss": 2.8102, + "step": 3961 + }, + { + "epoch": 1.19, + "grad_norm": 10.213366508483887, + "learning_rate": 1.2059737396010827e-05, + "loss": 1.7045, + "step": 3962 + }, + { + "epoch": 1.19, + "grad_norm": 9.090950965881348, + "learning_rate": 1.2057732785406436e-05, + "loss": 1.0682, + "step": 3963 + }, + { + "epoch": 1.19, + "grad_norm": 14.20166015625, + "learning_rate": 1.2055728174802046e-05, + "loss": 1.6642, + "step": 3964 + }, + { + "epoch": 1.19, + "grad_norm": 14.564234733581543, + "learning_rate": 1.2053723564197654e-05, + "loss": 1.6248, + "step": 3965 + }, + { + "epoch": 1.19, + "grad_norm": 16.231090545654297, + "learning_rate": 1.2051718953593266e-05, + "loss": 1.2251, + "step": 3966 + }, + { + "epoch": 1.19, + "grad_norm": 15.789569854736328, + "learning_rate": 1.2049714342988876e-05, + "loss": 2.4118, + "step": 3967 + }, + { + "epoch": 1.19, + "grad_norm": 18.06235122680664, + "learning_rate": 1.2047709732384484e-05, + "loss": 1.4758, + "step": 3968 + }, + { + "epoch": 1.19, + "grad_norm": 18.72103500366211, + "learning_rate": 1.2045705121780096e-05, + "loss": 1.416, + "step": 3969 + }, + { + "epoch": 1.19, + "grad_norm": 67.29039001464844, + "learning_rate": 1.2043700511175705e-05, + "loss": 3.1432, + "step": 3970 + }, + { + "epoch": 1.19, + "grad_norm": 9.234892845153809, + "learning_rate": 1.2041695900571315e-05, + "loss": 1.0072, + "step": 3971 + }, + { + "epoch": 1.19, + "grad_norm": 16.655811309814453, + "learning_rate": 1.2039691289966923e-05, + "loss": 1.4894, + "step": 3972 + }, + { + "epoch": 1.19, + "grad_norm": 24.546369552612305, + "learning_rate": 1.2037686679362535e-05, + "loss": 1.6944, + "step": 3973 + }, + { + "epoch": 1.19, + "grad_norm": 22.591285705566406, + "learning_rate": 1.2035682068758145e-05, + "loss": 1.4835, + "step": 3974 + }, + { + "epoch": 1.2, + "grad_norm": 19.460481643676758, + "learning_rate": 1.2033677458153753e-05, + "loss": 2.3772, + "step": 3975 + }, + { + "epoch": 1.2, + "grad_norm": 10.695039749145508, + "learning_rate": 1.2031672847549365e-05, + "loss": 1.3966, + "step": 3976 + }, + { + "epoch": 1.2, + "grad_norm": 16.96231460571289, + "learning_rate": 1.2029668236944973e-05, + "loss": 1.5844, + "step": 3977 + }, + { + "epoch": 1.2, + "grad_norm": 13.368218421936035, + "learning_rate": 1.2027663626340585e-05, + "loss": 2.2137, + "step": 3978 + }, + { + "epoch": 1.2, + "grad_norm": 16.881895065307617, + "learning_rate": 1.2025659015736195e-05, + "loss": 1.809, + "step": 3979 + }, + { + "epoch": 1.2, + "grad_norm": 16.86216926574707, + "learning_rate": 1.2023654405131804e-05, + "loss": 1.7264, + "step": 3980 + }, + { + "epoch": 1.2, + "grad_norm": 19.68351173400879, + "learning_rate": 1.2021649794527415e-05, + "loss": 1.4559, + "step": 3981 + }, + { + "epoch": 1.2, + "grad_norm": 27.27216339111328, + "learning_rate": 1.2019645183923024e-05, + "loss": 1.6525, + "step": 3982 + }, + { + "epoch": 1.2, + "grad_norm": 48.12042999267578, + "learning_rate": 1.2017640573318634e-05, + "loss": 2.7933, + "step": 3983 + }, + { + "epoch": 1.2, + "grad_norm": 17.5889892578125, + "learning_rate": 1.2015635962714242e-05, + "loss": 1.2232, + "step": 3984 + }, + { + "epoch": 1.2, + "grad_norm": 15.102045059204102, + "learning_rate": 1.2013631352109854e-05, + "loss": 2.1175, + "step": 3985 + }, + { + "epoch": 1.2, + "grad_norm": 15.093314170837402, + "learning_rate": 1.2011626741505464e-05, + "loss": 2.0942, + "step": 3986 + }, + { + "epoch": 1.2, + "grad_norm": 14.149928092956543, + "learning_rate": 1.2009622130901073e-05, + "loss": 1.4488, + "step": 3987 + }, + { + "epoch": 1.2, + "grad_norm": 23.17877197265625, + "learning_rate": 1.2007617520296684e-05, + "loss": 2.1352, + "step": 3988 + }, + { + "epoch": 1.2, + "grad_norm": 11.108948707580566, + "learning_rate": 1.2005612909692293e-05, + "loss": 1.4723, + "step": 3989 + }, + { + "epoch": 1.2, + "grad_norm": 41.934715270996094, + "learning_rate": 1.2003608299087903e-05, + "loss": 1.3572, + "step": 3990 + }, + { + "epoch": 1.2, + "grad_norm": 30.752838134765625, + "learning_rate": 1.2001603688483515e-05, + "loss": 1.7268, + "step": 3991 + }, + { + "epoch": 1.2, + "grad_norm": 19.63275718688965, + "learning_rate": 1.1999599077879123e-05, + "loss": 1.8781, + "step": 3992 + }, + { + "epoch": 1.2, + "grad_norm": 16.867294311523438, + "learning_rate": 1.1997594467274733e-05, + "loss": 1.2249, + "step": 3993 + }, + { + "epoch": 1.2, + "grad_norm": 12.371650695800781, + "learning_rate": 1.1995589856670341e-05, + "loss": 2.7085, + "step": 3994 + }, + { + "epoch": 1.2, + "grad_norm": 14.14538860321045, + "learning_rate": 1.1993585246065953e-05, + "loss": 1.7304, + "step": 3995 + }, + { + "epoch": 1.2, + "grad_norm": 28.63239860534668, + "learning_rate": 1.1991580635461562e-05, + "loss": 1.6735, + "step": 3996 + }, + { + "epoch": 1.2, + "grad_norm": 10.542112350463867, + "learning_rate": 1.1989576024857172e-05, + "loss": 2.628, + "step": 3997 + }, + { + "epoch": 1.2, + "grad_norm": 15.562253952026367, + "learning_rate": 1.1987571414252784e-05, + "loss": 1.8914, + "step": 3998 + }, + { + "epoch": 1.2, + "grad_norm": 14.955403327941895, + "learning_rate": 1.1985566803648392e-05, + "loss": 1.4507, + "step": 3999 + }, + { + "epoch": 1.2, + "grad_norm": 8.927510261535645, + "learning_rate": 1.1983562193044002e-05, + "loss": 1.8247, + "step": 4000 + }, + { + "epoch": 1.2, + "grad_norm": 40.698150634765625, + "learning_rate": 1.1981557582439612e-05, + "loss": 2.7692, + "step": 4001 + }, + { + "epoch": 1.2, + "grad_norm": 8.478541374206543, + "learning_rate": 1.1979552971835222e-05, + "loss": 1.0191, + "step": 4002 + }, + { + "epoch": 1.2, + "grad_norm": 29.216922760009766, + "learning_rate": 1.1977548361230832e-05, + "loss": 1.7092, + "step": 4003 + }, + { + "epoch": 1.2, + "grad_norm": 11.635241508483887, + "learning_rate": 1.1975543750626442e-05, + "loss": 1.7223, + "step": 4004 + }, + { + "epoch": 1.2, + "grad_norm": 23.52641487121582, + "learning_rate": 1.1973539140022052e-05, + "loss": 1.6989, + "step": 4005 + }, + { + "epoch": 1.2, + "grad_norm": 18.005016326904297, + "learning_rate": 1.197153452941766e-05, + "loss": 2.5197, + "step": 4006 + }, + { + "epoch": 1.2, + "grad_norm": 59.00607681274414, + "learning_rate": 1.1969529918813273e-05, + "loss": 3.0074, + "step": 4007 + }, + { + "epoch": 1.21, + "grad_norm": 23.461963653564453, + "learning_rate": 1.1967525308208881e-05, + "loss": 2.5943, + "step": 4008 + }, + { + "epoch": 1.21, + "grad_norm": 30.815887451171875, + "learning_rate": 1.1965520697604491e-05, + "loss": 1.9424, + "step": 4009 + }, + { + "epoch": 1.21, + "grad_norm": 29.489526748657227, + "learning_rate": 1.1963516087000103e-05, + "loss": 2.0126, + "step": 4010 + }, + { + "epoch": 1.21, + "grad_norm": 7.134322166442871, + "learning_rate": 1.1961511476395711e-05, + "loss": 1.2801, + "step": 4011 + }, + { + "epoch": 1.21, + "grad_norm": 12.04995059967041, + "learning_rate": 1.1959506865791321e-05, + "loss": 1.0723, + "step": 4012 + }, + { + "epoch": 1.21, + "grad_norm": 13.849966049194336, + "learning_rate": 1.195750225518693e-05, + "loss": 1.0122, + "step": 4013 + }, + { + "epoch": 1.21, + "grad_norm": 7.143357753753662, + "learning_rate": 1.1955497644582541e-05, + "loss": 1.961, + "step": 4014 + }, + { + "epoch": 1.21, + "grad_norm": 16.170339584350586, + "learning_rate": 1.195349303397815e-05, + "loss": 1.9324, + "step": 4015 + }, + { + "epoch": 1.21, + "grad_norm": 22.960071563720703, + "learning_rate": 1.195148842337376e-05, + "loss": 2.5535, + "step": 4016 + }, + { + "epoch": 1.21, + "grad_norm": 15.098960876464844, + "learning_rate": 1.1949483812769372e-05, + "loss": 1.142, + "step": 4017 + }, + { + "epoch": 1.21, + "grad_norm": 19.63709831237793, + "learning_rate": 1.194747920216498e-05, + "loss": 2.072, + "step": 4018 + }, + { + "epoch": 1.21, + "grad_norm": 14.952527046203613, + "learning_rate": 1.194547459156059e-05, + "loss": 1.2427, + "step": 4019 + }, + { + "epoch": 1.21, + "grad_norm": 13.70450210571289, + "learning_rate": 1.1943469980956199e-05, + "loss": 1.8315, + "step": 4020 + }, + { + "epoch": 1.21, + "grad_norm": 13.074018478393555, + "learning_rate": 1.194146537035181e-05, + "loss": 0.9992, + "step": 4021 + }, + { + "epoch": 1.21, + "grad_norm": 12.277508735656738, + "learning_rate": 1.193946075974742e-05, + "loss": 1.8067, + "step": 4022 + }, + { + "epoch": 1.21, + "grad_norm": 27.505800247192383, + "learning_rate": 1.1937456149143029e-05, + "loss": 2.1141, + "step": 4023 + }, + { + "epoch": 1.21, + "grad_norm": 18.097551345825195, + "learning_rate": 1.193545153853864e-05, + "loss": 1.631, + "step": 4024 + }, + { + "epoch": 1.21, + "grad_norm": 15.072179794311523, + "learning_rate": 1.1933446927934249e-05, + "loss": 1.5262, + "step": 4025 + }, + { + "epoch": 1.21, + "grad_norm": 13.110054016113281, + "learning_rate": 1.1931442317329859e-05, + "loss": 1.4835, + "step": 4026 + }, + { + "epoch": 1.21, + "grad_norm": 11.2367525100708, + "learning_rate": 1.192943770672547e-05, + "loss": 1.5453, + "step": 4027 + }, + { + "epoch": 1.21, + "grad_norm": 19.51766014099121, + "learning_rate": 1.192743309612108e-05, + "loss": 1.4438, + "step": 4028 + }, + { + "epoch": 1.21, + "grad_norm": 58.76639938354492, + "learning_rate": 1.1925428485516691e-05, + "loss": 2.2727, + "step": 4029 + }, + { + "epoch": 1.21, + "grad_norm": 16.14882469177246, + "learning_rate": 1.19234238749123e-05, + "loss": 1.4155, + "step": 4030 + }, + { + "epoch": 1.21, + "grad_norm": 9.466064453125, + "learning_rate": 1.192141926430791e-05, + "loss": 1.3346, + "step": 4031 + }, + { + "epoch": 1.21, + "grad_norm": 16.83454704284668, + "learning_rate": 1.1919414653703518e-05, + "loss": 1.3689, + "step": 4032 + }, + { + "epoch": 1.21, + "grad_norm": 11.721967697143555, + "learning_rate": 1.191741004309913e-05, + "loss": 1.5468, + "step": 4033 + }, + { + "epoch": 1.21, + "grad_norm": 9.986775398254395, + "learning_rate": 1.191540543249474e-05, + "loss": 2.5393, + "step": 4034 + }, + { + "epoch": 1.21, + "grad_norm": 11.999430656433105, + "learning_rate": 1.1913400821890348e-05, + "loss": 1.7203, + "step": 4035 + }, + { + "epoch": 1.21, + "grad_norm": 15.086469650268555, + "learning_rate": 1.191139621128596e-05, + "loss": 1.8978, + "step": 4036 + }, + { + "epoch": 1.21, + "grad_norm": 20.604961395263672, + "learning_rate": 1.1909391600681568e-05, + "loss": 1.0557, + "step": 4037 + }, + { + "epoch": 1.21, + "grad_norm": 11.977767944335938, + "learning_rate": 1.1907386990077178e-05, + "loss": 1.9318, + "step": 4038 + }, + { + "epoch": 1.21, + "grad_norm": 17.97518539428711, + "learning_rate": 1.1905382379472787e-05, + "loss": 0.948, + "step": 4039 + }, + { + "epoch": 1.21, + "grad_norm": 9.713574409484863, + "learning_rate": 1.1903377768868399e-05, + "loss": 1.0446, + "step": 4040 + }, + { + "epoch": 1.21, + "grad_norm": 18.080841064453125, + "learning_rate": 1.1901373158264009e-05, + "loss": 1.703, + "step": 4041 + }, + { + "epoch": 1.22, + "grad_norm": 35.3074951171875, + "learning_rate": 1.1899368547659617e-05, + "loss": 2.8692, + "step": 4042 + }, + { + "epoch": 1.22, + "grad_norm": 12.872424125671387, + "learning_rate": 1.1897363937055229e-05, + "loss": 1.2558, + "step": 4043 + }, + { + "epoch": 1.22, + "grad_norm": 20.24639129638672, + "learning_rate": 1.1895359326450837e-05, + "loss": 2.013, + "step": 4044 + }, + { + "epoch": 1.22, + "grad_norm": 19.4423885345459, + "learning_rate": 1.1893354715846447e-05, + "loss": 1.2018, + "step": 4045 + }, + { + "epoch": 1.22, + "grad_norm": 28.46637725830078, + "learning_rate": 1.1891350105242059e-05, + "loss": 2.3941, + "step": 4046 + }, + { + "epoch": 1.22, + "grad_norm": 15.011366844177246, + "learning_rate": 1.1889345494637667e-05, + "loss": 1.5718, + "step": 4047 + }, + { + "epoch": 1.22, + "grad_norm": 30.27291488647461, + "learning_rate": 1.1887340884033278e-05, + "loss": 2.3006, + "step": 4048 + }, + { + "epoch": 1.22, + "grad_norm": 18.10991859436035, + "learning_rate": 1.1885336273428888e-05, + "loss": 1.7672, + "step": 4049 + }, + { + "epoch": 1.22, + "grad_norm": 34.54472732543945, + "learning_rate": 1.1883331662824498e-05, + "loss": 2.484, + "step": 4050 + }, + { + "epoch": 1.22, + "grad_norm": 12.643610954284668, + "learning_rate": 1.1881327052220106e-05, + "loss": 2.0557, + "step": 4051 + }, + { + "epoch": 1.22, + "grad_norm": 12.419498443603516, + "learning_rate": 1.1879322441615718e-05, + "loss": 1.8953, + "step": 4052 + }, + { + "epoch": 1.22, + "grad_norm": 9.909022331237793, + "learning_rate": 1.1877317831011328e-05, + "loss": 0.9697, + "step": 4053 + }, + { + "epoch": 1.22, + "grad_norm": 213.60519409179688, + "learning_rate": 1.1875313220406936e-05, + "loss": 2.7859, + "step": 4054 + }, + { + "epoch": 1.22, + "grad_norm": 43.98239517211914, + "learning_rate": 1.1873308609802548e-05, + "loss": 2.0812, + "step": 4055 + }, + { + "epoch": 1.22, + "grad_norm": 22.185237884521484, + "learning_rate": 1.1871303999198157e-05, + "loss": 1.778, + "step": 4056 + }, + { + "epoch": 1.22, + "grad_norm": 20.283220291137695, + "learning_rate": 1.1869299388593767e-05, + "loss": 1.6555, + "step": 4057 + }, + { + "epoch": 1.22, + "grad_norm": 52.456600189208984, + "learning_rate": 1.1867294777989375e-05, + "loss": 2.7659, + "step": 4058 + }, + { + "epoch": 1.22, + "grad_norm": 27.247201919555664, + "learning_rate": 1.1865290167384987e-05, + "loss": 1.9769, + "step": 4059 + }, + { + "epoch": 1.22, + "grad_norm": 19.406150817871094, + "learning_rate": 1.1863285556780597e-05, + "loss": 2.0779, + "step": 4060 + }, + { + "epoch": 1.22, + "grad_norm": 7.287651062011719, + "learning_rate": 1.1861280946176205e-05, + "loss": 1.3887, + "step": 4061 + }, + { + "epoch": 1.22, + "grad_norm": 12.292468070983887, + "learning_rate": 1.1859276335571817e-05, + "loss": 1.7488, + "step": 4062 + }, + { + "epoch": 1.22, + "grad_norm": 9.17116928100586, + "learning_rate": 1.1857271724967425e-05, + "loss": 1.2331, + "step": 4063 + }, + { + "epoch": 1.22, + "grad_norm": 12.708617210388184, + "learning_rate": 1.1855267114363036e-05, + "loss": 1.5956, + "step": 4064 + }, + { + "epoch": 1.22, + "grad_norm": 19.217905044555664, + "learning_rate": 1.1853262503758647e-05, + "loss": 3.3333, + "step": 4065 + }, + { + "epoch": 1.22, + "grad_norm": 9.486623764038086, + "learning_rate": 1.1851257893154256e-05, + "loss": 1.1654, + "step": 4066 + }, + { + "epoch": 1.22, + "grad_norm": 15.519346237182617, + "learning_rate": 1.1849253282549866e-05, + "loss": 1.3905, + "step": 4067 + }, + { + "epoch": 1.22, + "grad_norm": 27.3670597076416, + "learning_rate": 1.1847248671945474e-05, + "loss": 1.8644, + "step": 4068 + }, + { + "epoch": 1.22, + "grad_norm": 12.652219772338867, + "learning_rate": 1.1845244061341086e-05, + "loss": 1.7432, + "step": 4069 + }, + { + "epoch": 1.22, + "grad_norm": 7.705514430999756, + "learning_rate": 1.1843239450736694e-05, + "loss": 0.8268, + "step": 4070 + }, + { + "epoch": 1.22, + "grad_norm": 13.351577758789062, + "learning_rate": 1.1841234840132304e-05, + "loss": 2.1508, + "step": 4071 + }, + { + "epoch": 1.22, + "grad_norm": 44.65596008300781, + "learning_rate": 1.1839230229527916e-05, + "loss": 2.0205, + "step": 4072 + }, + { + "epoch": 1.22, + "grad_norm": 24.321109771728516, + "learning_rate": 1.1837225618923525e-05, + "loss": 1.7858, + "step": 4073 + }, + { + "epoch": 1.22, + "grad_norm": 48.96630096435547, + "learning_rate": 1.1835221008319135e-05, + "loss": 1.9235, + "step": 4074 + }, + { + "epoch": 1.23, + "grad_norm": 16.92742156982422, + "learning_rate": 1.1833216397714745e-05, + "loss": 1.7837, + "step": 4075 + }, + { + "epoch": 1.23, + "grad_norm": 42.10576629638672, + "learning_rate": 1.1831211787110355e-05, + "loss": 3.2812, + "step": 4076 + }, + { + "epoch": 1.23, + "grad_norm": 13.819901466369629, + "learning_rate": 1.1829207176505965e-05, + "loss": 1.4294, + "step": 4077 + }, + { + "epoch": 1.23, + "grad_norm": 11.949419021606445, + "learning_rate": 1.1827202565901575e-05, + "loss": 1.3175, + "step": 4078 + }, + { + "epoch": 1.23, + "grad_norm": 23.273836135864258, + "learning_rate": 1.1825197955297185e-05, + "loss": 2.5785, + "step": 4079 + }, + { + "epoch": 1.23, + "grad_norm": 15.00057601928711, + "learning_rate": 1.1823193344692793e-05, + "loss": 1.4673, + "step": 4080 + }, + { + "epoch": 1.23, + "eval_loss": 0.22429148852825165, + "eval_runtime": 43.6464, + "eval_samples_per_second": 33.886, + "eval_steps_per_second": 33.886, + "step": 4080 + }, + { + "epoch": 1.23, + "grad_norm": 16.4818172454834, + "learning_rate": 1.1821188734088405e-05, + "loss": 1.8456, + "step": 4081 + }, + { + "epoch": 1.23, + "grad_norm": 9.311663627624512, + "learning_rate": 1.1819184123484014e-05, + "loss": 1.5563, + "step": 4082 + }, + { + "epoch": 1.23, + "grad_norm": 19.122093200683594, + "learning_rate": 1.1817179512879624e-05, + "loss": 2.2633, + "step": 4083 + }, + { + "epoch": 1.23, + "grad_norm": 33.5509033203125, + "learning_rate": 1.1815174902275236e-05, + "loss": 1.9949, + "step": 4084 + }, + { + "epoch": 1.23, + "grad_norm": 14.471306800842285, + "learning_rate": 1.1813170291670844e-05, + "loss": 1.8224, + "step": 4085 + }, + { + "epoch": 1.23, + "grad_norm": 11.821321487426758, + "learning_rate": 1.1811165681066454e-05, + "loss": 1.8433, + "step": 4086 + }, + { + "epoch": 1.23, + "grad_norm": 26.970781326293945, + "learning_rate": 1.1809161070462062e-05, + "loss": 2.336, + "step": 4087 + }, + { + "epoch": 1.23, + "grad_norm": 18.373268127441406, + "learning_rate": 1.1807156459857674e-05, + "loss": 1.4686, + "step": 4088 + }, + { + "epoch": 1.23, + "grad_norm": 17.756778717041016, + "learning_rate": 1.1805151849253284e-05, + "loss": 1.4351, + "step": 4089 + }, + { + "epoch": 1.23, + "grad_norm": 21.345794677734375, + "learning_rate": 1.1803147238648893e-05, + "loss": 1.7058, + "step": 4090 + }, + { + "epoch": 1.23, + "grad_norm": 99.40449523925781, + "learning_rate": 1.1801142628044504e-05, + "loss": 2.2699, + "step": 4091 + }, + { + "epoch": 1.23, + "grad_norm": 29.920364379882812, + "learning_rate": 1.1799138017440113e-05, + "loss": 1.7795, + "step": 4092 + }, + { + "epoch": 1.23, + "grad_norm": 16.23194122314453, + "learning_rate": 1.1797133406835723e-05, + "loss": 1.8951, + "step": 4093 + }, + { + "epoch": 1.23, + "grad_norm": 16.518890380859375, + "learning_rate": 1.1795128796231331e-05, + "loss": 2.0618, + "step": 4094 + }, + { + "epoch": 1.23, + "grad_norm": 15.834227561950684, + "learning_rate": 1.1793124185626943e-05, + "loss": 1.3648, + "step": 4095 + }, + { + "epoch": 1.23, + "grad_norm": 21.73334503173828, + "learning_rate": 1.1791119575022553e-05, + "loss": 1.6835, + "step": 4096 + }, + { + "epoch": 1.23, + "grad_norm": 16.20547103881836, + "learning_rate": 1.1789114964418163e-05, + "loss": 2.8748, + "step": 4097 + }, + { + "epoch": 1.23, + "grad_norm": 13.940154075622559, + "learning_rate": 1.1787110353813773e-05, + "loss": 1.7865, + "step": 4098 + }, + { + "epoch": 1.23, + "grad_norm": 10.902242660522461, + "learning_rate": 1.1785105743209382e-05, + "loss": 1.4294, + "step": 4099 + }, + { + "epoch": 1.23, + "grad_norm": 20.06895637512207, + "learning_rate": 1.1783101132604993e-05, + "loss": 2.0791, + "step": 4100 + }, + { + "epoch": 1.23, + "grad_norm": 21.698528289794922, + "learning_rate": 1.1781096522000602e-05, + "loss": 1.8301, + "step": 4101 + }, + { + "epoch": 1.23, + "grad_norm": 50.30092239379883, + "learning_rate": 1.1779091911396212e-05, + "loss": 2.0178, + "step": 4102 + }, + { + "epoch": 1.23, + "grad_norm": 29.254417419433594, + "learning_rate": 1.1777087300791824e-05, + "loss": 1.7604, + "step": 4103 + }, + { + "epoch": 1.23, + "grad_norm": 13.801785469055176, + "learning_rate": 1.1775082690187432e-05, + "loss": 1.9442, + "step": 4104 + }, + { + "epoch": 1.23, + "grad_norm": 14.859130859375, + "learning_rate": 1.1773078079583042e-05, + "loss": 2.0129, + "step": 4105 + }, + { + "epoch": 1.23, + "grad_norm": 14.485882759094238, + "learning_rate": 1.177107346897865e-05, + "loss": 1.7607, + "step": 4106 + }, + { + "epoch": 1.23, + "grad_norm": 16.90434455871582, + "learning_rate": 1.1769068858374262e-05, + "loss": 1.5387, + "step": 4107 + }, + { + "epoch": 1.24, + "grad_norm": 13.56885051727295, + "learning_rate": 1.1767064247769872e-05, + "loss": 1.2367, + "step": 4108 + }, + { + "epoch": 1.24, + "grad_norm": 14.099568367004395, + "learning_rate": 1.176505963716548e-05, + "loss": 0.9839, + "step": 4109 + }, + { + "epoch": 1.24, + "grad_norm": 22.063432693481445, + "learning_rate": 1.1763055026561093e-05, + "loss": 1.3803, + "step": 4110 + }, + { + "epoch": 1.24, + "grad_norm": 16.2562255859375, + "learning_rate": 1.1761050415956701e-05, + "loss": 1.5013, + "step": 4111 + }, + { + "epoch": 1.24, + "grad_norm": 26.141794204711914, + "learning_rate": 1.1759045805352311e-05, + "loss": 2.49, + "step": 4112 + }, + { + "epoch": 1.24, + "grad_norm": 16.62359619140625, + "learning_rate": 1.175704119474792e-05, + "loss": 0.9543, + "step": 4113 + }, + { + "epoch": 1.24, + "grad_norm": 21.430021286010742, + "learning_rate": 1.1755036584143531e-05, + "loss": 1.2435, + "step": 4114 + }, + { + "epoch": 1.24, + "grad_norm": 15.07040786743164, + "learning_rate": 1.1753031973539141e-05, + "loss": 0.9931, + "step": 4115 + }, + { + "epoch": 1.24, + "grad_norm": 9.915048599243164, + "learning_rate": 1.175102736293475e-05, + "loss": 1.498, + "step": 4116 + }, + { + "epoch": 1.24, + "grad_norm": 16.46501922607422, + "learning_rate": 1.1749022752330362e-05, + "loss": 2.0349, + "step": 4117 + }, + { + "epoch": 1.24, + "grad_norm": 16.73762321472168, + "learning_rate": 1.174701814172597e-05, + "loss": 1.9466, + "step": 4118 + }, + { + "epoch": 1.24, + "grad_norm": 19.883028030395508, + "learning_rate": 1.174501353112158e-05, + "loss": 1.8937, + "step": 4119 + }, + { + "epoch": 1.24, + "grad_norm": 47.894859313964844, + "learning_rate": 1.1743008920517192e-05, + "loss": 1.9303, + "step": 4120 + }, + { + "epoch": 1.24, + "grad_norm": 14.77435302734375, + "learning_rate": 1.17410043099128e-05, + "loss": 1.8005, + "step": 4121 + }, + { + "epoch": 1.24, + "grad_norm": 9.72576904296875, + "learning_rate": 1.173899969930841e-05, + "loss": 1.5267, + "step": 4122 + }, + { + "epoch": 1.24, + "grad_norm": 27.231468200683594, + "learning_rate": 1.173699508870402e-05, + "loss": 1.5438, + "step": 4123 + }, + { + "epoch": 1.24, + "grad_norm": 14.572410583496094, + "learning_rate": 1.173499047809963e-05, + "loss": 1.7702, + "step": 4124 + }, + { + "epoch": 1.24, + "grad_norm": 14.047436714172363, + "learning_rate": 1.1732985867495239e-05, + "loss": 1.8277, + "step": 4125 + }, + { + "epoch": 1.24, + "grad_norm": 20.151954650878906, + "learning_rate": 1.173098125689085e-05, + "loss": 1.7252, + "step": 4126 + }, + { + "epoch": 1.24, + "grad_norm": 13.929177284240723, + "learning_rate": 1.172897664628646e-05, + "loss": 2.2859, + "step": 4127 + }, + { + "epoch": 1.24, + "grad_norm": 16.374547958374023, + "learning_rate": 1.1726972035682069e-05, + "loss": 1.4863, + "step": 4128 + }, + { + "epoch": 1.24, + "grad_norm": 20.026716232299805, + "learning_rate": 1.172496742507768e-05, + "loss": 1.4962, + "step": 4129 + }, + { + "epoch": 1.24, + "grad_norm": 14.383737564086914, + "learning_rate": 1.172296281447329e-05, + "loss": 1.8605, + "step": 4130 + }, + { + "epoch": 1.24, + "grad_norm": 8.625479698181152, + "learning_rate": 1.17209582038689e-05, + "loss": 1.3414, + "step": 4131 + }, + { + "epoch": 1.24, + "grad_norm": 9.765339851379395, + "learning_rate": 1.1718953593264511e-05, + "loss": 2.2172, + "step": 4132 + }, + { + "epoch": 1.24, + "grad_norm": 17.315092086791992, + "learning_rate": 1.171694898266012e-05, + "loss": 1.8639, + "step": 4133 + }, + { + "epoch": 1.24, + "grad_norm": 28.305118560791016, + "learning_rate": 1.171494437205573e-05, + "loss": 1.2452, + "step": 4134 + }, + { + "epoch": 1.24, + "grad_norm": 23.19142723083496, + "learning_rate": 1.1712939761451338e-05, + "loss": 1.5799, + "step": 4135 + }, + { + "epoch": 1.24, + "grad_norm": 38.168418884277344, + "learning_rate": 1.171093515084695e-05, + "loss": 2.4729, + "step": 4136 + }, + { + "epoch": 1.24, + "grad_norm": 12.799455642700195, + "learning_rate": 1.1708930540242558e-05, + "loss": 0.9976, + "step": 4137 + }, + { + "epoch": 1.24, + "grad_norm": 11.151683807373047, + "learning_rate": 1.1706925929638168e-05, + "loss": 2.1855, + "step": 4138 + }, + { + "epoch": 1.24, + "grad_norm": 55.58821487426758, + "learning_rate": 1.170492131903378e-05, + "loss": 1.4985, + "step": 4139 + }, + { + "epoch": 1.24, + "grad_norm": 12.804335594177246, + "learning_rate": 1.1702916708429388e-05, + "loss": 2.2843, + "step": 4140 + }, + { + "epoch": 1.25, + "grad_norm": 18.690269470214844, + "learning_rate": 1.1700912097824998e-05, + "loss": 1.8465, + "step": 4141 + }, + { + "epoch": 1.25, + "grad_norm": 13.506994247436523, + "learning_rate": 1.1698907487220607e-05, + "loss": 1.2933, + "step": 4142 + }, + { + "epoch": 1.25, + "grad_norm": 12.685111999511719, + "learning_rate": 1.1696902876616219e-05, + "loss": 1.1366, + "step": 4143 + }, + { + "epoch": 1.25, + "grad_norm": 51.746055603027344, + "learning_rate": 1.1694898266011827e-05, + "loss": 2.0853, + "step": 4144 + }, + { + "epoch": 1.25, + "grad_norm": 16.30849838256836, + "learning_rate": 1.1692893655407437e-05, + "loss": 1.8595, + "step": 4145 + }, + { + "epoch": 1.25, + "grad_norm": 24.40019989013672, + "learning_rate": 1.1690889044803049e-05, + "loss": 2.4656, + "step": 4146 + }, + { + "epoch": 1.25, + "grad_norm": 30.043943405151367, + "learning_rate": 1.1688884434198657e-05, + "loss": 1.5323, + "step": 4147 + }, + { + "epoch": 1.25, + "grad_norm": 16.770814895629883, + "learning_rate": 1.1686879823594269e-05, + "loss": 2.9425, + "step": 4148 + }, + { + "epoch": 1.25, + "grad_norm": 11.892877578735352, + "learning_rate": 1.1684875212989877e-05, + "loss": 2.216, + "step": 4149 + }, + { + "epoch": 1.25, + "grad_norm": 20.787343978881836, + "learning_rate": 1.1682870602385488e-05, + "loss": 2.4695, + "step": 4150 + }, + { + "epoch": 1.25, + "grad_norm": 23.898902893066406, + "learning_rate": 1.16808659917811e-05, + "loss": 1.5012, + "step": 4151 + }, + { + "epoch": 1.25, + "grad_norm": 24.14310073852539, + "learning_rate": 1.1678861381176708e-05, + "loss": 1.8126, + "step": 4152 + }, + { + "epoch": 1.25, + "grad_norm": 16.704151153564453, + "learning_rate": 1.1676856770572318e-05, + "loss": 2.1569, + "step": 4153 + }, + { + "epoch": 1.25, + "grad_norm": 11.908331871032715, + "learning_rate": 1.1674852159967926e-05, + "loss": 1.5541, + "step": 4154 + }, + { + "epoch": 1.25, + "grad_norm": 14.777883529663086, + "learning_rate": 1.1672847549363538e-05, + "loss": 1.7314, + "step": 4155 + }, + { + "epoch": 1.25, + "grad_norm": 13.375772476196289, + "learning_rate": 1.1670842938759146e-05, + "loss": 1.1609, + "step": 4156 + }, + { + "epoch": 1.25, + "grad_norm": 9.32558536529541, + "learning_rate": 1.1668838328154756e-05, + "loss": 1.6317, + "step": 4157 + }, + { + "epoch": 1.25, + "grad_norm": 14.819250106811523, + "learning_rate": 1.1666833717550368e-05, + "loss": 1.5169, + "step": 4158 + }, + { + "epoch": 1.25, + "grad_norm": 7.862631797790527, + "learning_rate": 1.1664829106945977e-05, + "loss": 1.187, + "step": 4159 + }, + { + "epoch": 1.25, + "grad_norm": 26.258054733276367, + "learning_rate": 1.1662824496341587e-05, + "loss": 2.8375, + "step": 4160 + }, + { + "epoch": 1.25, + "grad_norm": 16.67561912536621, + "learning_rate": 1.1660819885737195e-05, + "loss": 1.5601, + "step": 4161 + }, + { + "epoch": 1.25, + "grad_norm": 22.342544555664062, + "learning_rate": 1.1658815275132807e-05, + "loss": 1.5789, + "step": 4162 + }, + { + "epoch": 1.25, + "grad_norm": 13.550854682922363, + "learning_rate": 1.1656810664528417e-05, + "loss": 2.4655, + "step": 4163 + }, + { + "epoch": 1.25, + "grad_norm": 33.92632293701172, + "learning_rate": 1.1654806053924025e-05, + "loss": 1.5478, + "step": 4164 + }, + { + "epoch": 1.25, + "grad_norm": 10.819427490234375, + "learning_rate": 1.1652801443319637e-05, + "loss": 1.5479, + "step": 4165 + }, + { + "epoch": 1.25, + "grad_norm": 16.504961013793945, + "learning_rate": 1.1650796832715245e-05, + "loss": 1.4122, + "step": 4166 + }, + { + "epoch": 1.25, + "grad_norm": 22.91649055480957, + "learning_rate": 1.1648792222110856e-05, + "loss": 1.9459, + "step": 4167 + }, + { + "epoch": 1.25, + "grad_norm": 9.941856384277344, + "learning_rate": 1.1646787611506466e-05, + "loss": 1.7419, + "step": 4168 + }, + { + "epoch": 1.25, + "grad_norm": 12.914164543151855, + "learning_rate": 1.1644783000902076e-05, + "loss": 1.2351, + "step": 4169 + }, + { + "epoch": 1.25, + "grad_norm": 10.681536674499512, + "learning_rate": 1.1642778390297686e-05, + "loss": 1.7278, + "step": 4170 + }, + { + "epoch": 1.25, + "grad_norm": 17.06873321533203, + "learning_rate": 1.1640773779693296e-05, + "loss": 2.0348, + "step": 4171 + }, + { + "epoch": 1.25, + "grad_norm": 20.173357009887695, + "learning_rate": 1.1638769169088906e-05, + "loss": 1.6212, + "step": 4172 + }, + { + "epoch": 1.25, + "grad_norm": 6.546261310577393, + "learning_rate": 1.1636764558484514e-05, + "loss": 0.4214, + "step": 4173 + }, + { + "epoch": 1.25, + "grad_norm": 27.385225296020508, + "learning_rate": 1.1634759947880126e-05, + "loss": 1.6822, + "step": 4174 + }, + { + "epoch": 1.26, + "grad_norm": 37.10486602783203, + "learning_rate": 1.1632755337275736e-05, + "loss": 1.5469, + "step": 4175 + }, + { + "epoch": 1.26, + "grad_norm": 13.431403160095215, + "learning_rate": 1.1630750726671345e-05, + "loss": 2.7734, + "step": 4176 + }, + { + "epoch": 1.26, + "grad_norm": 17.27809715270996, + "learning_rate": 1.1628746116066956e-05, + "loss": 1.8377, + "step": 4177 + }, + { + "epoch": 1.26, + "grad_norm": 6.829921245574951, + "learning_rate": 1.1626741505462565e-05, + "loss": 1.0531, + "step": 4178 + }, + { + "epoch": 1.26, + "grad_norm": 15.239294052124023, + "learning_rate": 1.1624736894858175e-05, + "loss": 1.8001, + "step": 4179 + }, + { + "epoch": 1.26, + "grad_norm": 12.837756156921387, + "learning_rate": 1.1622732284253783e-05, + "loss": 1.1665, + "step": 4180 + }, + { + "epoch": 1.26, + "grad_norm": 16.143875122070312, + "learning_rate": 1.1620727673649395e-05, + "loss": 0.9885, + "step": 4181 + }, + { + "epoch": 1.26, + "grad_norm": 12.66282844543457, + "learning_rate": 1.1618723063045005e-05, + "loss": 1.219, + "step": 4182 + }, + { + "epoch": 1.26, + "grad_norm": 27.858238220214844, + "learning_rate": 1.1616718452440614e-05, + "loss": 2.2014, + "step": 4183 + }, + { + "epoch": 1.26, + "grad_norm": 21.537065505981445, + "learning_rate": 1.1614713841836225e-05, + "loss": 2.1228, + "step": 4184 + }, + { + "epoch": 1.26, + "grad_norm": 43.390296936035156, + "learning_rate": 1.1612709231231834e-05, + "loss": 2.343, + "step": 4185 + }, + { + "epoch": 1.26, + "grad_norm": 24.188064575195312, + "learning_rate": 1.1610704620627444e-05, + "loss": 1.1875, + "step": 4186 + }, + { + "epoch": 1.26, + "grad_norm": 13.248528480529785, + "learning_rate": 1.1608700010023052e-05, + "loss": 1.3365, + "step": 4187 + }, + { + "epoch": 1.26, + "grad_norm": 26.479053497314453, + "learning_rate": 1.1606695399418664e-05, + "loss": 2.2343, + "step": 4188 + }, + { + "epoch": 1.26, + "grad_norm": 20.167842864990234, + "learning_rate": 1.1604690788814274e-05, + "loss": 1.9832, + "step": 4189 + }, + { + "epoch": 1.26, + "grad_norm": 27.049171447753906, + "learning_rate": 1.1602686178209882e-05, + "loss": 1.6917, + "step": 4190 + }, + { + "epoch": 1.26, + "grad_norm": 7.952903747558594, + "learning_rate": 1.1600681567605494e-05, + "loss": 1.1251, + "step": 4191 + }, + { + "epoch": 1.26, + "grad_norm": 9.99161148071289, + "learning_rate": 1.1598676957001103e-05, + "loss": 1.3309, + "step": 4192 + }, + { + "epoch": 1.26, + "grad_norm": 23.682477951049805, + "learning_rate": 1.1596672346396713e-05, + "loss": 1.7625, + "step": 4193 + }, + { + "epoch": 1.26, + "grad_norm": 14.787646293640137, + "learning_rate": 1.1594667735792324e-05, + "loss": 0.8436, + "step": 4194 + }, + { + "epoch": 1.26, + "grad_norm": 16.3641414642334, + "learning_rate": 1.1592663125187933e-05, + "loss": 1.5916, + "step": 4195 + }, + { + "epoch": 1.26, + "grad_norm": 15.075904846191406, + "learning_rate": 1.1590658514583543e-05, + "loss": 1.5184, + "step": 4196 + }, + { + "epoch": 1.26, + "grad_norm": 13.050864219665527, + "learning_rate": 1.1588653903979153e-05, + "loss": 1.7963, + "step": 4197 + }, + { + "epoch": 1.26, + "grad_norm": 10.601302146911621, + "learning_rate": 1.1586649293374763e-05, + "loss": 1.7489, + "step": 4198 + }, + { + "epoch": 1.26, + "grad_norm": 21.2014217376709, + "learning_rate": 1.1584644682770371e-05, + "loss": 1.6054, + "step": 4199 + }, + { + "epoch": 1.26, + "grad_norm": 17.15383529663086, + "learning_rate": 1.1582640072165983e-05, + "loss": 1.2979, + "step": 4200 + }, + { + "epoch": 1.26, + "eval_loss": 0.2202432006597519, + "eval_runtime": 43.4456, + "eval_samples_per_second": 34.043, + "eval_steps_per_second": 34.043, + "step": 4200 + }, + { + "epoch": 1.26, + "grad_norm": 14.757326126098633, + "learning_rate": 1.1580635461561593e-05, + "loss": 1.03, + "step": 4201 + }, + { + "epoch": 1.26, + "grad_norm": 12.233736038208008, + "learning_rate": 1.1578630850957202e-05, + "loss": 1.4463, + "step": 4202 + }, + { + "epoch": 1.26, + "grad_norm": 17.687862396240234, + "learning_rate": 1.1576626240352814e-05, + "loss": 1.9516, + "step": 4203 + }, + { + "epoch": 1.26, + "grad_norm": 18.68252944946289, + "learning_rate": 1.1574621629748422e-05, + "loss": 2.6357, + "step": 4204 + }, + { + "epoch": 1.26, + "grad_norm": 24.503774642944336, + "learning_rate": 1.1572617019144032e-05, + "loss": 1.4327, + "step": 4205 + }, + { + "epoch": 1.26, + "grad_norm": 14.029136657714844, + "learning_rate": 1.1570612408539644e-05, + "loss": 2.0086, + "step": 4206 + }, + { + "epoch": 1.26, + "grad_norm": 13.784452438354492, + "learning_rate": 1.1568607797935252e-05, + "loss": 2.5528, + "step": 4207 + }, + { + "epoch": 1.27, + "grad_norm": 29.253156661987305, + "learning_rate": 1.1566603187330862e-05, + "loss": 0.5661, + "step": 4208 + }, + { + "epoch": 1.27, + "grad_norm": 26.327823638916016, + "learning_rate": 1.156459857672647e-05, + "loss": 1.8774, + "step": 4209 + }, + { + "epoch": 1.27, + "grad_norm": 50.058536529541016, + "learning_rate": 1.1562593966122082e-05, + "loss": 2.2888, + "step": 4210 + }, + { + "epoch": 1.27, + "grad_norm": 8.850481033325195, + "learning_rate": 1.156058935551769e-05, + "loss": 1.476, + "step": 4211 + }, + { + "epoch": 1.27, + "grad_norm": 12.578634262084961, + "learning_rate": 1.1558584744913301e-05, + "loss": 1.5757, + "step": 4212 + }, + { + "epoch": 1.27, + "grad_norm": 42.25316619873047, + "learning_rate": 1.1556580134308913e-05, + "loss": 1.574, + "step": 4213 + }, + { + "epoch": 1.27, + "grad_norm": 15.227287292480469, + "learning_rate": 1.1554575523704521e-05, + "loss": 2.4708, + "step": 4214 + }, + { + "epoch": 1.27, + "grad_norm": 10.333723068237305, + "learning_rate": 1.1552570913100131e-05, + "loss": 1.6319, + "step": 4215 + }, + { + "epoch": 1.27, + "grad_norm": 9.808292388916016, + "learning_rate": 1.1550566302495741e-05, + "loss": 2.1226, + "step": 4216 + }, + { + "epoch": 1.27, + "grad_norm": 39.865562438964844, + "learning_rate": 1.1548561691891351e-05, + "loss": 2.9798, + "step": 4217 + }, + { + "epoch": 1.27, + "grad_norm": 47.3433723449707, + "learning_rate": 1.154655708128696e-05, + "loss": 1.7633, + "step": 4218 + }, + { + "epoch": 1.27, + "grad_norm": 18.767004013061523, + "learning_rate": 1.1544552470682571e-05, + "loss": 1.0271, + "step": 4219 + }, + { + "epoch": 1.27, + "grad_norm": 5.930094242095947, + "learning_rate": 1.1542547860078182e-05, + "loss": 0.7385, + "step": 4220 + }, + { + "epoch": 1.27, + "grad_norm": 17.10114097595215, + "learning_rate": 1.154054324947379e-05, + "loss": 1.4075, + "step": 4221 + }, + { + "epoch": 1.27, + "grad_norm": 22.982498168945312, + "learning_rate": 1.1538538638869402e-05, + "loss": 2.3147, + "step": 4222 + }, + { + "epoch": 1.27, + "grad_norm": 29.882640838623047, + "learning_rate": 1.153653402826501e-05, + "loss": 1.3968, + "step": 4223 + }, + { + "epoch": 1.27, + "grad_norm": 17.916343688964844, + "learning_rate": 1.153452941766062e-05, + "loss": 1.5234, + "step": 4224 + }, + { + "epoch": 1.27, + "grad_norm": 12.272050857543945, + "learning_rate": 1.1532524807056232e-05, + "loss": 0.895, + "step": 4225 + }, + { + "epoch": 1.27, + "grad_norm": 14.076766967773438, + "learning_rate": 1.153052019645184e-05, + "loss": 1.1994, + "step": 4226 + }, + { + "epoch": 1.27, + "grad_norm": 16.016265869140625, + "learning_rate": 1.152851558584745e-05, + "loss": 1.0423, + "step": 4227 + }, + { + "epoch": 1.27, + "grad_norm": 10.317485809326172, + "learning_rate": 1.1526510975243059e-05, + "loss": 1.3147, + "step": 4228 + }, + { + "epoch": 1.27, + "grad_norm": 12.768622398376465, + "learning_rate": 1.152450636463867e-05, + "loss": 2.2527, + "step": 4229 + }, + { + "epoch": 1.27, + "grad_norm": 15.519412994384766, + "learning_rate": 1.1522501754034279e-05, + "loss": 1.4902, + "step": 4230 + }, + { + "epoch": 1.27, + "grad_norm": 18.289718627929688, + "learning_rate": 1.1520497143429889e-05, + "loss": 2.5218, + "step": 4231 + }, + { + "epoch": 1.27, + "grad_norm": 17.28578758239746, + "learning_rate": 1.1518492532825501e-05, + "loss": 0.8004, + "step": 4232 + }, + { + "epoch": 1.27, + "grad_norm": 23.51228141784668, + "learning_rate": 1.151648792222111e-05, + "loss": 1.8972, + "step": 4233 + }, + { + "epoch": 1.27, + "grad_norm": 12.242810249328613, + "learning_rate": 1.151448331161672e-05, + "loss": 1.0982, + "step": 4234 + }, + { + "epoch": 1.27, + "grad_norm": 16.621652603149414, + "learning_rate": 1.1512478701012328e-05, + "loss": 1.8477, + "step": 4235 + }, + { + "epoch": 1.27, + "grad_norm": 9.91673755645752, + "learning_rate": 1.151047409040794e-05, + "loss": 1.4897, + "step": 4236 + }, + { + "epoch": 1.27, + "grad_norm": 17.222322463989258, + "learning_rate": 1.150846947980355e-05, + "loss": 1.2186, + "step": 4237 + }, + { + "epoch": 1.27, + "grad_norm": 11.030045509338379, + "learning_rate": 1.1506464869199158e-05, + "loss": 1.0489, + "step": 4238 + }, + { + "epoch": 1.27, + "grad_norm": 13.282185554504395, + "learning_rate": 1.150446025859477e-05, + "loss": 2.1427, + "step": 4239 + }, + { + "epoch": 1.27, + "grad_norm": 18.355653762817383, + "learning_rate": 1.1502455647990378e-05, + "loss": 1.5696, + "step": 4240 + }, + { + "epoch": 1.28, + "grad_norm": 15.735917091369629, + "learning_rate": 1.1500451037385988e-05, + "loss": 1.789, + "step": 4241 + }, + { + "epoch": 1.28, + "grad_norm": 28.514938354492188, + "learning_rate": 1.1498446426781598e-05, + "loss": 2.2112, + "step": 4242 + }, + { + "epoch": 1.28, + "grad_norm": 16.573684692382812, + "learning_rate": 1.1496441816177208e-05, + "loss": 1.6606, + "step": 4243 + }, + { + "epoch": 1.28, + "grad_norm": 18.012393951416016, + "learning_rate": 1.1494437205572819e-05, + "loss": 1.3518, + "step": 4244 + }, + { + "epoch": 1.28, + "grad_norm": 21.622119903564453, + "learning_rate": 1.1492432594968429e-05, + "loss": 1.9794, + "step": 4245 + }, + { + "epoch": 1.28, + "grad_norm": 11.884101867675781, + "learning_rate": 1.1490427984364039e-05, + "loss": 1.4142, + "step": 4246 + }, + { + "epoch": 1.28, + "grad_norm": 19.552974700927734, + "learning_rate": 1.1488423373759647e-05, + "loss": 1.0571, + "step": 4247 + }, + { + "epoch": 1.28, + "grad_norm": 17.979076385498047, + "learning_rate": 1.1486418763155259e-05, + "loss": 1.6985, + "step": 4248 + }, + { + "epoch": 1.28, + "grad_norm": 19.238636016845703, + "learning_rate": 1.1484414152550869e-05, + "loss": 1.396, + "step": 4249 + }, + { + "epoch": 1.28, + "grad_norm": 46.26264953613281, + "learning_rate": 1.1482409541946477e-05, + "loss": 2.4083, + "step": 4250 + }, + { + "epoch": 1.28, + "grad_norm": 13.219773292541504, + "learning_rate": 1.1480404931342089e-05, + "loss": 0.8623, + "step": 4251 + }, + { + "epoch": 1.28, + "grad_norm": 9.744564056396484, + "learning_rate": 1.1478400320737697e-05, + "loss": 2.33, + "step": 4252 + }, + { + "epoch": 1.28, + "grad_norm": 17.062484741210938, + "learning_rate": 1.1476395710133308e-05, + "loss": 1.4111, + "step": 4253 + }, + { + "epoch": 1.28, + "grad_norm": 11.308719635009766, + "learning_rate": 1.1474391099528916e-05, + "loss": 1.0592, + "step": 4254 + }, + { + "epoch": 1.28, + "grad_norm": 12.195525169372559, + "learning_rate": 1.1472386488924528e-05, + "loss": 1.8724, + "step": 4255 + }, + { + "epoch": 1.28, + "grad_norm": 13.376428604125977, + "learning_rate": 1.1470381878320138e-05, + "loss": 1.5216, + "step": 4256 + }, + { + "epoch": 1.28, + "grad_norm": 7.64074182510376, + "learning_rate": 1.1468377267715746e-05, + "loss": 0.7377, + "step": 4257 + }, + { + "epoch": 1.28, + "grad_norm": 19.55640983581543, + "learning_rate": 1.1466372657111358e-05, + "loss": 2.3189, + "step": 4258 + }, + { + "epoch": 1.28, + "grad_norm": 202.1298370361328, + "learning_rate": 1.1464368046506966e-05, + "loss": 1.7217, + "step": 4259 + }, + { + "epoch": 1.28, + "grad_norm": 44.232112884521484, + "learning_rate": 1.1462363435902576e-05, + "loss": 1.1155, + "step": 4260 + }, + { + "epoch": 1.28, + "grad_norm": 20.715179443359375, + "learning_rate": 1.1460358825298185e-05, + "loss": 1.5698, + "step": 4261 + }, + { + "epoch": 1.28, + "grad_norm": 16.672870635986328, + "learning_rate": 1.1458354214693797e-05, + "loss": 2.0041, + "step": 4262 + }, + { + "epoch": 1.28, + "grad_norm": 15.357919692993164, + "learning_rate": 1.1456349604089407e-05, + "loss": 1.9595, + "step": 4263 + }, + { + "epoch": 1.28, + "grad_norm": 25.542970657348633, + "learning_rate": 1.1454344993485015e-05, + "loss": 3.4864, + "step": 4264 + }, + { + "epoch": 1.28, + "grad_norm": 20.45473861694336, + "learning_rate": 1.1452340382880627e-05, + "loss": 1.8289, + "step": 4265 + }, + { + "epoch": 1.28, + "grad_norm": 18.233806610107422, + "learning_rate": 1.1450335772276235e-05, + "loss": 1.9064, + "step": 4266 + }, + { + "epoch": 1.28, + "grad_norm": 8.359683990478516, + "learning_rate": 1.1448331161671847e-05, + "loss": 1.201, + "step": 4267 + }, + { + "epoch": 1.28, + "grad_norm": 9.61418628692627, + "learning_rate": 1.1446326551067457e-05, + "loss": 2.0508, + "step": 4268 + }, + { + "epoch": 1.28, + "grad_norm": 12.493593215942383, + "learning_rate": 1.1444321940463066e-05, + "loss": 1.8421, + "step": 4269 + }, + { + "epoch": 1.28, + "grad_norm": 19.759090423583984, + "learning_rate": 1.1442317329858677e-05, + "loss": 2.7833, + "step": 4270 + }, + { + "epoch": 1.28, + "grad_norm": 12.588993072509766, + "learning_rate": 1.1440312719254286e-05, + "loss": 1.1135, + "step": 4271 + }, + { + "epoch": 1.28, + "grad_norm": 22.17607879638672, + "learning_rate": 1.1438308108649896e-05, + "loss": 2.3915, + "step": 4272 + }, + { + "epoch": 1.28, + "grad_norm": 11.478544235229492, + "learning_rate": 1.1436303498045504e-05, + "loss": 0.9938, + "step": 4273 + }, + { + "epoch": 1.29, + "grad_norm": 10.766488075256348, + "learning_rate": 1.1434298887441116e-05, + "loss": 1.0582, + "step": 4274 + }, + { + "epoch": 1.29, + "grad_norm": 20.56522560119629, + "learning_rate": 1.1432294276836726e-05, + "loss": 1.6918, + "step": 4275 + }, + { + "epoch": 1.29, + "grad_norm": 9.447360038757324, + "learning_rate": 1.1430289666232334e-05, + "loss": 1.839, + "step": 4276 + }, + { + "epoch": 1.29, + "grad_norm": 9.256118774414062, + "learning_rate": 1.1428285055627946e-05, + "loss": 1.307, + "step": 4277 + }, + { + "epoch": 1.29, + "grad_norm": 13.01025676727295, + "learning_rate": 1.1426280445023555e-05, + "loss": 1.6272, + "step": 4278 + }, + { + "epoch": 1.29, + "grad_norm": 11.885642051696777, + "learning_rate": 1.1424275834419165e-05, + "loss": 0.5883, + "step": 4279 + }, + { + "epoch": 1.29, + "grad_norm": 13.292377471923828, + "learning_rate": 1.1422271223814776e-05, + "loss": 1.4237, + "step": 4280 + }, + { + "epoch": 1.29, + "grad_norm": 7.443901538848877, + "learning_rate": 1.1420266613210385e-05, + "loss": 0.9989, + "step": 4281 + }, + { + "epoch": 1.29, + "grad_norm": 83.39949035644531, + "learning_rate": 1.1418262002605995e-05, + "loss": 1.8078, + "step": 4282 + }, + { + "epoch": 1.29, + "grad_norm": 12.701087951660156, + "learning_rate": 1.1416257392001603e-05, + "loss": 1.5752, + "step": 4283 + }, + { + "epoch": 1.29, + "grad_norm": 58.16669464111328, + "learning_rate": 1.1414252781397215e-05, + "loss": 2.8532, + "step": 4284 + }, + { + "epoch": 1.29, + "grad_norm": 21.291078567504883, + "learning_rate": 1.1412248170792823e-05, + "loss": 1.2318, + "step": 4285 + }, + { + "epoch": 1.29, + "grad_norm": 18.41106605529785, + "learning_rate": 1.1410243560188434e-05, + "loss": 2.264, + "step": 4286 + }, + { + "epoch": 1.29, + "grad_norm": 20.537443161010742, + "learning_rate": 1.1408238949584045e-05, + "loss": 2.6269, + "step": 4287 + }, + { + "epoch": 1.29, + "grad_norm": 15.127706527709961, + "learning_rate": 1.1406234338979654e-05, + "loss": 1.9341, + "step": 4288 + }, + { + "epoch": 1.29, + "grad_norm": 20.711387634277344, + "learning_rate": 1.1404229728375264e-05, + "loss": 1.7335, + "step": 4289 + }, + { + "epoch": 1.29, + "grad_norm": 50.79159164428711, + "learning_rate": 1.1402225117770874e-05, + "loss": 2.1342, + "step": 4290 + }, + { + "epoch": 1.29, + "grad_norm": 29.805246353149414, + "learning_rate": 1.1400220507166484e-05, + "loss": 2.0907, + "step": 4291 + }, + { + "epoch": 1.29, + "grad_norm": 12.824877738952637, + "learning_rate": 1.1398215896562094e-05, + "loss": 1.1383, + "step": 4292 + }, + { + "epoch": 1.29, + "grad_norm": 7.294422149658203, + "learning_rate": 1.1396211285957704e-05, + "loss": 1.3885, + "step": 4293 + }, + { + "epoch": 1.29, + "grad_norm": 31.934511184692383, + "learning_rate": 1.1394206675353314e-05, + "loss": 2.1207, + "step": 4294 + }, + { + "epoch": 1.29, + "grad_norm": 18.717044830322266, + "learning_rate": 1.1392202064748923e-05, + "loss": 1.4102, + "step": 4295 + }, + { + "epoch": 1.29, + "grad_norm": 14.38543701171875, + "learning_rate": 1.1390197454144534e-05, + "loss": 1.9233, + "step": 4296 + }, + { + "epoch": 1.29, + "grad_norm": 20.979101181030273, + "learning_rate": 1.1388192843540143e-05, + "loss": 1.509, + "step": 4297 + }, + { + "epoch": 1.29, + "grad_norm": 16.605932235717773, + "learning_rate": 1.1386188232935753e-05, + "loss": 1.4354, + "step": 4298 + }, + { + "epoch": 1.29, + "grad_norm": 9.505638122558594, + "learning_rate": 1.1384183622331365e-05, + "loss": 1.1145, + "step": 4299 + }, + { + "epoch": 1.29, + "grad_norm": 15.347785949707031, + "learning_rate": 1.1382179011726973e-05, + "loss": 1.6882, + "step": 4300 + }, + { + "epoch": 1.29, + "grad_norm": 20.905508041381836, + "learning_rate": 1.1380174401122583e-05, + "loss": 1.5622, + "step": 4301 + }, + { + "epoch": 1.29, + "grad_norm": 41.201194763183594, + "learning_rate": 1.1378169790518192e-05, + "loss": 3.4766, + "step": 4302 + }, + { + "epoch": 1.29, + "grad_norm": 15.540892601013184, + "learning_rate": 1.1376165179913803e-05, + "loss": 1.4535, + "step": 4303 + }, + { + "epoch": 1.29, + "grad_norm": 24.336341857910156, + "learning_rate": 1.1374160569309412e-05, + "loss": 1.7003, + "step": 4304 + }, + { + "epoch": 1.29, + "grad_norm": 28.202116012573242, + "learning_rate": 1.1372155958705022e-05, + "loss": 1.2576, + "step": 4305 + }, + { + "epoch": 1.29, + "grad_norm": 12.418293952941895, + "learning_rate": 1.1370151348100634e-05, + "loss": 1.8355, + "step": 4306 + }, + { + "epoch": 1.29, + "grad_norm": 35.69652557373047, + "learning_rate": 1.1368146737496242e-05, + "loss": 2.1658, + "step": 4307 + }, + { + "epoch": 1.3, + "grad_norm": 17.411029815673828, + "learning_rate": 1.1366142126891852e-05, + "loss": 2.4983, + "step": 4308 + }, + { + "epoch": 1.3, + "grad_norm": 10.031928062438965, + "learning_rate": 1.136413751628746e-05, + "loss": 1.7212, + "step": 4309 + }, + { + "epoch": 1.3, + "grad_norm": 11.622937202453613, + "learning_rate": 1.1362132905683072e-05, + "loss": 1.4201, + "step": 4310 + }, + { + "epoch": 1.3, + "grad_norm": 19.42033576965332, + "learning_rate": 1.1360128295078682e-05, + "loss": 1.8876, + "step": 4311 + }, + { + "epoch": 1.3, + "grad_norm": 24.995628356933594, + "learning_rate": 1.135812368447429e-05, + "loss": 1.682, + "step": 4312 + }, + { + "epoch": 1.3, + "grad_norm": 11.356565475463867, + "learning_rate": 1.1356119073869902e-05, + "loss": 1.3858, + "step": 4313 + }, + { + "epoch": 1.3, + "grad_norm": 18.608964920043945, + "learning_rate": 1.1354114463265511e-05, + "loss": 1.2763, + "step": 4314 + }, + { + "epoch": 1.3, + "grad_norm": 16.34697723388672, + "learning_rate": 1.1352109852661121e-05, + "loss": 1.2931, + "step": 4315 + }, + { + "epoch": 1.3, + "grad_norm": 17.903200149536133, + "learning_rate": 1.1350105242056731e-05, + "loss": 1.8466, + "step": 4316 + }, + { + "epoch": 1.3, + "grad_norm": 7.10983419418335, + "learning_rate": 1.1348100631452341e-05, + "loss": 0.967, + "step": 4317 + }, + { + "epoch": 1.3, + "grad_norm": 13.738561630249023, + "learning_rate": 1.1346096020847951e-05, + "loss": 1.4251, + "step": 4318 + }, + { + "epoch": 1.3, + "grad_norm": 14.061918258666992, + "learning_rate": 1.1344091410243561e-05, + "loss": 1.8931, + "step": 4319 + }, + { + "epoch": 1.3, + "grad_norm": 28.247364044189453, + "learning_rate": 1.1342086799639171e-05, + "loss": 2.5285, + "step": 4320 + }, + { + "epoch": 1.3, + "eval_loss": 0.21285276114940643, + "eval_runtime": 43.3572, + "eval_samples_per_second": 34.112, + "eval_steps_per_second": 34.112, + "step": 4320 + }, + { + "epoch": 1.3, + "grad_norm": 11.178879737854004, + "learning_rate": 1.134008218903478e-05, + "loss": 1.5172, + "step": 4321 + }, + { + "epoch": 1.3, + "grad_norm": 16.7074031829834, + "learning_rate": 1.1338077578430392e-05, + "loss": 1.3458, + "step": 4322 + }, + { + "epoch": 1.3, + "grad_norm": 21.143476486206055, + "learning_rate": 1.1336072967826002e-05, + "loss": 2.4362, + "step": 4323 + }, + { + "epoch": 1.3, + "grad_norm": 22.83983039855957, + "learning_rate": 1.133406835722161e-05, + "loss": 2.1729, + "step": 4324 + }, + { + "epoch": 1.3, + "grad_norm": 16.511369705200195, + "learning_rate": 1.1332063746617222e-05, + "loss": 1.3165, + "step": 4325 + }, + { + "epoch": 1.3, + "grad_norm": 16.411027908325195, + "learning_rate": 1.133005913601283e-05, + "loss": 2.1342, + "step": 4326 + }, + { + "epoch": 1.3, + "grad_norm": 16.996347427368164, + "learning_rate": 1.132805452540844e-05, + "loss": 1.9439, + "step": 4327 + }, + { + "epoch": 1.3, + "grad_norm": 27.14801788330078, + "learning_rate": 1.1326049914804049e-05, + "loss": 2.3698, + "step": 4328 + }, + { + "epoch": 1.3, + "grad_norm": 10.485852241516113, + "learning_rate": 1.132404530419966e-05, + "loss": 1.1316, + "step": 4329 + }, + { + "epoch": 1.3, + "grad_norm": 48.32770538330078, + "learning_rate": 1.132204069359527e-05, + "loss": 1.8271, + "step": 4330 + }, + { + "epoch": 1.3, + "grad_norm": 68.1049575805664, + "learning_rate": 1.1320036082990879e-05, + "loss": 2.5344, + "step": 4331 + }, + { + "epoch": 1.3, + "grad_norm": 22.41739273071289, + "learning_rate": 1.131803147238649e-05, + "loss": 2.1376, + "step": 4332 + }, + { + "epoch": 1.3, + "grad_norm": 61.55850601196289, + "learning_rate": 1.1316026861782099e-05, + "loss": 3.21, + "step": 4333 + }, + { + "epoch": 1.3, + "grad_norm": 13.802007675170898, + "learning_rate": 1.1314022251177709e-05, + "loss": 1.7577, + "step": 4334 + }, + { + "epoch": 1.3, + "grad_norm": 16.4052734375, + "learning_rate": 1.1312017640573321e-05, + "loss": 1.2041, + "step": 4335 + }, + { + "epoch": 1.3, + "grad_norm": 26.579936981201172, + "learning_rate": 1.131001302996893e-05, + "loss": 1.6785, + "step": 4336 + }, + { + "epoch": 1.3, + "grad_norm": 24.084991455078125, + "learning_rate": 1.130800841936454e-05, + "loss": 1.5441, + "step": 4337 + }, + { + "epoch": 1.3, + "grad_norm": 21.47726058959961, + "learning_rate": 1.130600380876015e-05, + "loss": 1.9604, + "step": 4338 + }, + { + "epoch": 1.3, + "grad_norm": 38.121421813964844, + "learning_rate": 1.130399919815576e-05, + "loss": 2.2498, + "step": 4339 + }, + { + "epoch": 1.3, + "grad_norm": 23.063236236572266, + "learning_rate": 1.1301994587551368e-05, + "loss": 1.8673, + "step": 4340 + }, + { + "epoch": 1.31, + "grad_norm": 11.506025314331055, + "learning_rate": 1.129998997694698e-05, + "loss": 1.3873, + "step": 4341 + }, + { + "epoch": 1.31, + "grad_norm": 11.516486167907715, + "learning_rate": 1.129798536634259e-05, + "loss": 1.2646, + "step": 4342 + }, + { + "epoch": 1.31, + "grad_norm": 15.611605644226074, + "learning_rate": 1.1295980755738198e-05, + "loss": 1.2307, + "step": 4343 + }, + { + "epoch": 1.31, + "grad_norm": 17.53089141845703, + "learning_rate": 1.129397614513381e-05, + "loss": 1.2414, + "step": 4344 + }, + { + "epoch": 1.31, + "grad_norm": 14.397602081298828, + "learning_rate": 1.1291971534529418e-05, + "loss": 1.3744, + "step": 4345 + }, + { + "epoch": 1.31, + "grad_norm": 23.23992156982422, + "learning_rate": 1.1289966923925028e-05, + "loss": 1.8092, + "step": 4346 + }, + { + "epoch": 1.31, + "grad_norm": 15.50413990020752, + "learning_rate": 1.1287962313320637e-05, + "loss": 1.819, + "step": 4347 + }, + { + "epoch": 1.31, + "grad_norm": 12.4719877243042, + "learning_rate": 1.1285957702716249e-05, + "loss": 0.959, + "step": 4348 + }, + { + "epoch": 1.31, + "grad_norm": 8.75976276397705, + "learning_rate": 1.1283953092111859e-05, + "loss": 1.0429, + "step": 4349 + }, + { + "epoch": 1.31, + "grad_norm": 34.667945861816406, + "learning_rate": 1.1281948481507467e-05, + "loss": 2.2953, + "step": 4350 + }, + { + "epoch": 1.31, + "grad_norm": 15.772024154663086, + "learning_rate": 1.1279943870903079e-05, + "loss": 1.67, + "step": 4351 + }, + { + "epoch": 1.31, + "grad_norm": 19.514604568481445, + "learning_rate": 1.1277939260298687e-05, + "loss": 2.0143, + "step": 4352 + }, + { + "epoch": 1.31, + "grad_norm": 22.16571807861328, + "learning_rate": 1.1275934649694297e-05, + "loss": 1.2593, + "step": 4353 + }, + { + "epoch": 1.31, + "grad_norm": 17.157798767089844, + "learning_rate": 1.1273930039089909e-05, + "loss": 1.6187, + "step": 4354 + }, + { + "epoch": 1.31, + "grad_norm": 33.16340637207031, + "learning_rate": 1.1271925428485518e-05, + "loss": 2.0045, + "step": 4355 + }, + { + "epoch": 1.31, + "grad_norm": 14.194746971130371, + "learning_rate": 1.1269920817881128e-05, + "loss": 1.671, + "step": 4356 + }, + { + "epoch": 1.31, + "grad_norm": 15.4343900680542, + "learning_rate": 1.1267916207276736e-05, + "loss": 1.2274, + "step": 4357 + }, + { + "epoch": 1.31, + "grad_norm": 21.623807907104492, + "learning_rate": 1.1265911596672348e-05, + "loss": 2.5802, + "step": 4358 + }, + { + "epoch": 1.31, + "grad_norm": 12.904182434082031, + "learning_rate": 1.1263906986067956e-05, + "loss": 1.4189, + "step": 4359 + }, + { + "epoch": 1.31, + "grad_norm": 16.601791381835938, + "learning_rate": 1.1261902375463566e-05, + "loss": 2.787, + "step": 4360 + }, + { + "epoch": 1.31, + "grad_norm": 12.915867805480957, + "learning_rate": 1.1259897764859178e-05, + "loss": 1.5663, + "step": 4361 + }, + { + "epoch": 1.31, + "grad_norm": 17.556509017944336, + "learning_rate": 1.1257893154254786e-05, + "loss": 1.8852, + "step": 4362 + }, + { + "epoch": 1.31, + "grad_norm": 23.893291473388672, + "learning_rate": 1.1255888543650397e-05, + "loss": 1.4539, + "step": 4363 + }, + { + "epoch": 1.31, + "grad_norm": 11.98607063293457, + "learning_rate": 1.1253883933046007e-05, + "loss": 1.7452, + "step": 4364 + }, + { + "epoch": 1.31, + "grad_norm": 13.887165069580078, + "learning_rate": 1.1251879322441617e-05, + "loss": 1.6717, + "step": 4365 + }, + { + "epoch": 1.31, + "grad_norm": 7.588750839233398, + "learning_rate": 1.1249874711837227e-05, + "loss": 0.8518, + "step": 4366 + }, + { + "epoch": 1.31, + "grad_norm": 27.59679412841797, + "learning_rate": 1.1247870101232837e-05, + "loss": 2.483, + "step": 4367 + }, + { + "epoch": 1.31, + "grad_norm": 19.16243553161621, + "learning_rate": 1.1245865490628447e-05, + "loss": 1.4322, + "step": 4368 + }, + { + "epoch": 1.31, + "grad_norm": 16.66512107849121, + "learning_rate": 1.1243860880024055e-05, + "loss": 1.9586, + "step": 4369 + }, + { + "epoch": 1.31, + "grad_norm": 17.893173217773438, + "learning_rate": 1.1241856269419667e-05, + "loss": 2.8739, + "step": 4370 + }, + { + "epoch": 1.31, + "grad_norm": 16.915111541748047, + "learning_rate": 1.1239851658815275e-05, + "loss": 1.3351, + "step": 4371 + }, + { + "epoch": 1.31, + "grad_norm": 20.749107360839844, + "learning_rate": 1.1237847048210886e-05, + "loss": 2.3157, + "step": 4372 + }, + { + "epoch": 1.31, + "grad_norm": 33.895973205566406, + "learning_rate": 1.1235842437606497e-05, + "loss": 3.2345, + "step": 4373 + }, + { + "epoch": 1.32, + "grad_norm": 16.17300033569336, + "learning_rate": 1.1233837827002106e-05, + "loss": 1.0639, + "step": 4374 + }, + { + "epoch": 1.32, + "grad_norm": 11.130797386169434, + "learning_rate": 1.1231833216397716e-05, + "loss": 2.0059, + "step": 4375 + }, + { + "epoch": 1.32, + "grad_norm": 12.532829284667969, + "learning_rate": 1.1229828605793324e-05, + "loss": 1.1101, + "step": 4376 + }, + { + "epoch": 1.32, + "grad_norm": 10.261481285095215, + "learning_rate": 1.1227823995188936e-05, + "loss": 1.7796, + "step": 4377 + }, + { + "epoch": 1.32, + "grad_norm": 18.463781356811523, + "learning_rate": 1.1225819384584546e-05, + "loss": 1.9479, + "step": 4378 + }, + { + "epoch": 1.32, + "grad_norm": 12.4391450881958, + "learning_rate": 1.1223814773980154e-05, + "loss": 2.0404, + "step": 4379 + }, + { + "epoch": 1.32, + "grad_norm": 15.823043823242188, + "learning_rate": 1.1221810163375766e-05, + "loss": 1.4798, + "step": 4380 + }, + { + "epoch": 1.32, + "grad_norm": 19.075504302978516, + "learning_rate": 1.1219805552771375e-05, + "loss": 1.712, + "step": 4381 + }, + { + "epoch": 1.32, + "grad_norm": 13.849617958068848, + "learning_rate": 1.1217800942166985e-05, + "loss": 1.8711, + "step": 4382 + }, + { + "epoch": 1.32, + "grad_norm": 17.942249298095703, + "learning_rate": 1.1215796331562593e-05, + "loss": 2.6901, + "step": 4383 + }, + { + "epoch": 1.32, + "grad_norm": 9.185188293457031, + "learning_rate": 1.1213791720958205e-05, + "loss": 1.0339, + "step": 4384 + }, + { + "epoch": 1.32, + "grad_norm": 12.996100425720215, + "learning_rate": 1.1211787110353815e-05, + "loss": 1.7259, + "step": 4385 + }, + { + "epoch": 1.32, + "grad_norm": 31.83272933959961, + "learning_rate": 1.1209782499749425e-05, + "loss": 2.1627, + "step": 4386 + }, + { + "epoch": 1.32, + "grad_norm": 11.160262107849121, + "learning_rate": 1.1207777889145035e-05, + "loss": 1.7325, + "step": 4387 + }, + { + "epoch": 1.32, + "grad_norm": 19.457317352294922, + "learning_rate": 1.1205773278540644e-05, + "loss": 1.6398, + "step": 4388 + }, + { + "epoch": 1.32, + "grad_norm": 15.514799118041992, + "learning_rate": 1.1203768667936255e-05, + "loss": 1.5179, + "step": 4389 + }, + { + "epoch": 1.32, + "grad_norm": 55.15019607543945, + "learning_rate": 1.1201764057331864e-05, + "loss": 4.2946, + "step": 4390 + }, + { + "epoch": 1.32, + "grad_norm": 17.135194778442383, + "learning_rate": 1.1199759446727474e-05, + "loss": 2.0419, + "step": 4391 + }, + { + "epoch": 1.32, + "grad_norm": 11.672941207885742, + "learning_rate": 1.1197754836123086e-05, + "loss": 1.4943, + "step": 4392 + }, + { + "epoch": 1.32, + "grad_norm": 27.473955154418945, + "learning_rate": 1.1195750225518694e-05, + "loss": 2.4105, + "step": 4393 + }, + { + "epoch": 1.32, + "grad_norm": 12.319040298461914, + "learning_rate": 1.1193745614914304e-05, + "loss": 1.1325, + "step": 4394 + }, + { + "epoch": 1.32, + "grad_norm": 19.93411636352539, + "learning_rate": 1.1191741004309912e-05, + "loss": 2.4709, + "step": 4395 + }, + { + "epoch": 1.32, + "grad_norm": 25.42889976501465, + "learning_rate": 1.1189736393705524e-05, + "loss": 2.7937, + "step": 4396 + }, + { + "epoch": 1.32, + "grad_norm": 31.769067764282227, + "learning_rate": 1.1187731783101134e-05, + "loss": 2.7676, + "step": 4397 + }, + { + "epoch": 1.32, + "grad_norm": 21.329421997070312, + "learning_rate": 1.1185727172496743e-05, + "loss": 2.0521, + "step": 4398 + }, + { + "epoch": 1.32, + "grad_norm": 17.158628463745117, + "learning_rate": 1.1183722561892354e-05, + "loss": 1.5289, + "step": 4399 + }, + { + "epoch": 1.32, + "grad_norm": 13.490479469299316, + "learning_rate": 1.1181717951287963e-05, + "loss": 1.4183, + "step": 4400 + }, + { + "epoch": 1.32, + "grad_norm": 8.754962921142578, + "learning_rate": 1.1179713340683573e-05, + "loss": 1.2418, + "step": 4401 + }, + { + "epoch": 1.32, + "grad_norm": 11.73357105255127, + "learning_rate": 1.1177708730079181e-05, + "loss": 1.3139, + "step": 4402 + }, + { + "epoch": 1.32, + "grad_norm": 13.230740547180176, + "learning_rate": 1.1175704119474793e-05, + "loss": 1.6772, + "step": 4403 + }, + { + "epoch": 1.32, + "grad_norm": 17.465307235717773, + "learning_rate": 1.1173699508870403e-05, + "loss": 1.6562, + "step": 4404 + }, + { + "epoch": 1.32, + "grad_norm": 32.38056945800781, + "learning_rate": 1.1171694898266012e-05, + "loss": 2.3572, + "step": 4405 + }, + { + "epoch": 1.32, + "grad_norm": 14.467717170715332, + "learning_rate": 1.1169690287661623e-05, + "loss": 1.2874, + "step": 4406 + }, + { + "epoch": 1.33, + "grad_norm": 17.778472900390625, + "learning_rate": 1.1167685677057232e-05, + "loss": 1.9954, + "step": 4407 + }, + { + "epoch": 1.33, + "grad_norm": 8.146272659301758, + "learning_rate": 1.1165681066452842e-05, + "loss": 0.6304, + "step": 4408 + }, + { + "epoch": 1.33, + "grad_norm": 18.66156005859375, + "learning_rate": 1.1163676455848454e-05, + "loss": 2.0366, + "step": 4409 + }, + { + "epoch": 1.33, + "grad_norm": 13.149731636047363, + "learning_rate": 1.1161671845244062e-05, + "loss": 1.5874, + "step": 4410 + }, + { + "epoch": 1.33, + "grad_norm": 15.526867866516113, + "learning_rate": 1.1159667234639672e-05, + "loss": 1.4642, + "step": 4411 + }, + { + "epoch": 1.33, + "grad_norm": 27.83363151550293, + "learning_rate": 1.1157662624035282e-05, + "loss": 2.0596, + "step": 4412 + }, + { + "epoch": 1.33, + "grad_norm": 82.84793090820312, + "learning_rate": 1.1155658013430892e-05, + "loss": 1.8469, + "step": 4413 + }, + { + "epoch": 1.33, + "grad_norm": 16.59908676147461, + "learning_rate": 1.11536534028265e-05, + "loss": 1.9362, + "step": 4414 + }, + { + "epoch": 1.33, + "grad_norm": 8.759167671203613, + "learning_rate": 1.1151648792222112e-05, + "loss": 2.0726, + "step": 4415 + }, + { + "epoch": 1.33, + "grad_norm": 32.26197814941406, + "learning_rate": 1.1149644181617723e-05, + "loss": 2.0811, + "step": 4416 + }, + { + "epoch": 1.33, + "grad_norm": 8.401446342468262, + "learning_rate": 1.1147639571013331e-05, + "loss": 1.3336, + "step": 4417 + }, + { + "epoch": 1.33, + "grad_norm": 10.307430267333984, + "learning_rate": 1.1145634960408943e-05, + "loss": 1.1188, + "step": 4418 + }, + { + "epoch": 1.33, + "grad_norm": 16.372766494750977, + "learning_rate": 1.1143630349804551e-05, + "loss": 1.8886, + "step": 4419 + }, + { + "epoch": 1.33, + "grad_norm": 39.85802459716797, + "learning_rate": 1.1141625739200161e-05, + "loss": 2.3996, + "step": 4420 + }, + { + "epoch": 1.33, + "grad_norm": 14.044843673706055, + "learning_rate": 1.1139621128595773e-05, + "loss": 1.4335, + "step": 4421 + }, + { + "epoch": 1.33, + "grad_norm": 23.040878295898438, + "learning_rate": 1.1137616517991381e-05, + "loss": 1.8451, + "step": 4422 + }, + { + "epoch": 1.33, + "grad_norm": 17.075387954711914, + "learning_rate": 1.1135611907386991e-05, + "loss": 2.2248, + "step": 4423 + }, + { + "epoch": 1.33, + "grad_norm": 110.57589721679688, + "learning_rate": 1.11336072967826e-05, + "loss": 1.5452, + "step": 4424 + }, + { + "epoch": 1.33, + "grad_norm": 15.424068450927734, + "learning_rate": 1.1131602686178212e-05, + "loss": 1.5217, + "step": 4425 + }, + { + "epoch": 1.33, + "grad_norm": 23.823379516601562, + "learning_rate": 1.112959807557382e-05, + "loss": 1.2601, + "step": 4426 + }, + { + "epoch": 1.33, + "grad_norm": 10.928523063659668, + "learning_rate": 1.112759346496943e-05, + "loss": 1.4933, + "step": 4427 + }, + { + "epoch": 1.33, + "grad_norm": 13.319860458374023, + "learning_rate": 1.1125588854365042e-05, + "loss": 1.4411, + "step": 4428 + }, + { + "epoch": 1.33, + "grad_norm": 11.283712387084961, + "learning_rate": 1.112358424376065e-05, + "loss": 2.1932, + "step": 4429 + }, + { + "epoch": 1.33, + "grad_norm": 7.913817882537842, + "learning_rate": 1.112157963315626e-05, + "loss": 1.3824, + "step": 4430 + }, + { + "epoch": 1.33, + "grad_norm": 10.032218933105469, + "learning_rate": 1.1119575022551869e-05, + "loss": 1.2246, + "step": 4431 + }, + { + "epoch": 1.33, + "grad_norm": 25.357025146484375, + "learning_rate": 1.111757041194748e-05, + "loss": 1.7075, + "step": 4432 + }, + { + "epoch": 1.33, + "grad_norm": 16.777484893798828, + "learning_rate": 1.1115565801343089e-05, + "loss": 1.6018, + "step": 4433 + }, + { + "epoch": 1.33, + "grad_norm": 27.090877532958984, + "learning_rate": 1.1113561190738699e-05, + "loss": 2.4302, + "step": 4434 + }, + { + "epoch": 1.33, + "grad_norm": 24.513057708740234, + "learning_rate": 1.111155658013431e-05, + "loss": 2.2252, + "step": 4435 + }, + { + "epoch": 1.33, + "grad_norm": 20.253862380981445, + "learning_rate": 1.1109551969529919e-05, + "loss": 1.4782, + "step": 4436 + }, + { + "epoch": 1.33, + "grad_norm": 10.198271751403809, + "learning_rate": 1.1107547358925531e-05, + "loss": 2.0537, + "step": 4437 + }, + { + "epoch": 1.33, + "grad_norm": 16.26251792907715, + "learning_rate": 1.110554274832114e-05, + "loss": 1.6344, + "step": 4438 + }, + { + "epoch": 1.33, + "grad_norm": 17.53883934020996, + "learning_rate": 1.110353813771675e-05, + "loss": 1.4399, + "step": 4439 + }, + { + "epoch": 1.33, + "grad_norm": 9.351251602172852, + "learning_rate": 1.1101533527112361e-05, + "loss": 1.2029, + "step": 4440 + }, + { + "epoch": 1.33, + "eval_loss": 0.22466596961021423, + "eval_runtime": 43.3151, + "eval_samples_per_second": 34.145, + "eval_steps_per_second": 34.145, + "step": 4440 + }, + { + "epoch": 1.34, + "grad_norm": 18.78108787536621, + "learning_rate": 1.109952891650797e-05, + "loss": 1.8896, + "step": 4441 + }, + { + "epoch": 1.34, + "grad_norm": 14.223852157592773, + "learning_rate": 1.109752430590358e-05, + "loss": 1.8473, + "step": 4442 + }, + { + "epoch": 1.34, + "grad_norm": 29.34437370300293, + "learning_rate": 1.1095519695299188e-05, + "loss": 1.8663, + "step": 4443 + }, + { + "epoch": 1.34, + "grad_norm": 11.203985214233398, + "learning_rate": 1.10935150846948e-05, + "loss": 1.4461, + "step": 4444 + }, + { + "epoch": 1.34, + "grad_norm": 13.125014305114746, + "learning_rate": 1.1091510474090408e-05, + "loss": 1.3763, + "step": 4445 + }, + { + "epoch": 1.34, + "grad_norm": 17.99578094482422, + "learning_rate": 1.1089505863486018e-05, + "loss": 2.1525, + "step": 4446 + }, + { + "epoch": 1.34, + "grad_norm": 36.31547927856445, + "learning_rate": 1.108750125288163e-05, + "loss": 2.2111, + "step": 4447 + }, + { + "epoch": 1.34, + "grad_norm": 13.750075340270996, + "learning_rate": 1.1085496642277238e-05, + "loss": 1.3206, + "step": 4448 + }, + { + "epoch": 1.34, + "grad_norm": 15.520805358886719, + "learning_rate": 1.1083492031672849e-05, + "loss": 3.3127, + "step": 4449 + }, + { + "epoch": 1.34, + "grad_norm": 16.05813217163086, + "learning_rate": 1.1081487421068457e-05, + "loss": 1.5862, + "step": 4450 + }, + { + "epoch": 1.34, + "grad_norm": 18.96892738342285, + "learning_rate": 1.1079482810464069e-05, + "loss": 1.4905, + "step": 4451 + }, + { + "epoch": 1.34, + "grad_norm": 11.585920333862305, + "learning_rate": 1.1077478199859679e-05, + "loss": 2.2658, + "step": 4452 + }, + { + "epoch": 1.34, + "grad_norm": 21.121341705322266, + "learning_rate": 1.1075473589255287e-05, + "loss": 0.8119, + "step": 4453 + }, + { + "epoch": 1.34, + "grad_norm": 39.30922317504883, + "learning_rate": 1.1073468978650899e-05, + "loss": 2.4467, + "step": 4454 + }, + { + "epoch": 1.34, + "grad_norm": 15.98094654083252, + "learning_rate": 1.1071464368046507e-05, + "loss": 2.3305, + "step": 4455 + }, + { + "epoch": 1.34, + "grad_norm": 9.627312660217285, + "learning_rate": 1.1069459757442117e-05, + "loss": 1.2344, + "step": 4456 + }, + { + "epoch": 1.34, + "grad_norm": 14.245975494384766, + "learning_rate": 1.1067455146837727e-05, + "loss": 0.9343, + "step": 4457 + }, + { + "epoch": 1.34, + "grad_norm": 29.139923095703125, + "learning_rate": 1.1065450536233338e-05, + "loss": 2.0948, + "step": 4458 + }, + { + "epoch": 1.34, + "grad_norm": 23.93746566772461, + "learning_rate": 1.1063445925628948e-05, + "loss": 1.8753, + "step": 4459 + }, + { + "epoch": 1.34, + "grad_norm": 8.223407745361328, + "learning_rate": 1.1061441315024558e-05, + "loss": 1.0049, + "step": 4460 + }, + { + "epoch": 1.34, + "grad_norm": 28.680850982666016, + "learning_rate": 1.1059436704420168e-05, + "loss": 1.8643, + "step": 4461 + }, + { + "epoch": 1.34, + "grad_norm": 27.831514358520508, + "learning_rate": 1.1057432093815776e-05, + "loss": 1.8748, + "step": 4462 + }, + { + "epoch": 1.34, + "grad_norm": 17.656972885131836, + "learning_rate": 1.1055427483211388e-05, + "loss": 1.8081, + "step": 4463 + }, + { + "epoch": 1.34, + "grad_norm": 12.41053581237793, + "learning_rate": 1.1053422872606998e-05, + "loss": 1.6312, + "step": 4464 + }, + { + "epoch": 1.34, + "grad_norm": 14.747664451599121, + "learning_rate": 1.1051418262002606e-05, + "loss": 1.8963, + "step": 4465 + }, + { + "epoch": 1.34, + "grad_norm": 20.91558074951172, + "learning_rate": 1.1049413651398218e-05, + "loss": 1.0479, + "step": 4466 + }, + { + "epoch": 1.34, + "grad_norm": 18.06478500366211, + "learning_rate": 1.1047409040793827e-05, + "loss": 1.513, + "step": 4467 + }, + { + "epoch": 1.34, + "grad_norm": 25.323034286499023, + "learning_rate": 1.1045404430189437e-05, + "loss": 2.1898, + "step": 4468 + }, + { + "epoch": 1.34, + "grad_norm": 10.666196823120117, + "learning_rate": 1.1043399819585045e-05, + "loss": 1.6848, + "step": 4469 + }, + { + "epoch": 1.34, + "grad_norm": 13.26004695892334, + "learning_rate": 1.1041395208980657e-05, + "loss": 1.3236, + "step": 4470 + }, + { + "epoch": 1.34, + "grad_norm": 11.206985473632812, + "learning_rate": 1.1039390598376267e-05, + "loss": 1.2008, + "step": 4471 + }, + { + "epoch": 1.34, + "grad_norm": 22.27272605895996, + "learning_rate": 1.1037385987771875e-05, + "loss": 1.5483, + "step": 4472 + }, + { + "epoch": 1.34, + "grad_norm": 13.044824600219727, + "learning_rate": 1.1035381377167487e-05, + "loss": 1.4811, + "step": 4473 + }, + { + "epoch": 1.35, + "grad_norm": 15.387162208557129, + "learning_rate": 1.1033376766563096e-05, + "loss": 1.7151, + "step": 4474 + }, + { + "epoch": 1.35, + "grad_norm": 20.35818862915039, + "learning_rate": 1.1031372155958706e-05, + "loss": 1.6009, + "step": 4475 + }, + { + "epoch": 1.35, + "grad_norm": 12.326359748840332, + "learning_rate": 1.1029367545354314e-05, + "loss": 1.7638, + "step": 4476 + }, + { + "epoch": 1.35, + "grad_norm": 29.4976863861084, + "learning_rate": 1.1027362934749926e-05, + "loss": 1.8569, + "step": 4477 + }, + { + "epoch": 1.35, + "grad_norm": 11.03281307220459, + "learning_rate": 1.1025358324145536e-05, + "loss": 1.0993, + "step": 4478 + }, + { + "epoch": 1.35, + "grad_norm": 20.460525512695312, + "learning_rate": 1.1023353713541144e-05, + "loss": 1.4282, + "step": 4479 + }, + { + "epoch": 1.35, + "grad_norm": 11.63111400604248, + "learning_rate": 1.1021349102936756e-05, + "loss": 2.2856, + "step": 4480 + }, + { + "epoch": 1.35, + "grad_norm": 17.048723220825195, + "learning_rate": 1.1019344492332364e-05, + "loss": 2.037, + "step": 4481 + }, + { + "epoch": 1.35, + "grad_norm": 7.7821502685546875, + "learning_rate": 1.1017339881727975e-05, + "loss": 1.2083, + "step": 4482 + }, + { + "epoch": 1.35, + "grad_norm": 16.613143920898438, + "learning_rate": 1.1015335271123586e-05, + "loss": 1.3636, + "step": 4483 + }, + { + "epoch": 1.35, + "grad_norm": 15.978687286376953, + "learning_rate": 1.1013330660519195e-05, + "loss": 1.5447, + "step": 4484 + }, + { + "epoch": 1.35, + "grad_norm": 12.686722755432129, + "learning_rate": 1.1011326049914805e-05, + "loss": 1.308, + "step": 4485 + }, + { + "epoch": 1.35, + "grad_norm": 15.303074836730957, + "learning_rate": 1.1009321439310415e-05, + "loss": 2.4245, + "step": 4486 + }, + { + "epoch": 1.35, + "grad_norm": 12.841423988342285, + "learning_rate": 1.1007316828706025e-05, + "loss": 2.0611, + "step": 4487 + }, + { + "epoch": 1.35, + "grad_norm": 14.324323654174805, + "learning_rate": 1.1005312218101633e-05, + "loss": 1.9151, + "step": 4488 + }, + { + "epoch": 1.35, + "grad_norm": 10.316316604614258, + "learning_rate": 1.1003307607497245e-05, + "loss": 1.8989, + "step": 4489 + }, + { + "epoch": 1.35, + "grad_norm": 11.872175216674805, + "learning_rate": 1.1001302996892855e-05, + "loss": 1.4971, + "step": 4490 + }, + { + "epoch": 1.35, + "grad_norm": 13.584701538085938, + "learning_rate": 1.0999298386288464e-05, + "loss": 1.6093, + "step": 4491 + }, + { + "epoch": 1.35, + "grad_norm": 23.555721282958984, + "learning_rate": 1.0997293775684075e-05, + "loss": 1.4708, + "step": 4492 + }, + { + "epoch": 1.35, + "grad_norm": 13.753824234008789, + "learning_rate": 1.0995289165079684e-05, + "loss": 1.4468, + "step": 4493 + }, + { + "epoch": 1.35, + "grad_norm": 23.030887603759766, + "learning_rate": 1.0993284554475294e-05, + "loss": 1.9327, + "step": 4494 + }, + { + "epoch": 1.35, + "grad_norm": 43.81611251831055, + "learning_rate": 1.0991279943870906e-05, + "loss": 2.3426, + "step": 4495 + }, + { + "epoch": 1.35, + "grad_norm": 22.91530418395996, + "learning_rate": 1.0989275333266514e-05, + "loss": 2.1606, + "step": 4496 + }, + { + "epoch": 1.35, + "grad_norm": 30.3094425201416, + "learning_rate": 1.0987270722662124e-05, + "loss": 2.3948, + "step": 4497 + }, + { + "epoch": 1.35, + "grad_norm": 90.9547348022461, + "learning_rate": 1.0985266112057732e-05, + "loss": 2.5067, + "step": 4498 + }, + { + "epoch": 1.35, + "grad_norm": 11.258960723876953, + "learning_rate": 1.0983261501453344e-05, + "loss": 2.0156, + "step": 4499 + }, + { + "epoch": 1.35, + "grad_norm": 66.39472961425781, + "learning_rate": 1.0981256890848953e-05, + "loss": 2.6879, + "step": 4500 + }, + { + "epoch": 1.35, + "grad_norm": 20.91155242919922, + "learning_rate": 1.0979252280244563e-05, + "loss": 1.7934, + "step": 4501 + }, + { + "epoch": 1.35, + "grad_norm": 18.402379989624023, + "learning_rate": 1.0977247669640175e-05, + "loss": 1.3445, + "step": 4502 + }, + { + "epoch": 1.35, + "grad_norm": 15.267261505126953, + "learning_rate": 1.0975243059035783e-05, + "loss": 1.642, + "step": 4503 + }, + { + "epoch": 1.35, + "grad_norm": 50.35695266723633, + "learning_rate": 1.0973238448431393e-05, + "loss": 2.0173, + "step": 4504 + }, + { + "epoch": 1.35, + "grad_norm": 10.630228042602539, + "learning_rate": 1.0971233837827003e-05, + "loss": 1.9372, + "step": 4505 + }, + { + "epoch": 1.35, + "grad_norm": 32.916053771972656, + "learning_rate": 1.0969229227222613e-05, + "loss": 2.2776, + "step": 4506 + }, + { + "epoch": 1.36, + "grad_norm": 22.993738174438477, + "learning_rate": 1.0967224616618223e-05, + "loss": 2.2662, + "step": 4507 + }, + { + "epoch": 1.36, + "grad_norm": 13.480685234069824, + "learning_rate": 1.0965220006013833e-05, + "loss": 1.4548, + "step": 4508 + }, + { + "epoch": 1.36, + "grad_norm": 22.65932273864746, + "learning_rate": 1.0963215395409443e-05, + "loss": 2.1417, + "step": 4509 + }, + { + "epoch": 1.36, + "grad_norm": 9.409554481506348, + "learning_rate": 1.0961210784805052e-05, + "loss": 1.1043, + "step": 4510 + }, + { + "epoch": 1.36, + "grad_norm": 51.40016555786133, + "learning_rate": 1.0959206174200664e-05, + "loss": 2.5073, + "step": 4511 + }, + { + "epoch": 1.36, + "grad_norm": 32.164634704589844, + "learning_rate": 1.0957201563596272e-05, + "loss": 1.7867, + "step": 4512 + }, + { + "epoch": 1.36, + "grad_norm": 29.420494079589844, + "learning_rate": 1.0955196952991882e-05, + "loss": 1.7466, + "step": 4513 + }, + { + "epoch": 1.36, + "grad_norm": 10.119473457336426, + "learning_rate": 1.0953192342387494e-05, + "loss": 1.3604, + "step": 4514 + }, + { + "epoch": 1.36, + "grad_norm": 10.170442581176758, + "learning_rate": 1.0951187731783102e-05, + "loss": 1.2242, + "step": 4515 + }, + { + "epoch": 1.36, + "grad_norm": 11.852310180664062, + "learning_rate": 1.0949183121178712e-05, + "loss": 1.2755, + "step": 4516 + }, + { + "epoch": 1.36, + "grad_norm": 14.793402671813965, + "learning_rate": 1.094717851057432e-05, + "loss": 1.752, + "step": 4517 + }, + { + "epoch": 1.36, + "grad_norm": 42.632144927978516, + "learning_rate": 1.0945173899969932e-05, + "loss": 1.7613, + "step": 4518 + }, + { + "epoch": 1.36, + "grad_norm": 19.947559356689453, + "learning_rate": 1.0943169289365541e-05, + "loss": 1.9094, + "step": 4519 + }, + { + "epoch": 1.36, + "grad_norm": 15.529866218566895, + "learning_rate": 1.0941164678761151e-05, + "loss": 1.3777, + "step": 4520 + }, + { + "epoch": 1.36, + "grad_norm": 21.871538162231445, + "learning_rate": 1.0939160068156763e-05, + "loss": 1.91, + "step": 4521 + }, + { + "epoch": 1.36, + "grad_norm": 18.05958366394043, + "learning_rate": 1.0937155457552371e-05, + "loss": 1.6152, + "step": 4522 + }, + { + "epoch": 1.36, + "grad_norm": 13.304652214050293, + "learning_rate": 1.0935150846947981e-05, + "loss": 1.4861, + "step": 4523 + }, + { + "epoch": 1.36, + "grad_norm": 72.59398651123047, + "learning_rate": 1.093314623634359e-05, + "loss": 2.0853, + "step": 4524 + }, + { + "epoch": 1.36, + "grad_norm": 11.113508224487305, + "learning_rate": 1.0931141625739201e-05, + "loss": 1.5427, + "step": 4525 + }, + { + "epoch": 1.36, + "grad_norm": 11.754325866699219, + "learning_rate": 1.0929137015134811e-05, + "loss": 1.8286, + "step": 4526 + }, + { + "epoch": 1.36, + "grad_norm": 16.077978134155273, + "learning_rate": 1.092713240453042e-05, + "loss": 1.3027, + "step": 4527 + }, + { + "epoch": 1.36, + "grad_norm": 14.799029350280762, + "learning_rate": 1.0925127793926032e-05, + "loss": 1.222, + "step": 4528 + }, + { + "epoch": 1.36, + "grad_norm": 12.316436767578125, + "learning_rate": 1.092312318332164e-05, + "loss": 1.5382, + "step": 4529 + }, + { + "epoch": 1.36, + "grad_norm": 11.659561157226562, + "learning_rate": 1.092111857271725e-05, + "loss": 1.0118, + "step": 4530 + }, + { + "epoch": 1.36, + "grad_norm": 10.741880416870117, + "learning_rate": 1.091911396211286e-05, + "loss": 1.1201, + "step": 4531 + }, + { + "epoch": 1.36, + "grad_norm": 17.27454376220703, + "learning_rate": 1.091710935150847e-05, + "loss": 1.8521, + "step": 4532 + }, + { + "epoch": 1.36, + "grad_norm": 19.821794509887695, + "learning_rate": 1.091510474090408e-05, + "loss": 1.1864, + "step": 4533 + }, + { + "epoch": 1.36, + "grad_norm": 42.41746520996094, + "learning_rate": 1.091310013029969e-05, + "loss": 1.7798, + "step": 4534 + }, + { + "epoch": 1.36, + "grad_norm": 17.059722900390625, + "learning_rate": 1.09110955196953e-05, + "loss": 2.3114, + "step": 4535 + }, + { + "epoch": 1.36, + "grad_norm": 38.7147331237793, + "learning_rate": 1.0909090909090909e-05, + "loss": 2.4257, + "step": 4536 + }, + { + "epoch": 1.36, + "grad_norm": 17.778711318969727, + "learning_rate": 1.090708629848652e-05, + "loss": 1.598, + "step": 4537 + }, + { + "epoch": 1.36, + "grad_norm": 55.522010803222656, + "learning_rate": 1.090508168788213e-05, + "loss": 1.7511, + "step": 4538 + }, + { + "epoch": 1.36, + "grad_norm": 31.59695053100586, + "learning_rate": 1.090307707727774e-05, + "loss": 1.6838, + "step": 4539 + }, + { + "epoch": 1.37, + "grad_norm": 14.828858375549316, + "learning_rate": 1.0901072466673351e-05, + "loss": 1.3515, + "step": 4540 + }, + { + "epoch": 1.37, + "grad_norm": 18.156906127929688, + "learning_rate": 1.089906785606896e-05, + "loss": 2.1229, + "step": 4541 + }, + { + "epoch": 1.37, + "grad_norm": 14.6245698928833, + "learning_rate": 1.089706324546457e-05, + "loss": 1.0952, + "step": 4542 + }, + { + "epoch": 1.37, + "grad_norm": 14.46635913848877, + "learning_rate": 1.0895058634860178e-05, + "loss": 1.6335, + "step": 4543 + }, + { + "epoch": 1.37, + "grad_norm": 11.85185432434082, + "learning_rate": 1.089305402425579e-05, + "loss": 1.6408, + "step": 4544 + }, + { + "epoch": 1.37, + "grad_norm": 9.316162109375, + "learning_rate": 1.08910494136514e-05, + "loss": 1.0231, + "step": 4545 + }, + { + "epoch": 1.37, + "grad_norm": 16.85337257385254, + "learning_rate": 1.0889044803047008e-05, + "loss": 1.486, + "step": 4546 + }, + { + "epoch": 1.37, + "grad_norm": 13.948755264282227, + "learning_rate": 1.088704019244262e-05, + "loss": 2.3643, + "step": 4547 + }, + { + "epoch": 1.37, + "grad_norm": 22.807424545288086, + "learning_rate": 1.0885035581838228e-05, + "loss": 1.6381, + "step": 4548 + }, + { + "epoch": 1.37, + "grad_norm": 17.668928146362305, + "learning_rate": 1.0883030971233838e-05, + "loss": 1.8671, + "step": 4549 + }, + { + "epoch": 1.37, + "grad_norm": 61.22587966918945, + "learning_rate": 1.088102636062945e-05, + "loss": 2.4726, + "step": 4550 + }, + { + "epoch": 1.37, + "grad_norm": 15.125883102416992, + "learning_rate": 1.0879021750025058e-05, + "loss": 1.6064, + "step": 4551 + }, + { + "epoch": 1.37, + "grad_norm": 12.281584739685059, + "learning_rate": 1.0877017139420669e-05, + "loss": 1.9036, + "step": 4552 + }, + { + "epoch": 1.37, + "grad_norm": 20.28942108154297, + "learning_rate": 1.0875012528816277e-05, + "loss": 2.022, + "step": 4553 + }, + { + "epoch": 1.37, + "grad_norm": 12.765894889831543, + "learning_rate": 1.0873007918211889e-05, + "loss": 1.2068, + "step": 4554 + }, + { + "epoch": 1.37, + "grad_norm": 14.457947731018066, + "learning_rate": 1.0871003307607497e-05, + "loss": 1.2275, + "step": 4555 + }, + { + "epoch": 1.37, + "grad_norm": 26.644542694091797, + "learning_rate": 1.0868998697003109e-05, + "loss": 3.3596, + "step": 4556 + }, + { + "epoch": 1.37, + "grad_norm": 17.826154708862305, + "learning_rate": 1.0866994086398719e-05, + "loss": 2.4254, + "step": 4557 + }, + { + "epoch": 1.37, + "grad_norm": 26.188709259033203, + "learning_rate": 1.0864989475794327e-05, + "loss": 1.8766, + "step": 4558 + }, + { + "epoch": 1.37, + "grad_norm": 15.313033103942871, + "learning_rate": 1.0862984865189939e-05, + "loss": 1.5112, + "step": 4559 + }, + { + "epoch": 1.37, + "grad_norm": 15.900823593139648, + "learning_rate": 1.0860980254585548e-05, + "loss": 2.0939, + "step": 4560 + }, + { + "epoch": 1.37, + "eval_loss": 0.21247684955596924, + "eval_runtime": 43.2712, + "eval_samples_per_second": 34.18, + "eval_steps_per_second": 34.18, + "step": 4560 + }, + { + "epoch": 1.37, + "grad_norm": 9.430264472961426, + "learning_rate": 1.0858975643981158e-05, + "loss": 1.7693, + "step": 4561 + }, + { + "epoch": 1.37, + "grad_norm": 9.332898139953613, + "learning_rate": 1.0856971033376766e-05, + "loss": 1.048, + "step": 4562 + }, + { + "epoch": 1.37, + "grad_norm": 15.586241722106934, + "learning_rate": 1.0854966422772378e-05, + "loss": 1.3119, + "step": 4563 + }, + { + "epoch": 1.37, + "grad_norm": 23.395587921142578, + "learning_rate": 1.0852961812167988e-05, + "loss": 2.1297, + "step": 4564 + }, + { + "epoch": 1.37, + "grad_norm": 11.959819793701172, + "learning_rate": 1.0850957201563596e-05, + "loss": 1.2409, + "step": 4565 + }, + { + "epoch": 1.37, + "grad_norm": 9.007245063781738, + "learning_rate": 1.0848952590959208e-05, + "loss": 1.714, + "step": 4566 + }, + { + "epoch": 1.37, + "grad_norm": 56.485538482666016, + "learning_rate": 1.0846947980354816e-05, + "loss": 2.3396, + "step": 4567 + }, + { + "epoch": 1.37, + "grad_norm": 12.010300636291504, + "learning_rate": 1.0844943369750427e-05, + "loss": 2.0262, + "step": 4568 + }, + { + "epoch": 1.37, + "grad_norm": 37.940059661865234, + "learning_rate": 1.0842938759146038e-05, + "loss": 1.7123, + "step": 4569 + }, + { + "epoch": 1.37, + "grad_norm": 13.00949764251709, + "learning_rate": 1.0840934148541647e-05, + "loss": 1.3661, + "step": 4570 + }, + { + "epoch": 1.37, + "grad_norm": 15.618173599243164, + "learning_rate": 1.0838929537937257e-05, + "loss": 1.9199, + "step": 4571 + }, + { + "epoch": 1.37, + "grad_norm": 22.606422424316406, + "learning_rate": 1.0836924927332865e-05, + "loss": 1.8386, + "step": 4572 + }, + { + "epoch": 1.37, + "grad_norm": 13.844141960144043, + "learning_rate": 1.0834920316728477e-05, + "loss": 1.3548, + "step": 4573 + }, + { + "epoch": 1.38, + "grad_norm": 11.279787063598633, + "learning_rate": 1.0832915706124085e-05, + "loss": 1.5721, + "step": 4574 + }, + { + "epoch": 1.38, + "grad_norm": 13.838190078735352, + "learning_rate": 1.0830911095519695e-05, + "loss": 1.6001, + "step": 4575 + }, + { + "epoch": 1.38, + "grad_norm": 18.21175765991211, + "learning_rate": 1.0828906484915307e-05, + "loss": 1.9498, + "step": 4576 + }, + { + "epoch": 1.38, + "grad_norm": 11.330121994018555, + "learning_rate": 1.0826901874310916e-05, + "loss": 1.3935, + "step": 4577 + }, + { + "epoch": 1.38, + "grad_norm": 18.719507217407227, + "learning_rate": 1.0824897263706526e-05, + "loss": 1.4156, + "step": 4578 + }, + { + "epoch": 1.38, + "grad_norm": 49.91222381591797, + "learning_rate": 1.0822892653102136e-05, + "loss": 2.8363, + "step": 4579 + }, + { + "epoch": 1.38, + "grad_norm": 32.154563903808594, + "learning_rate": 1.0820888042497746e-05, + "loss": 2.5501, + "step": 4580 + }, + { + "epoch": 1.38, + "grad_norm": 8.45114803314209, + "learning_rate": 1.0818883431893356e-05, + "loss": 1.6866, + "step": 4581 + }, + { + "epoch": 1.38, + "grad_norm": 44.169677734375, + "learning_rate": 1.0816878821288966e-05, + "loss": 2.4302, + "step": 4582 + }, + { + "epoch": 1.38, + "grad_norm": 25.283023834228516, + "learning_rate": 1.0814874210684576e-05, + "loss": 2.2415, + "step": 4583 + }, + { + "epoch": 1.38, + "grad_norm": 16.97034454345703, + "learning_rate": 1.0812869600080184e-05, + "loss": 1.8726, + "step": 4584 + }, + { + "epoch": 1.38, + "grad_norm": 9.19655990600586, + "learning_rate": 1.0810864989475796e-05, + "loss": 1.1808, + "step": 4585 + }, + { + "epoch": 1.38, + "grad_norm": 18.277463912963867, + "learning_rate": 1.0808860378871405e-05, + "loss": 1.8823, + "step": 4586 + }, + { + "epoch": 1.38, + "grad_norm": 21.819673538208008, + "learning_rate": 1.0806855768267015e-05, + "loss": 2.0347, + "step": 4587 + }, + { + "epoch": 1.38, + "grad_norm": 9.92550277709961, + "learning_rate": 1.0804851157662627e-05, + "loss": 1.032, + "step": 4588 + }, + { + "epoch": 1.38, + "grad_norm": 22.52004623413086, + "learning_rate": 1.0802846547058235e-05, + "loss": 2.1637, + "step": 4589 + }, + { + "epoch": 1.38, + "grad_norm": 18.34803581237793, + "learning_rate": 1.0800841936453845e-05, + "loss": 1.9793, + "step": 4590 + }, + { + "epoch": 1.38, + "grad_norm": 24.83062171936035, + "learning_rate": 1.0798837325849453e-05, + "loss": 2.2503, + "step": 4591 + }, + { + "epoch": 1.38, + "grad_norm": 31.985370635986328, + "learning_rate": 1.0796832715245065e-05, + "loss": 1.9019, + "step": 4592 + }, + { + "epoch": 1.38, + "grad_norm": 11.606969833374023, + "learning_rate": 1.0794828104640674e-05, + "loss": 1.5377, + "step": 4593 + }, + { + "epoch": 1.38, + "grad_norm": 14.753523826599121, + "learning_rate": 1.0792823494036284e-05, + "loss": 1.6545, + "step": 4594 + }, + { + "epoch": 1.38, + "grad_norm": 19.58027458190918, + "learning_rate": 1.0790818883431895e-05, + "loss": 1.7343, + "step": 4595 + }, + { + "epoch": 1.38, + "grad_norm": 17.902088165283203, + "learning_rate": 1.0788814272827504e-05, + "loss": 1.763, + "step": 4596 + }, + { + "epoch": 1.38, + "grad_norm": 10.415716171264648, + "learning_rate": 1.0786809662223114e-05, + "loss": 1.6063, + "step": 4597 + }, + { + "epoch": 1.38, + "grad_norm": 13.281739234924316, + "learning_rate": 1.0784805051618722e-05, + "loss": 1.8627, + "step": 4598 + }, + { + "epoch": 1.38, + "grad_norm": 10.292632102966309, + "learning_rate": 1.0782800441014334e-05, + "loss": 1.7306, + "step": 4599 + }, + { + "epoch": 1.38, + "grad_norm": 12.653688430786133, + "learning_rate": 1.0780795830409944e-05, + "loss": 1.3753, + "step": 4600 + }, + { + "epoch": 1.38, + "grad_norm": 26.442541122436523, + "learning_rate": 1.0778791219805553e-05, + "loss": 2.2257, + "step": 4601 + }, + { + "epoch": 1.38, + "grad_norm": 18.668859481811523, + "learning_rate": 1.0776786609201164e-05, + "loss": 1.8788, + "step": 4602 + }, + { + "epoch": 1.38, + "grad_norm": 19.046268463134766, + "learning_rate": 1.0774781998596773e-05, + "loss": 2.1165, + "step": 4603 + }, + { + "epoch": 1.38, + "grad_norm": 11.989215850830078, + "learning_rate": 1.0772777387992383e-05, + "loss": 1.2069, + "step": 4604 + }, + { + "epoch": 1.38, + "grad_norm": 16.762723922729492, + "learning_rate": 1.0770772777387993e-05, + "loss": 1.3351, + "step": 4605 + }, + { + "epoch": 1.38, + "grad_norm": 21.549854278564453, + "learning_rate": 1.0768768166783603e-05, + "loss": 1.8815, + "step": 4606 + }, + { + "epoch": 1.39, + "grad_norm": 46.833518981933594, + "learning_rate": 1.0766763556179213e-05, + "loss": 2.6215, + "step": 4607 + }, + { + "epoch": 1.39, + "grad_norm": 13.283019065856934, + "learning_rate": 1.0764758945574823e-05, + "loss": 1.9159, + "step": 4608 + }, + { + "epoch": 1.39, + "grad_norm": 41.622859954833984, + "learning_rate": 1.0762754334970433e-05, + "loss": 1.4153, + "step": 4609 + }, + { + "epoch": 1.39, + "grad_norm": 9.129386901855469, + "learning_rate": 1.0760749724366042e-05, + "loss": 1.0581, + "step": 4610 + }, + { + "epoch": 1.39, + "grad_norm": 63.40047836303711, + "learning_rate": 1.0758745113761653e-05, + "loss": 1.7917, + "step": 4611 + }, + { + "epoch": 1.39, + "grad_norm": 119.53295135498047, + "learning_rate": 1.0756740503157263e-05, + "loss": 1.7074, + "step": 4612 + }, + { + "epoch": 1.39, + "grad_norm": 15.246512413024902, + "learning_rate": 1.0754735892552872e-05, + "loss": 1.3584, + "step": 4613 + }, + { + "epoch": 1.39, + "grad_norm": 12.469449043273926, + "learning_rate": 1.0752731281948484e-05, + "loss": 1.8839, + "step": 4614 + }, + { + "epoch": 1.39, + "grad_norm": 38.270023345947266, + "learning_rate": 1.0750726671344092e-05, + "loss": 1.4275, + "step": 4615 + }, + { + "epoch": 1.39, + "grad_norm": 16.700550079345703, + "learning_rate": 1.0748722060739702e-05, + "loss": 1.771, + "step": 4616 + }, + { + "epoch": 1.39, + "grad_norm": 23.500558853149414, + "learning_rate": 1.074671745013531e-05, + "loss": 2.2971, + "step": 4617 + }, + { + "epoch": 1.39, + "grad_norm": 17.58616065979004, + "learning_rate": 1.0744712839530922e-05, + "loss": 1.05, + "step": 4618 + }, + { + "epoch": 1.39, + "grad_norm": 12.4642972946167, + "learning_rate": 1.0742708228926532e-05, + "loss": 1.1186, + "step": 4619 + }, + { + "epoch": 1.39, + "grad_norm": 12.4574556350708, + "learning_rate": 1.074070361832214e-05, + "loss": 1.4973, + "step": 4620 + }, + { + "epoch": 1.39, + "grad_norm": 10.377580642700195, + "learning_rate": 1.0738699007717753e-05, + "loss": 1.8076, + "step": 4621 + }, + { + "epoch": 1.39, + "grad_norm": 19.34781265258789, + "learning_rate": 1.0736694397113361e-05, + "loss": 2.2722, + "step": 4622 + }, + { + "epoch": 1.39, + "grad_norm": 16.955293655395508, + "learning_rate": 1.0734689786508971e-05, + "loss": 1.7219, + "step": 4623 + }, + { + "epoch": 1.39, + "grad_norm": 18.286012649536133, + "learning_rate": 1.0732685175904583e-05, + "loss": 2.9251, + "step": 4624 + }, + { + "epoch": 1.39, + "grad_norm": 19.848609924316406, + "learning_rate": 1.0730680565300191e-05, + "loss": 2.2197, + "step": 4625 + }, + { + "epoch": 1.39, + "grad_norm": 10.013169288635254, + "learning_rate": 1.0728675954695801e-05, + "loss": 0.7125, + "step": 4626 + }, + { + "epoch": 1.39, + "grad_norm": 20.401023864746094, + "learning_rate": 1.0726671344091411e-05, + "loss": 1.5097, + "step": 4627 + }, + { + "epoch": 1.39, + "grad_norm": 20.424652099609375, + "learning_rate": 1.0724666733487021e-05, + "loss": 1.6132, + "step": 4628 + }, + { + "epoch": 1.39, + "grad_norm": 17.268680572509766, + "learning_rate": 1.072266212288263e-05, + "loss": 1.589, + "step": 4629 + }, + { + "epoch": 1.39, + "grad_norm": 10.245696067810059, + "learning_rate": 1.0720657512278242e-05, + "loss": 2.4915, + "step": 4630 + }, + { + "epoch": 1.39, + "grad_norm": 21.644811630249023, + "learning_rate": 1.0718652901673852e-05, + "loss": 2.4812, + "step": 4631 + }, + { + "epoch": 1.39, + "grad_norm": 19.70071792602539, + "learning_rate": 1.071664829106946e-05, + "loss": 1.8817, + "step": 4632 + }, + { + "epoch": 1.39, + "grad_norm": 22.212900161743164, + "learning_rate": 1.0714643680465072e-05, + "loss": 1.9495, + "step": 4633 + }, + { + "epoch": 1.39, + "grad_norm": 11.387062072753906, + "learning_rate": 1.071263906986068e-05, + "loss": 1.3702, + "step": 4634 + }, + { + "epoch": 1.39, + "grad_norm": 10.107064247131348, + "learning_rate": 1.071063445925629e-05, + "loss": 1.0777, + "step": 4635 + }, + { + "epoch": 1.39, + "grad_norm": 34.227264404296875, + "learning_rate": 1.0708629848651899e-05, + "loss": 1.9404, + "step": 4636 + }, + { + "epoch": 1.39, + "grad_norm": 12.135993003845215, + "learning_rate": 1.070662523804751e-05, + "loss": 1.2095, + "step": 4637 + }, + { + "epoch": 1.39, + "grad_norm": 15.475272178649902, + "learning_rate": 1.070462062744312e-05, + "loss": 1.4376, + "step": 4638 + }, + { + "epoch": 1.39, + "grad_norm": 29.245624542236328, + "learning_rate": 1.0702616016838729e-05, + "loss": 2.0634, + "step": 4639 + }, + { + "epoch": 1.4, + "grad_norm": 33.319969177246094, + "learning_rate": 1.070061140623434e-05, + "loss": 1.772, + "step": 4640 + }, + { + "epoch": 1.4, + "grad_norm": 15.358686447143555, + "learning_rate": 1.0698606795629949e-05, + "loss": 1.5593, + "step": 4641 + }, + { + "epoch": 1.4, + "grad_norm": 13.498563766479492, + "learning_rate": 1.069660218502556e-05, + "loss": 1.8836, + "step": 4642 + }, + { + "epoch": 1.4, + "grad_norm": 11.653413772583008, + "learning_rate": 1.0694597574421171e-05, + "loss": 1.3999, + "step": 4643 + }, + { + "epoch": 1.4, + "grad_norm": 7.912506103515625, + "learning_rate": 1.069259296381678e-05, + "loss": 1.0855, + "step": 4644 + }, + { + "epoch": 1.4, + "grad_norm": 18.94805908203125, + "learning_rate": 1.069058835321239e-05, + "loss": 2.4709, + "step": 4645 + }, + { + "epoch": 1.4, + "grad_norm": 14.374813079833984, + "learning_rate": 1.0688583742607998e-05, + "loss": 1.4454, + "step": 4646 + }, + { + "epoch": 1.4, + "grad_norm": 77.0107650756836, + "learning_rate": 1.068657913200361e-05, + "loss": 2.156, + "step": 4647 + }, + { + "epoch": 1.4, + "grad_norm": 14.53071403503418, + "learning_rate": 1.0684574521399218e-05, + "loss": 1.3087, + "step": 4648 + }, + { + "epoch": 1.4, + "grad_norm": 18.24622917175293, + "learning_rate": 1.0682569910794828e-05, + "loss": 2.3464, + "step": 4649 + }, + { + "epoch": 1.4, + "grad_norm": 7.772927761077881, + "learning_rate": 1.068056530019044e-05, + "loss": 1.3107, + "step": 4650 + }, + { + "epoch": 1.4, + "grad_norm": 4.152658462524414, + "learning_rate": 1.0678560689586048e-05, + "loss": 0.467, + "step": 4651 + }, + { + "epoch": 1.4, + "grad_norm": 14.69251537322998, + "learning_rate": 1.0676556078981658e-05, + "loss": 1.5746, + "step": 4652 + }, + { + "epoch": 1.4, + "grad_norm": 15.464540481567383, + "learning_rate": 1.0674551468377268e-05, + "loss": 1.5305, + "step": 4653 + }, + { + "epoch": 1.4, + "grad_norm": 11.303486824035645, + "learning_rate": 1.0672546857772879e-05, + "loss": 1.8223, + "step": 4654 + }, + { + "epoch": 1.4, + "grad_norm": 23.572040557861328, + "learning_rate": 1.0670542247168489e-05, + "loss": 1.9877, + "step": 4655 + }, + { + "epoch": 1.4, + "grad_norm": 22.017297744750977, + "learning_rate": 1.0668537636564099e-05, + "loss": 1.2985, + "step": 4656 + }, + { + "epoch": 1.4, + "grad_norm": 13.40971851348877, + "learning_rate": 1.0666533025959709e-05, + "loss": 1.3887, + "step": 4657 + }, + { + "epoch": 1.4, + "grad_norm": 17.30711555480957, + "learning_rate": 1.0664528415355317e-05, + "loss": 1.6859, + "step": 4658 + }, + { + "epoch": 1.4, + "grad_norm": 8.817755699157715, + "learning_rate": 1.0662523804750929e-05, + "loss": 1.308, + "step": 4659 + }, + { + "epoch": 1.4, + "grad_norm": 29.949804306030273, + "learning_rate": 1.0660519194146537e-05, + "loss": 1.2951, + "step": 4660 + }, + { + "epoch": 1.4, + "grad_norm": 83.96745300292969, + "learning_rate": 1.0658514583542147e-05, + "loss": 2.6652, + "step": 4661 + }, + { + "epoch": 1.4, + "grad_norm": 9.81541919708252, + "learning_rate": 1.065650997293776e-05, + "loss": 0.9763, + "step": 4662 + }, + { + "epoch": 1.4, + "grad_norm": 24.451271057128906, + "learning_rate": 1.0654505362333368e-05, + "loss": 2.2991, + "step": 4663 + }, + { + "epoch": 1.4, + "grad_norm": 21.38994789123535, + "learning_rate": 1.0652500751728978e-05, + "loss": 1.4465, + "step": 4664 + }, + { + "epoch": 1.4, + "grad_norm": 9.186866760253906, + "learning_rate": 1.0650496141124586e-05, + "loss": 0.5958, + "step": 4665 + }, + { + "epoch": 1.4, + "grad_norm": 18.036312103271484, + "learning_rate": 1.0648491530520198e-05, + "loss": 1.7996, + "step": 4666 + }, + { + "epoch": 1.4, + "grad_norm": 13.11405086517334, + "learning_rate": 1.0646486919915808e-05, + "loss": 0.8623, + "step": 4667 + }, + { + "epoch": 1.4, + "grad_norm": 15.945577621459961, + "learning_rate": 1.0644482309311416e-05, + "loss": 1.5742, + "step": 4668 + }, + { + "epoch": 1.4, + "grad_norm": 10.415094375610352, + "learning_rate": 1.0642477698707028e-05, + "loss": 1.0011, + "step": 4669 + }, + { + "epoch": 1.4, + "grad_norm": 13.777579307556152, + "learning_rate": 1.0640473088102636e-05, + "loss": 1.6847, + "step": 4670 + }, + { + "epoch": 1.4, + "grad_norm": 29.36578369140625, + "learning_rate": 1.0638468477498247e-05, + "loss": 1.2423, + "step": 4671 + }, + { + "epoch": 1.4, + "grad_norm": 26.05026626586914, + "learning_rate": 1.0636463866893855e-05, + "loss": 1.976, + "step": 4672 + }, + { + "epoch": 1.4, + "grad_norm": 26.071765899658203, + "learning_rate": 1.0634459256289467e-05, + "loss": 2.2839, + "step": 4673 + }, + { + "epoch": 1.41, + "grad_norm": 16.8984317779541, + "learning_rate": 1.0632454645685077e-05, + "loss": 1.3912, + "step": 4674 + }, + { + "epoch": 1.41, + "grad_norm": 65.57967376708984, + "learning_rate": 1.0630450035080687e-05, + "loss": 2.2416, + "step": 4675 + }, + { + "epoch": 1.41, + "grad_norm": 12.922877311706543, + "learning_rate": 1.0628445424476297e-05, + "loss": 1.5951, + "step": 4676 + }, + { + "epoch": 1.41, + "grad_norm": 15.637014389038086, + "learning_rate": 1.0626440813871905e-05, + "loss": 1.9517, + "step": 4677 + }, + { + "epoch": 1.41, + "grad_norm": 12.687530517578125, + "learning_rate": 1.0624436203267517e-05, + "loss": 1.3298, + "step": 4678 + }, + { + "epoch": 1.41, + "grad_norm": 47.006107330322266, + "learning_rate": 1.0622431592663126e-05, + "loss": 2.6819, + "step": 4679 + }, + { + "epoch": 1.41, + "grad_norm": 12.535234451293945, + "learning_rate": 1.0620426982058736e-05, + "loss": 1.4554, + "step": 4680 + }, + { + "epoch": 1.41, + "eval_loss": 0.2101414054632187, + "eval_runtime": 43.0905, + "eval_samples_per_second": 34.323, + "eval_steps_per_second": 34.323, + "step": 4680 + }, + { + "epoch": 1.41, + "grad_norm": 9.197644233703613, + "learning_rate": 1.0618422371454347e-05, + "loss": 1.1242, + "step": 4681 + }, + { + "epoch": 1.41, + "grad_norm": 11.781989097595215, + "learning_rate": 1.0616417760849956e-05, + "loss": 1.9253, + "step": 4682 + }, + { + "epoch": 1.41, + "grad_norm": 41.59600830078125, + "learning_rate": 1.0614413150245566e-05, + "loss": 2.3303, + "step": 4683 + }, + { + "epoch": 1.41, + "grad_norm": 13.190827369689941, + "learning_rate": 1.0612408539641174e-05, + "loss": 0.8268, + "step": 4684 + }, + { + "epoch": 1.41, + "grad_norm": 57.965003967285156, + "learning_rate": 1.0610403929036786e-05, + "loss": 2.375, + "step": 4685 + }, + { + "epoch": 1.41, + "grad_norm": 21.654773712158203, + "learning_rate": 1.0608399318432396e-05, + "loss": 2.0238, + "step": 4686 + }, + { + "epoch": 1.41, + "grad_norm": 21.622922897338867, + "learning_rate": 1.0606394707828005e-05, + "loss": 1.8048, + "step": 4687 + }, + { + "epoch": 1.41, + "grad_norm": 21.0383358001709, + "learning_rate": 1.0604390097223616e-05, + "loss": 2.2016, + "step": 4688 + }, + { + "epoch": 1.41, + "grad_norm": 25.455896377563477, + "learning_rate": 1.0602385486619225e-05, + "loss": 1.9829, + "step": 4689 + }, + { + "epoch": 1.41, + "grad_norm": 27.83086395263672, + "learning_rate": 1.0600380876014835e-05, + "loss": 1.637, + "step": 4690 + }, + { + "epoch": 1.41, + "grad_norm": 14.252532005310059, + "learning_rate": 1.0598376265410443e-05, + "loss": 0.7723, + "step": 4691 + }, + { + "epoch": 1.41, + "grad_norm": 16.23126220703125, + "learning_rate": 1.0596371654806055e-05, + "loss": 1.5121, + "step": 4692 + }, + { + "epoch": 1.41, + "grad_norm": 26.036483764648438, + "learning_rate": 1.0594367044201665e-05, + "loss": 2.4891, + "step": 4693 + }, + { + "epoch": 1.41, + "grad_norm": 11.481901168823242, + "learning_rate": 1.0592362433597273e-05, + "loss": 1.8866, + "step": 4694 + }, + { + "epoch": 1.41, + "grad_norm": 15.858734130859375, + "learning_rate": 1.0590357822992885e-05, + "loss": 1.6499, + "step": 4695 + }, + { + "epoch": 1.41, + "grad_norm": 42.61335754394531, + "learning_rate": 1.0588353212388494e-05, + "loss": 2.7187, + "step": 4696 + }, + { + "epoch": 1.41, + "grad_norm": 18.403488159179688, + "learning_rate": 1.0586348601784104e-05, + "loss": 1.5052, + "step": 4697 + }, + { + "epoch": 1.41, + "grad_norm": 12.110722541809082, + "learning_rate": 1.0584343991179715e-05, + "loss": 1.499, + "step": 4698 + }, + { + "epoch": 1.41, + "grad_norm": 22.188568115234375, + "learning_rate": 1.0582339380575324e-05, + "loss": 2.2641, + "step": 4699 + }, + { + "epoch": 1.41, + "grad_norm": 22.298145294189453, + "learning_rate": 1.0580334769970934e-05, + "loss": 1.9698, + "step": 4700 + }, + { + "epoch": 1.41, + "grad_norm": 16.0262508392334, + "learning_rate": 1.0578330159366544e-05, + "loss": 1.4256, + "step": 4701 + }, + { + "epoch": 1.41, + "grad_norm": 7.636526584625244, + "learning_rate": 1.0576325548762154e-05, + "loss": 0.9309, + "step": 4702 + }, + { + "epoch": 1.41, + "grad_norm": 16.564451217651367, + "learning_rate": 1.0574320938157762e-05, + "loss": 0.9893, + "step": 4703 + }, + { + "epoch": 1.41, + "grad_norm": 12.611454963684082, + "learning_rate": 1.0572316327553374e-05, + "loss": 1.1581, + "step": 4704 + }, + { + "epoch": 1.41, + "grad_norm": 19.108123779296875, + "learning_rate": 1.0570311716948984e-05, + "loss": 2.3804, + "step": 4705 + }, + { + "epoch": 1.41, + "grad_norm": 28.391921997070312, + "learning_rate": 1.0568307106344593e-05, + "loss": 1.5563, + "step": 4706 + }, + { + "epoch": 1.42, + "grad_norm": 14.404617309570312, + "learning_rate": 1.0566302495740205e-05, + "loss": 1.6261, + "step": 4707 + }, + { + "epoch": 1.42, + "grad_norm": 16.0683650970459, + "learning_rate": 1.0564297885135813e-05, + "loss": 1.5423, + "step": 4708 + }, + { + "epoch": 1.42, + "grad_norm": 15.672560691833496, + "learning_rate": 1.0562293274531423e-05, + "loss": 1.2338, + "step": 4709 + }, + { + "epoch": 1.42, + "grad_norm": 63.09310531616211, + "learning_rate": 1.0560288663927035e-05, + "loss": 3.1458, + "step": 4710 + }, + { + "epoch": 1.42, + "grad_norm": 31.956239700317383, + "learning_rate": 1.0558284053322643e-05, + "loss": 1.1543, + "step": 4711 + }, + { + "epoch": 1.42, + "grad_norm": 18.227100372314453, + "learning_rate": 1.0556279442718253e-05, + "loss": 1.5385, + "step": 4712 + }, + { + "epoch": 1.42, + "grad_norm": 17.809656143188477, + "learning_rate": 1.0554274832113862e-05, + "loss": 1.6656, + "step": 4713 + }, + { + "epoch": 1.42, + "grad_norm": 14.160889625549316, + "learning_rate": 1.0552270221509473e-05, + "loss": 1.4603, + "step": 4714 + }, + { + "epoch": 1.42, + "grad_norm": 19.26714324951172, + "learning_rate": 1.0550265610905082e-05, + "loss": 1.4625, + "step": 4715 + }, + { + "epoch": 1.42, + "grad_norm": 14.98919677734375, + "learning_rate": 1.0548261000300692e-05, + "loss": 1.7723, + "step": 4716 + }, + { + "epoch": 1.42, + "grad_norm": 15.679314613342285, + "learning_rate": 1.0546256389696304e-05, + "loss": 2.7869, + "step": 4717 + }, + { + "epoch": 1.42, + "grad_norm": 21.745820999145508, + "learning_rate": 1.0544251779091912e-05, + "loss": 1.7639, + "step": 4718 + }, + { + "epoch": 1.42, + "grad_norm": 36.71981430053711, + "learning_rate": 1.0542247168487522e-05, + "loss": 1.5607, + "step": 4719 + }, + { + "epoch": 1.42, + "grad_norm": 37.882877349853516, + "learning_rate": 1.054024255788313e-05, + "loss": 2.3179, + "step": 4720 + }, + { + "epoch": 1.42, + "grad_norm": 13.184647560119629, + "learning_rate": 1.0538237947278742e-05, + "loss": 2.0276, + "step": 4721 + }, + { + "epoch": 1.42, + "grad_norm": 7.884564399719238, + "learning_rate": 1.053623333667435e-05, + "loss": 2.1323, + "step": 4722 + }, + { + "epoch": 1.42, + "grad_norm": 13.331478118896484, + "learning_rate": 1.053422872606996e-05, + "loss": 1.2308, + "step": 4723 + }, + { + "epoch": 1.42, + "grad_norm": 10.356979370117188, + "learning_rate": 1.0532224115465573e-05, + "loss": 1.2406, + "step": 4724 + }, + { + "epoch": 1.42, + "grad_norm": 27.083091735839844, + "learning_rate": 1.0530219504861181e-05, + "loss": 1.3356, + "step": 4725 + }, + { + "epoch": 1.42, + "grad_norm": 14.349416732788086, + "learning_rate": 1.0528214894256791e-05, + "loss": 1.337, + "step": 4726 + }, + { + "epoch": 1.42, + "grad_norm": 13.042084693908691, + "learning_rate": 1.0526210283652401e-05, + "loss": 1.3127, + "step": 4727 + }, + { + "epoch": 1.42, + "grad_norm": 39.22003936767578, + "learning_rate": 1.0524205673048011e-05, + "loss": 1.7421, + "step": 4728 + }, + { + "epoch": 1.42, + "grad_norm": 7.622201919555664, + "learning_rate": 1.0522201062443623e-05, + "loss": 0.9831, + "step": 4729 + }, + { + "epoch": 1.42, + "grad_norm": 31.560455322265625, + "learning_rate": 1.0520196451839231e-05, + "loss": 2.0777, + "step": 4730 + }, + { + "epoch": 1.42, + "grad_norm": 13.57126235961914, + "learning_rate": 1.0518191841234841e-05, + "loss": 1.4528, + "step": 4731 + }, + { + "epoch": 1.42, + "grad_norm": 25.839723587036133, + "learning_rate": 1.051618723063045e-05, + "loss": 1.3498, + "step": 4732 + }, + { + "epoch": 1.42, + "grad_norm": 8.843565940856934, + "learning_rate": 1.0514182620026062e-05, + "loss": 2.0511, + "step": 4733 + }, + { + "epoch": 1.42, + "grad_norm": 13.93571949005127, + "learning_rate": 1.051217800942167e-05, + "loss": 1.4334, + "step": 4734 + }, + { + "epoch": 1.42, + "grad_norm": 32.184539794921875, + "learning_rate": 1.051017339881728e-05, + "loss": 2.1138, + "step": 4735 + }, + { + "epoch": 1.42, + "grad_norm": 12.731632232666016, + "learning_rate": 1.0508168788212892e-05, + "loss": 1.506, + "step": 4736 + }, + { + "epoch": 1.42, + "grad_norm": 9.572550773620605, + "learning_rate": 1.05061641776085e-05, + "loss": 2.4816, + "step": 4737 + }, + { + "epoch": 1.42, + "grad_norm": 13.963497161865234, + "learning_rate": 1.050415956700411e-05, + "loss": 1.4354, + "step": 4738 + }, + { + "epoch": 1.42, + "grad_norm": 19.97682762145996, + "learning_rate": 1.0502154956399719e-05, + "loss": 1.7492, + "step": 4739 + }, + { + "epoch": 1.43, + "grad_norm": 11.23585033416748, + "learning_rate": 1.050015034579533e-05, + "loss": 1.396, + "step": 4740 + }, + { + "epoch": 1.43, + "grad_norm": 21.9536075592041, + "learning_rate": 1.049814573519094e-05, + "loss": 1.1373, + "step": 4741 + }, + { + "epoch": 1.43, + "grad_norm": 15.670050621032715, + "learning_rate": 1.0496141124586549e-05, + "loss": 1.9185, + "step": 4742 + }, + { + "epoch": 1.43, + "grad_norm": 18.927589416503906, + "learning_rate": 1.049413651398216e-05, + "loss": 1.7097, + "step": 4743 + }, + { + "epoch": 1.43, + "grad_norm": 14.530732154846191, + "learning_rate": 1.049213190337777e-05, + "loss": 1.1204, + "step": 4744 + }, + { + "epoch": 1.43, + "grad_norm": 29.094755172729492, + "learning_rate": 1.049012729277338e-05, + "loss": 1.787, + "step": 4745 + }, + { + "epoch": 1.43, + "grad_norm": 25.101224899291992, + "learning_rate": 1.048812268216899e-05, + "loss": 2.8473, + "step": 4746 + }, + { + "epoch": 1.43, + "grad_norm": 17.750200271606445, + "learning_rate": 1.04861180715646e-05, + "loss": 1.8474, + "step": 4747 + }, + { + "epoch": 1.43, + "grad_norm": 22.220870971679688, + "learning_rate": 1.048411346096021e-05, + "loss": 1.8063, + "step": 4748 + }, + { + "epoch": 1.43, + "grad_norm": 7.666170120239258, + "learning_rate": 1.048210885035582e-05, + "loss": 1.4058, + "step": 4749 + }, + { + "epoch": 1.43, + "grad_norm": 28.113630294799805, + "learning_rate": 1.048010423975143e-05, + "loss": 2.1896, + "step": 4750 + }, + { + "epoch": 1.43, + "grad_norm": 13.658616065979004, + "learning_rate": 1.0478099629147038e-05, + "loss": 1.7455, + "step": 4751 + }, + { + "epoch": 1.43, + "grad_norm": 11.242774963378906, + "learning_rate": 1.047609501854265e-05, + "loss": 1.9807, + "step": 4752 + }, + { + "epoch": 1.43, + "grad_norm": 11.421576499938965, + "learning_rate": 1.047409040793826e-05, + "loss": 1.5406, + "step": 4753 + }, + { + "epoch": 1.43, + "grad_norm": 24.668371200561523, + "learning_rate": 1.0472085797333868e-05, + "loss": 2.2752, + "step": 4754 + }, + { + "epoch": 1.43, + "grad_norm": 24.07050132751465, + "learning_rate": 1.047008118672948e-05, + "loss": 1.4799, + "step": 4755 + }, + { + "epoch": 1.43, + "grad_norm": 38.27885055541992, + "learning_rate": 1.0468076576125088e-05, + "loss": 1.5369, + "step": 4756 + }, + { + "epoch": 1.43, + "grad_norm": 20.224210739135742, + "learning_rate": 1.0466071965520699e-05, + "loss": 1.9254, + "step": 4757 + }, + { + "epoch": 1.43, + "grad_norm": 10.304624557495117, + "learning_rate": 1.0464067354916307e-05, + "loss": 1.5751, + "step": 4758 + }, + { + "epoch": 1.43, + "grad_norm": 23.387136459350586, + "learning_rate": 1.0462062744311919e-05, + "loss": 1.2816, + "step": 4759 + }, + { + "epoch": 1.43, + "grad_norm": 8.415315628051758, + "learning_rate": 1.0460058133707529e-05, + "loss": 1.2024, + "step": 4760 + }, + { + "epoch": 1.43, + "grad_norm": 15.556396484375, + "learning_rate": 1.0458053523103137e-05, + "loss": 1.7196, + "step": 4761 + }, + { + "epoch": 1.43, + "grad_norm": 70.80048370361328, + "learning_rate": 1.0456048912498749e-05, + "loss": 1.6606, + "step": 4762 + }, + { + "epoch": 1.43, + "grad_norm": 8.032504081726074, + "learning_rate": 1.0454044301894357e-05, + "loss": 1.4636, + "step": 4763 + }, + { + "epoch": 1.43, + "grad_norm": 13.399280548095703, + "learning_rate": 1.0452039691289967e-05, + "loss": 2.0113, + "step": 4764 + }, + { + "epoch": 1.43, + "grad_norm": 10.88759708404541, + "learning_rate": 1.0450035080685576e-05, + "loss": 1.7287, + "step": 4765 + }, + { + "epoch": 1.43, + "grad_norm": 46.82052230834961, + "learning_rate": 1.0448030470081188e-05, + "loss": 2.7536, + "step": 4766 + }, + { + "epoch": 1.43, + "grad_norm": 19.954078674316406, + "learning_rate": 1.0446025859476798e-05, + "loss": 1.6187, + "step": 4767 + }, + { + "epoch": 1.43, + "grad_norm": 46.83697509765625, + "learning_rate": 1.0444021248872406e-05, + "loss": 2.2134, + "step": 4768 + }, + { + "epoch": 1.43, + "grad_norm": 29.12272834777832, + "learning_rate": 1.0442016638268018e-05, + "loss": 2.3703, + "step": 4769 + }, + { + "epoch": 1.43, + "grad_norm": 37.321956634521484, + "learning_rate": 1.0440012027663626e-05, + "loss": 1.2239, + "step": 4770 + }, + { + "epoch": 1.43, + "grad_norm": 18.08650779724121, + "learning_rate": 1.0438007417059236e-05, + "loss": 1.4429, + "step": 4771 + }, + { + "epoch": 1.43, + "grad_norm": 15.265766143798828, + "learning_rate": 1.0436002806454848e-05, + "loss": 1.2304, + "step": 4772 + }, + { + "epoch": 1.44, + "grad_norm": 16.230012893676758, + "learning_rate": 1.0433998195850457e-05, + "loss": 1.2576, + "step": 4773 + }, + { + "epoch": 1.44, + "grad_norm": 27.102861404418945, + "learning_rate": 1.0431993585246067e-05, + "loss": 2.3902, + "step": 4774 + }, + { + "epoch": 1.44, + "grad_norm": 9.05881404876709, + "learning_rate": 1.0429988974641677e-05, + "loss": 0.8901, + "step": 4775 + }, + { + "epoch": 1.44, + "grad_norm": 24.15192985534668, + "learning_rate": 1.0427984364037287e-05, + "loss": 1.4729, + "step": 4776 + }, + { + "epoch": 1.44, + "grad_norm": 11.738104820251465, + "learning_rate": 1.0425979753432895e-05, + "loss": 1.5979, + "step": 4777 + }, + { + "epoch": 1.44, + "grad_norm": 17.82452392578125, + "learning_rate": 1.0423975142828507e-05, + "loss": 1.2219, + "step": 4778 + }, + { + "epoch": 1.44, + "grad_norm": 10.79785442352295, + "learning_rate": 1.0421970532224117e-05, + "loss": 1.1237, + "step": 4779 + }, + { + "epoch": 1.44, + "grad_norm": 66.31954193115234, + "learning_rate": 1.0419965921619725e-05, + "loss": 1.1348, + "step": 4780 + }, + { + "epoch": 1.44, + "grad_norm": 14.166802406311035, + "learning_rate": 1.0417961311015337e-05, + "loss": 1.5118, + "step": 4781 + }, + { + "epoch": 1.44, + "grad_norm": 19.842845916748047, + "learning_rate": 1.0415956700410946e-05, + "loss": 1.22, + "step": 4782 + }, + { + "epoch": 1.44, + "grad_norm": 19.89260482788086, + "learning_rate": 1.0413952089806556e-05, + "loss": 1.4466, + "step": 4783 + }, + { + "epoch": 1.44, + "grad_norm": 8.727401733398438, + "learning_rate": 1.0411947479202167e-05, + "loss": 1.2331, + "step": 4784 + }, + { + "epoch": 1.44, + "grad_norm": 16.119182586669922, + "learning_rate": 1.0409942868597776e-05, + "loss": 2.1112, + "step": 4785 + }, + { + "epoch": 1.44, + "grad_norm": 8.426682472229004, + "learning_rate": 1.0407938257993386e-05, + "loss": 0.7954, + "step": 4786 + }, + { + "epoch": 1.44, + "grad_norm": 11.057311058044434, + "learning_rate": 1.0405933647388994e-05, + "loss": 1.4438, + "step": 4787 + }, + { + "epoch": 1.44, + "grad_norm": 11.847160339355469, + "learning_rate": 1.0403929036784606e-05, + "loss": 1.6443, + "step": 4788 + }, + { + "epoch": 1.44, + "grad_norm": 52.34111022949219, + "learning_rate": 1.0401924426180214e-05, + "loss": 1.8, + "step": 4789 + }, + { + "epoch": 1.44, + "grad_norm": 12.100299835205078, + "learning_rate": 1.0399919815575825e-05, + "loss": 0.9117, + "step": 4790 + }, + { + "epoch": 1.44, + "grad_norm": 25.9334774017334, + "learning_rate": 1.0397915204971436e-05, + "loss": 1.6836, + "step": 4791 + }, + { + "epoch": 1.44, + "grad_norm": 16.446218490600586, + "learning_rate": 1.0395910594367045e-05, + "loss": 1.581, + "step": 4792 + }, + { + "epoch": 1.44, + "grad_norm": 16.541593551635742, + "learning_rate": 1.0393905983762655e-05, + "loss": 1.6899, + "step": 4793 + }, + { + "epoch": 1.44, + "grad_norm": 14.012060165405273, + "learning_rate": 1.0391901373158265e-05, + "loss": 1.3736, + "step": 4794 + }, + { + "epoch": 1.44, + "grad_norm": 18.858366012573242, + "learning_rate": 1.0389896762553875e-05, + "loss": 1.2005, + "step": 4795 + }, + { + "epoch": 1.44, + "grad_norm": 11.462150573730469, + "learning_rate": 1.0387892151949485e-05, + "loss": 1.0013, + "step": 4796 + }, + { + "epoch": 1.44, + "grad_norm": 13.834492683410645, + "learning_rate": 1.0385887541345095e-05, + "loss": 0.8818, + "step": 4797 + }, + { + "epoch": 1.44, + "grad_norm": 44.42348861694336, + "learning_rate": 1.0383882930740705e-05, + "loss": 2.2629, + "step": 4798 + }, + { + "epoch": 1.44, + "grad_norm": 21.56435775756836, + "learning_rate": 1.0381878320136314e-05, + "loss": 2.646, + "step": 4799 + }, + { + "epoch": 1.44, + "grad_norm": 12.599276542663574, + "learning_rate": 1.0379873709531925e-05, + "loss": 1.2467, + "step": 4800 + }, + { + "epoch": 1.44, + "eval_loss": 0.20445984601974487, + "eval_runtime": 43.0646, + "eval_samples_per_second": 34.344, + "eval_steps_per_second": 34.344, + "step": 4800 + }, + { + "epoch": 1.44, + "grad_norm": 21.611980438232422, + "learning_rate": 1.0377869098927534e-05, + "loss": 1.8799, + "step": 4801 + }, + { + "epoch": 1.44, + "grad_norm": 26.111543655395508, + "learning_rate": 1.0375864488323144e-05, + "loss": 1.8642, + "step": 4802 + }, + { + "epoch": 1.44, + "grad_norm": 19.41090965270996, + "learning_rate": 1.0373859877718756e-05, + "loss": 1.8098, + "step": 4803 + }, + { + "epoch": 1.44, + "grad_norm": 23.62296485900879, + "learning_rate": 1.0371855267114364e-05, + "loss": 2.0232, + "step": 4804 + }, + { + "epoch": 1.44, + "grad_norm": 25.114355087280273, + "learning_rate": 1.0369850656509974e-05, + "loss": 1.348, + "step": 4805 + }, + { + "epoch": 1.44, + "grad_norm": 15.635001182556152, + "learning_rate": 1.0367846045905583e-05, + "loss": 1.4803, + "step": 4806 + }, + { + "epoch": 1.45, + "grad_norm": 12.778668403625488, + "learning_rate": 1.0365841435301194e-05, + "loss": 1.837, + "step": 4807 + }, + { + "epoch": 1.45, + "grad_norm": 19.424400329589844, + "learning_rate": 1.0363836824696803e-05, + "loss": 1.6641, + "step": 4808 + }, + { + "epoch": 1.45, + "grad_norm": 9.217851638793945, + "learning_rate": 1.0361832214092413e-05, + "loss": 0.925, + "step": 4809 + }, + { + "epoch": 1.45, + "grad_norm": 26.674348831176758, + "learning_rate": 1.0359827603488025e-05, + "loss": 1.5977, + "step": 4810 + }, + { + "epoch": 1.45, + "grad_norm": 13.42351245880127, + "learning_rate": 1.0357822992883633e-05, + "loss": 1.3927, + "step": 4811 + }, + { + "epoch": 1.45, + "grad_norm": 36.85655975341797, + "learning_rate": 1.0355818382279243e-05, + "loss": 1.8373, + "step": 4812 + }, + { + "epoch": 1.45, + "grad_norm": 49.87083435058594, + "learning_rate": 1.0353813771674851e-05, + "loss": 1.6954, + "step": 4813 + }, + { + "epoch": 1.45, + "grad_norm": 30.802719116210938, + "learning_rate": 1.0351809161070463e-05, + "loss": 2.0961, + "step": 4814 + }, + { + "epoch": 1.45, + "grad_norm": 22.26266860961914, + "learning_rate": 1.0349804550466073e-05, + "loss": 1.3555, + "step": 4815 + }, + { + "epoch": 1.45, + "grad_norm": 18.78736114501953, + "learning_rate": 1.0347799939861682e-05, + "loss": 1.4008, + "step": 4816 + }, + { + "epoch": 1.45, + "grad_norm": 10.189375877380371, + "learning_rate": 1.0345795329257293e-05, + "loss": 0.6236, + "step": 4817 + }, + { + "epoch": 1.45, + "grad_norm": 14.57485580444336, + "learning_rate": 1.0343790718652902e-05, + "loss": 1.3009, + "step": 4818 + }, + { + "epoch": 1.45, + "grad_norm": 16.516605377197266, + "learning_rate": 1.0341786108048512e-05, + "loss": 1.3581, + "step": 4819 + }, + { + "epoch": 1.45, + "grad_norm": 12.826788902282715, + "learning_rate": 1.0339781497444122e-05, + "loss": 1.9941, + "step": 4820 + }, + { + "epoch": 1.45, + "grad_norm": 43.45224380493164, + "learning_rate": 1.0337776886839732e-05, + "loss": 1.8419, + "step": 4821 + }, + { + "epoch": 1.45, + "grad_norm": 33.118247985839844, + "learning_rate": 1.0335772276235342e-05, + "loss": 1.7156, + "step": 4822 + }, + { + "epoch": 1.45, + "grad_norm": 29.060731887817383, + "learning_rate": 1.0333767665630952e-05, + "loss": 1.5526, + "step": 4823 + }, + { + "epoch": 1.45, + "grad_norm": 30.88375473022461, + "learning_rate": 1.0331763055026562e-05, + "loss": 1.9801, + "step": 4824 + }, + { + "epoch": 1.45, + "grad_norm": 26.168588638305664, + "learning_rate": 1.032975844442217e-05, + "loss": 1.63, + "step": 4825 + }, + { + "epoch": 1.45, + "grad_norm": 20.537208557128906, + "learning_rate": 1.0327753833817783e-05, + "loss": 1.582, + "step": 4826 + }, + { + "epoch": 1.45, + "grad_norm": 15.51192855834961, + "learning_rate": 1.0325749223213393e-05, + "loss": 2.2813, + "step": 4827 + }, + { + "epoch": 1.45, + "grad_norm": 18.50126838684082, + "learning_rate": 1.0323744612609001e-05, + "loss": 1.5379, + "step": 4828 + }, + { + "epoch": 1.45, + "grad_norm": 19.359989166259766, + "learning_rate": 1.0321740002004613e-05, + "loss": 1.4196, + "step": 4829 + }, + { + "epoch": 1.45, + "grad_norm": 71.47821807861328, + "learning_rate": 1.0319735391400221e-05, + "loss": 2.627, + "step": 4830 + }, + { + "epoch": 1.45, + "grad_norm": 8.43783950805664, + "learning_rate": 1.0317730780795831e-05, + "loss": 1.1434, + "step": 4831 + }, + { + "epoch": 1.45, + "grad_norm": 49.501930236816406, + "learning_rate": 1.031572617019144e-05, + "loss": 2.0673, + "step": 4832 + }, + { + "epoch": 1.45, + "grad_norm": 11.1619234085083, + "learning_rate": 1.0313721559587051e-05, + "loss": 1.8033, + "step": 4833 + }, + { + "epoch": 1.45, + "grad_norm": 7.6200408935546875, + "learning_rate": 1.0311716948982662e-05, + "loss": 0.8928, + "step": 4834 + }, + { + "epoch": 1.45, + "grad_norm": 19.349905014038086, + "learning_rate": 1.030971233837827e-05, + "loss": 1.6486, + "step": 4835 + }, + { + "epoch": 1.45, + "grad_norm": 8.616739273071289, + "learning_rate": 1.0307707727773882e-05, + "loss": 1.2562, + "step": 4836 + }, + { + "epoch": 1.45, + "grad_norm": 30.071773529052734, + "learning_rate": 1.030570311716949e-05, + "loss": 1.4047, + "step": 4837 + }, + { + "epoch": 1.45, + "grad_norm": 12.02312183380127, + "learning_rate": 1.03036985065651e-05, + "loss": 1.5015, + "step": 4838 + }, + { + "epoch": 1.45, + "grad_norm": 13.285801887512207, + "learning_rate": 1.0301693895960712e-05, + "loss": 1.4643, + "step": 4839 + }, + { + "epoch": 1.46, + "grad_norm": 20.811077117919922, + "learning_rate": 1.029968928535632e-05, + "loss": 1.8669, + "step": 4840 + }, + { + "epoch": 1.46, + "grad_norm": 21.803550720214844, + "learning_rate": 1.029768467475193e-05, + "loss": 2.0773, + "step": 4841 + }, + { + "epoch": 1.46, + "grad_norm": 35.751792907714844, + "learning_rate": 1.0295680064147539e-05, + "loss": 2.3893, + "step": 4842 + }, + { + "epoch": 1.46, + "grad_norm": 14.943572044372559, + "learning_rate": 1.029367545354315e-05, + "loss": 1.9117, + "step": 4843 + }, + { + "epoch": 1.46, + "grad_norm": 19.128864288330078, + "learning_rate": 1.0291670842938759e-05, + "loss": 1.9138, + "step": 4844 + }, + { + "epoch": 1.46, + "grad_norm": 17.63079833984375, + "learning_rate": 1.028966623233437e-05, + "loss": 1.2719, + "step": 4845 + }, + { + "epoch": 1.46, + "grad_norm": 40.6061897277832, + "learning_rate": 1.028766162172998e-05, + "loss": 1.8636, + "step": 4846 + }, + { + "epoch": 1.46, + "grad_norm": 11.293282508850098, + "learning_rate": 1.028565701112559e-05, + "loss": 1.1617, + "step": 4847 + }, + { + "epoch": 1.46, + "grad_norm": 7.563941955566406, + "learning_rate": 1.0283652400521201e-05, + "loss": 1.266, + "step": 4848 + }, + { + "epoch": 1.46, + "grad_norm": 37.318321228027344, + "learning_rate": 1.028164778991681e-05, + "loss": 4.1312, + "step": 4849 + }, + { + "epoch": 1.46, + "grad_norm": 15.308874130249023, + "learning_rate": 1.027964317931242e-05, + "loss": 1.563, + "step": 4850 + }, + { + "epoch": 1.46, + "grad_norm": 12.4258451461792, + "learning_rate": 1.0277638568708028e-05, + "loss": 1.7674, + "step": 4851 + }, + { + "epoch": 1.46, + "grad_norm": 12.700484275817871, + "learning_rate": 1.027563395810364e-05, + "loss": 1.2176, + "step": 4852 + }, + { + "epoch": 1.46, + "grad_norm": 26.99094581604004, + "learning_rate": 1.027362934749925e-05, + "loss": 1.5912, + "step": 4853 + }, + { + "epoch": 1.46, + "grad_norm": 52.29587936401367, + "learning_rate": 1.0271624736894858e-05, + "loss": 2.6716, + "step": 4854 + }, + { + "epoch": 1.46, + "grad_norm": 28.816272735595703, + "learning_rate": 1.026962012629047e-05, + "loss": 2.7605, + "step": 4855 + }, + { + "epoch": 1.46, + "grad_norm": 28.791126251220703, + "learning_rate": 1.0267615515686078e-05, + "loss": 1.8248, + "step": 4856 + }, + { + "epoch": 1.46, + "grad_norm": 17.190807342529297, + "learning_rate": 1.0265610905081688e-05, + "loss": 1.6728, + "step": 4857 + }, + { + "epoch": 1.46, + "grad_norm": 17.33432960510254, + "learning_rate": 1.02636062944773e-05, + "loss": 1.427, + "step": 4858 + }, + { + "epoch": 1.46, + "grad_norm": 18.81448745727539, + "learning_rate": 1.0261601683872909e-05, + "loss": 1.4079, + "step": 4859 + }, + { + "epoch": 1.46, + "grad_norm": 26.18571662902832, + "learning_rate": 1.0259597073268519e-05, + "loss": 1.3066, + "step": 4860 + }, + { + "epoch": 1.46, + "grad_norm": 22.120250701904297, + "learning_rate": 1.0257592462664127e-05, + "loss": 1.8271, + "step": 4861 + }, + { + "epoch": 1.46, + "grad_norm": 19.113744735717773, + "learning_rate": 1.0255587852059739e-05, + "loss": 0.9674, + "step": 4862 + }, + { + "epoch": 1.46, + "grad_norm": 15.18375301361084, + "learning_rate": 1.0253583241455347e-05, + "loss": 1.3229, + "step": 4863 + }, + { + "epoch": 1.46, + "grad_norm": 14.799633979797363, + "learning_rate": 1.0251578630850957e-05, + "loss": 1.1113, + "step": 4864 + }, + { + "epoch": 1.46, + "grad_norm": 13.80958366394043, + "learning_rate": 1.0249574020246569e-05, + "loss": 1.6729, + "step": 4865 + }, + { + "epoch": 1.46, + "grad_norm": 38.37821578979492, + "learning_rate": 1.0247569409642177e-05, + "loss": 1.5533, + "step": 4866 + }, + { + "epoch": 1.46, + "grad_norm": 28.763893127441406, + "learning_rate": 1.0245564799037788e-05, + "loss": 1.699, + "step": 4867 + }, + { + "epoch": 1.46, + "grad_norm": 20.876445770263672, + "learning_rate": 1.0243560188433398e-05, + "loss": 1.3333, + "step": 4868 + }, + { + "epoch": 1.46, + "grad_norm": 52.730857849121094, + "learning_rate": 1.0241555577829008e-05, + "loss": 1.4987, + "step": 4869 + }, + { + "epoch": 1.46, + "grad_norm": 13.893447875976562, + "learning_rate": 1.0239550967224618e-05, + "loss": 1.6037, + "step": 4870 + }, + { + "epoch": 1.46, + "grad_norm": 28.038808822631836, + "learning_rate": 1.0237546356620228e-05, + "loss": 3.0782, + "step": 4871 + }, + { + "epoch": 1.46, + "grad_norm": 26.272857666015625, + "learning_rate": 1.0235541746015838e-05, + "loss": 1.7294, + "step": 4872 + }, + { + "epoch": 1.47, + "grad_norm": 87.54938507080078, + "learning_rate": 1.0233537135411446e-05, + "loss": 2.8123, + "step": 4873 + }, + { + "epoch": 1.47, + "grad_norm": 25.847455978393555, + "learning_rate": 1.0231532524807058e-05, + "loss": 1.3259, + "step": 4874 + }, + { + "epoch": 1.47, + "grad_norm": 10.328527450561523, + "learning_rate": 1.0229527914202666e-05, + "loss": 0.9254, + "step": 4875 + }, + { + "epoch": 1.47, + "grad_norm": 19.659297943115234, + "learning_rate": 1.0227523303598277e-05, + "loss": 1.1272, + "step": 4876 + }, + { + "epoch": 1.47, + "grad_norm": 25.877544403076172, + "learning_rate": 1.0225518692993888e-05, + "loss": 1.8762, + "step": 4877 + }, + { + "epoch": 1.47, + "grad_norm": 22.409603118896484, + "learning_rate": 1.0223514082389497e-05, + "loss": 2.1701, + "step": 4878 + }, + { + "epoch": 1.47, + "grad_norm": 48.738155364990234, + "learning_rate": 1.0221509471785107e-05, + "loss": 2.1954, + "step": 4879 + }, + { + "epoch": 1.47, + "grad_norm": 13.183887481689453, + "learning_rate": 1.0219504861180715e-05, + "loss": 1.5571, + "step": 4880 + }, + { + "epoch": 1.47, + "grad_norm": 18.5463809967041, + "learning_rate": 1.0217500250576327e-05, + "loss": 1.5617, + "step": 4881 + }, + { + "epoch": 1.47, + "grad_norm": 22.341135025024414, + "learning_rate": 1.0215495639971937e-05, + "loss": 1.0747, + "step": 4882 + }, + { + "epoch": 1.47, + "grad_norm": 10.601929664611816, + "learning_rate": 1.0213491029367545e-05, + "loss": 1.539, + "step": 4883 + }, + { + "epoch": 1.47, + "grad_norm": 12.355496406555176, + "learning_rate": 1.0211486418763157e-05, + "loss": 1.3754, + "step": 4884 + }, + { + "epoch": 1.47, + "grad_norm": 11.556707382202148, + "learning_rate": 1.0209481808158766e-05, + "loss": 1.5293, + "step": 4885 + }, + { + "epoch": 1.47, + "grad_norm": 19.798812866210938, + "learning_rate": 1.0207477197554376e-05, + "loss": 0.9688, + "step": 4886 + }, + { + "epoch": 1.47, + "grad_norm": 13.712343215942383, + "learning_rate": 1.0205472586949984e-05, + "loss": 1.5041, + "step": 4887 + }, + { + "epoch": 1.47, + "grad_norm": 10.897645950317383, + "learning_rate": 1.0203467976345596e-05, + "loss": 1.0905, + "step": 4888 + }, + { + "epoch": 1.47, + "grad_norm": 28.161396026611328, + "learning_rate": 1.0201463365741206e-05, + "loss": 1.7225, + "step": 4889 + }, + { + "epoch": 1.47, + "grad_norm": 34.16568374633789, + "learning_rate": 1.0199458755136814e-05, + "loss": 2.0862, + "step": 4890 + }, + { + "epoch": 1.47, + "grad_norm": 19.398710250854492, + "learning_rate": 1.0197454144532426e-05, + "loss": 1.5471, + "step": 4891 + }, + { + "epoch": 1.47, + "grad_norm": 24.547252655029297, + "learning_rate": 1.0195449533928035e-05, + "loss": 3.0927, + "step": 4892 + }, + { + "epoch": 1.47, + "grad_norm": 11.176749229431152, + "learning_rate": 1.0193444923323645e-05, + "loss": 1.1639, + "step": 4893 + }, + { + "epoch": 1.47, + "grad_norm": 50.76742172241211, + "learning_rate": 1.0191440312719255e-05, + "loss": 2.6678, + "step": 4894 + }, + { + "epoch": 1.47, + "grad_norm": 9.156387329101562, + "learning_rate": 1.0189435702114865e-05, + "loss": 0.9565, + "step": 4895 + }, + { + "epoch": 1.47, + "grad_norm": 23.076967239379883, + "learning_rate": 1.0187431091510475e-05, + "loss": 1.6497, + "step": 4896 + }, + { + "epoch": 1.47, + "grad_norm": 16.60446548461914, + "learning_rate": 1.0185426480906085e-05, + "loss": 1.5919, + "step": 4897 + }, + { + "epoch": 1.47, + "grad_norm": 12.027603149414062, + "learning_rate": 1.0183421870301695e-05, + "loss": 2.2724, + "step": 4898 + }, + { + "epoch": 1.47, + "grad_norm": 18.373804092407227, + "learning_rate": 1.0181417259697303e-05, + "loss": 0.8864, + "step": 4899 + }, + { + "epoch": 1.47, + "grad_norm": 33.86744689941406, + "learning_rate": 1.0179412649092915e-05, + "loss": 2.4096, + "step": 4900 + }, + { + "epoch": 1.47, + "grad_norm": 19.00617218017578, + "learning_rate": 1.0177408038488525e-05, + "loss": 1.4766, + "step": 4901 + }, + { + "epoch": 1.47, + "grad_norm": 10.875265121459961, + "learning_rate": 1.0175403427884134e-05, + "loss": 1.6248, + "step": 4902 + }, + { + "epoch": 1.47, + "grad_norm": 18.423274993896484, + "learning_rate": 1.0173398817279745e-05, + "loss": 2.1238, + "step": 4903 + }, + { + "epoch": 1.47, + "grad_norm": 14.276087760925293, + "learning_rate": 1.0171394206675354e-05, + "loss": 1.471, + "step": 4904 + }, + { + "epoch": 1.47, + "grad_norm": 8.41922664642334, + "learning_rate": 1.0169389596070964e-05, + "loss": 0.8771, + "step": 4905 + }, + { + "epoch": 1.48, + "grad_norm": 17.14394760131836, + "learning_rate": 1.0167384985466572e-05, + "loss": 1.0387, + "step": 4906 + }, + { + "epoch": 1.48, + "grad_norm": 20.45330047607422, + "learning_rate": 1.0165380374862184e-05, + "loss": 1.9674, + "step": 4907 + }, + { + "epoch": 1.48, + "grad_norm": 6.636265754699707, + "learning_rate": 1.0163375764257794e-05, + "loss": 0.3816, + "step": 4908 + }, + { + "epoch": 1.48, + "grad_norm": 15.284008979797363, + "learning_rate": 1.0161371153653403e-05, + "loss": 1.6652, + "step": 4909 + }, + { + "epoch": 1.48, + "grad_norm": 16.42159080505371, + "learning_rate": 1.0159366543049014e-05, + "loss": 2.3129, + "step": 4910 + }, + { + "epoch": 1.48, + "grad_norm": 19.10989761352539, + "learning_rate": 1.0157361932444623e-05, + "loss": 1.8581, + "step": 4911 + }, + { + "epoch": 1.48, + "grad_norm": 30.188232421875, + "learning_rate": 1.0155357321840233e-05, + "loss": 1.5959, + "step": 4912 + }, + { + "epoch": 1.48, + "grad_norm": 12.307696342468262, + "learning_rate": 1.0153352711235845e-05, + "loss": 1.6547, + "step": 4913 + }, + { + "epoch": 1.48, + "grad_norm": 12.962136268615723, + "learning_rate": 1.0151348100631453e-05, + "loss": 1.8137, + "step": 4914 + }, + { + "epoch": 1.48, + "grad_norm": 10.3368558883667, + "learning_rate": 1.0149343490027063e-05, + "loss": 0.9236, + "step": 4915 + }, + { + "epoch": 1.48, + "grad_norm": 20.91614532470703, + "learning_rate": 1.0147338879422673e-05, + "loss": 1.7155, + "step": 4916 + }, + { + "epoch": 1.48, + "grad_norm": 38.6691780090332, + "learning_rate": 1.0145334268818283e-05, + "loss": 2.1508, + "step": 4917 + }, + { + "epoch": 1.48, + "grad_norm": 9.685796737670898, + "learning_rate": 1.0143329658213892e-05, + "loss": 1.7707, + "step": 4918 + }, + { + "epoch": 1.48, + "grad_norm": 14.471162796020508, + "learning_rate": 1.0141325047609503e-05, + "loss": 1.5946, + "step": 4919 + }, + { + "epoch": 1.48, + "grad_norm": 18.22927474975586, + "learning_rate": 1.0139320437005114e-05, + "loss": 1.2375, + "step": 4920 + }, + { + "epoch": 1.48, + "eval_loss": 0.2065747231245041, + "eval_runtime": 43.2855, + "eval_samples_per_second": 34.169, + "eval_steps_per_second": 34.169, + "step": 4920 + }, + { + "epoch": 1.48, + "grad_norm": 51.773658752441406, + "learning_rate": 1.0137315826400722e-05, + "loss": 0.986, + "step": 4921 + }, + { + "epoch": 1.48, + "grad_norm": 105.13095092773438, + "learning_rate": 1.0135311215796334e-05, + "loss": 2.8983, + "step": 4922 + }, + { + "epoch": 1.48, + "grad_norm": 14.783707618713379, + "learning_rate": 1.0133306605191942e-05, + "loss": 1.9219, + "step": 4923 + }, + { + "epoch": 1.48, + "grad_norm": 8.431492805480957, + "learning_rate": 1.0131301994587552e-05, + "loss": 0.9575, + "step": 4924 + }, + { + "epoch": 1.48, + "grad_norm": 14.085335731506348, + "learning_rate": 1.0129297383983164e-05, + "loss": 1.2251, + "step": 4925 + }, + { + "epoch": 1.48, + "grad_norm": 17.102575302124023, + "learning_rate": 1.0127292773378772e-05, + "loss": 1.4003, + "step": 4926 + }, + { + "epoch": 1.48, + "grad_norm": 13.873455047607422, + "learning_rate": 1.0125288162774382e-05, + "loss": 1.3603, + "step": 4927 + }, + { + "epoch": 1.48, + "grad_norm": 8.495047569274902, + "learning_rate": 1.012328355216999e-05, + "loss": 1.8705, + "step": 4928 + }, + { + "epoch": 1.48, + "grad_norm": 12.480233192443848, + "learning_rate": 1.0121278941565603e-05, + "loss": 1.6846, + "step": 4929 + }, + { + "epoch": 1.48, + "grad_norm": 9.334129333496094, + "learning_rate": 1.0119274330961211e-05, + "loss": 1.924, + "step": 4930 + }, + { + "epoch": 1.48, + "grad_norm": 12.200242042541504, + "learning_rate": 1.0117269720356821e-05, + "loss": 1.4727, + "step": 4931 + }, + { + "epoch": 1.48, + "grad_norm": 10.241484642028809, + "learning_rate": 1.0115265109752433e-05, + "loss": 1.2527, + "step": 4932 + }, + { + "epoch": 1.48, + "grad_norm": 18.393444061279297, + "learning_rate": 1.0113260499148041e-05, + "loss": 1.9137, + "step": 4933 + }, + { + "epoch": 1.48, + "grad_norm": 30.996809005737305, + "learning_rate": 1.0111255888543651e-05, + "loss": 1.9626, + "step": 4934 + }, + { + "epoch": 1.48, + "grad_norm": 22.243946075439453, + "learning_rate": 1.010925127793926e-05, + "loss": 1.4565, + "step": 4935 + }, + { + "epoch": 1.48, + "grad_norm": 10.407564163208008, + "learning_rate": 1.0107246667334871e-05, + "loss": 1.0015, + "step": 4936 + }, + { + "epoch": 1.48, + "grad_norm": 17.139793395996094, + "learning_rate": 1.010524205673048e-05, + "loss": 1.3928, + "step": 4937 + }, + { + "epoch": 1.48, + "grad_norm": 34.3438606262207, + "learning_rate": 1.010323744612609e-05, + "loss": 1.5863, + "step": 4938 + }, + { + "epoch": 1.48, + "grad_norm": 14.947382926940918, + "learning_rate": 1.0101232835521702e-05, + "loss": 1.3544, + "step": 4939 + }, + { + "epoch": 1.49, + "grad_norm": 11.249651908874512, + "learning_rate": 1.009922822491731e-05, + "loss": 1.3051, + "step": 4940 + }, + { + "epoch": 1.49, + "grad_norm": 16.109973907470703, + "learning_rate": 1.009722361431292e-05, + "loss": 2.0421, + "step": 4941 + }, + { + "epoch": 1.49, + "grad_norm": 14.892985343933105, + "learning_rate": 1.009521900370853e-05, + "loss": 1.1327, + "step": 4942 + }, + { + "epoch": 1.49, + "grad_norm": 27.952659606933594, + "learning_rate": 1.009321439310414e-05, + "loss": 2.3613, + "step": 4943 + }, + { + "epoch": 1.49, + "grad_norm": 27.172393798828125, + "learning_rate": 1.009120978249975e-05, + "loss": 2.1473, + "step": 4944 + }, + { + "epoch": 1.49, + "grad_norm": 9.170857429504395, + "learning_rate": 1.008920517189536e-05, + "loss": 1.2239, + "step": 4945 + }, + { + "epoch": 1.49, + "grad_norm": 11.424710273742676, + "learning_rate": 1.008720056129097e-05, + "loss": 0.8955, + "step": 4946 + }, + { + "epoch": 1.49, + "grad_norm": 13.28514289855957, + "learning_rate": 1.0085195950686579e-05, + "loss": 1.6111, + "step": 4947 + }, + { + "epoch": 1.49, + "grad_norm": 32.51649475097656, + "learning_rate": 1.008319134008219e-05, + "loss": 1.9871, + "step": 4948 + }, + { + "epoch": 1.49, + "grad_norm": 15.511556625366211, + "learning_rate": 1.00811867294778e-05, + "loss": 1.37, + "step": 4949 + }, + { + "epoch": 1.49, + "grad_norm": 16.7358341217041, + "learning_rate": 1.007918211887341e-05, + "loss": 1.6897, + "step": 4950 + }, + { + "epoch": 1.49, + "grad_norm": 13.070718765258789, + "learning_rate": 1.0077177508269021e-05, + "loss": 0.666, + "step": 4951 + }, + { + "epoch": 1.49, + "grad_norm": 22.743227005004883, + "learning_rate": 1.007517289766463e-05, + "loss": 1.8756, + "step": 4952 + }, + { + "epoch": 1.49, + "grad_norm": 20.257579803466797, + "learning_rate": 1.007316828706024e-05, + "loss": 1.9218, + "step": 4953 + }, + { + "epoch": 1.49, + "grad_norm": 32.02715301513672, + "learning_rate": 1.0071163676455848e-05, + "loss": 1.2685, + "step": 4954 + }, + { + "epoch": 1.49, + "grad_norm": 15.203670501708984, + "learning_rate": 1.006915906585146e-05, + "loss": 1.5685, + "step": 4955 + }, + { + "epoch": 1.49, + "grad_norm": 7.677938461303711, + "learning_rate": 1.006715445524707e-05, + "loss": 1.0445, + "step": 4956 + }, + { + "epoch": 1.49, + "grad_norm": 24.447038650512695, + "learning_rate": 1.0065149844642678e-05, + "loss": 1.7323, + "step": 4957 + }, + { + "epoch": 1.49, + "grad_norm": 12.981391906738281, + "learning_rate": 1.006314523403829e-05, + "loss": 1.9296, + "step": 4958 + }, + { + "epoch": 1.49, + "grad_norm": 38.580570220947266, + "learning_rate": 1.0061140623433898e-05, + "loss": 1.909, + "step": 4959 + }, + { + "epoch": 1.49, + "grad_norm": 11.870725631713867, + "learning_rate": 1.0059136012829508e-05, + "loss": 1.3958, + "step": 4960 + }, + { + "epoch": 1.49, + "grad_norm": 7.389049530029297, + "learning_rate": 1.0057131402225117e-05, + "loss": 0.8741, + "step": 4961 + }, + { + "epoch": 1.49, + "grad_norm": 12.032258033752441, + "learning_rate": 1.0055126791620729e-05, + "loss": 1.4206, + "step": 4962 + }, + { + "epoch": 1.49, + "grad_norm": 22.42498207092285, + "learning_rate": 1.0053122181016339e-05, + "loss": 2.5246, + "step": 4963 + }, + { + "epoch": 1.49, + "grad_norm": 12.639444351196289, + "learning_rate": 1.0051117570411949e-05, + "loss": 1.8031, + "step": 4964 + }, + { + "epoch": 1.49, + "grad_norm": 33.977386474609375, + "learning_rate": 1.0049112959807559e-05, + "loss": 1.595, + "step": 4965 + }, + { + "epoch": 1.49, + "grad_norm": 13.614667892456055, + "learning_rate": 1.0047108349203167e-05, + "loss": 0.975, + "step": 4966 + }, + { + "epoch": 1.49, + "grad_norm": 18.450183868408203, + "learning_rate": 1.0045103738598779e-05, + "loss": 2.5177, + "step": 4967 + }, + { + "epoch": 1.49, + "grad_norm": 15.293846130371094, + "learning_rate": 1.0043099127994389e-05, + "loss": 1.472, + "step": 4968 + }, + { + "epoch": 1.49, + "grad_norm": 16.424936294555664, + "learning_rate": 1.0041094517389997e-05, + "loss": 1.477, + "step": 4969 + }, + { + "epoch": 1.49, + "grad_norm": 13.09062385559082, + "learning_rate": 1.003908990678561e-05, + "loss": 1.414, + "step": 4970 + }, + { + "epoch": 1.49, + "grad_norm": 21.331588745117188, + "learning_rate": 1.0037085296181218e-05, + "loss": 2.6522, + "step": 4971 + }, + { + "epoch": 1.49, + "grad_norm": 44.23732376098633, + "learning_rate": 1.0035080685576828e-05, + "loss": 1.6369, + "step": 4972 + }, + { + "epoch": 1.5, + "grad_norm": 13.58181095123291, + "learning_rate": 1.0033076074972436e-05, + "loss": 1.6649, + "step": 4973 + }, + { + "epoch": 1.5, + "grad_norm": 25.254730224609375, + "learning_rate": 1.0031071464368048e-05, + "loss": 2.0687, + "step": 4974 + }, + { + "epoch": 1.5, + "grad_norm": 10.236112594604492, + "learning_rate": 1.0029066853763658e-05, + "loss": 1.3942, + "step": 4975 + }, + { + "epoch": 1.5, + "grad_norm": 13.936944961547852, + "learning_rate": 1.0027062243159266e-05, + "loss": 2.3225, + "step": 4976 + }, + { + "epoch": 1.5, + "grad_norm": 11.376901626586914, + "learning_rate": 1.0025057632554878e-05, + "loss": 1.3294, + "step": 4977 + }, + { + "epoch": 1.5, + "grad_norm": 15.195333480834961, + "learning_rate": 1.0023053021950487e-05, + "loss": 2.3572, + "step": 4978 + }, + { + "epoch": 1.5, + "grad_norm": 35.94542694091797, + "learning_rate": 1.0021048411346097e-05, + "loss": 3.3368, + "step": 4979 + }, + { + "epoch": 1.5, + "grad_norm": 21.131452560424805, + "learning_rate": 1.0019043800741705e-05, + "loss": 1.618, + "step": 4980 + }, + { + "epoch": 1.5, + "grad_norm": 22.635162353515625, + "learning_rate": 1.0017039190137317e-05, + "loss": 2.0817, + "step": 4981 + }, + { + "epoch": 1.5, + "grad_norm": 22.40682601928711, + "learning_rate": 1.0015034579532927e-05, + "loss": 1.2649, + "step": 4982 + }, + { + "epoch": 1.5, + "grad_norm": 7.878605365753174, + "learning_rate": 1.0013029968928535e-05, + "loss": 1.3573, + "step": 4983 + }, + { + "epoch": 1.5, + "grad_norm": 19.825239181518555, + "learning_rate": 1.0011025358324147e-05, + "loss": 1.9824, + "step": 4984 + }, + { + "epoch": 1.5, + "grad_norm": 14.39383602142334, + "learning_rate": 1.0009020747719755e-05, + "loss": 1.8926, + "step": 4985 + }, + { + "epoch": 1.5, + "grad_norm": 21.175561904907227, + "learning_rate": 1.0007016137115366e-05, + "loss": 1.114, + "step": 4986 + }, + { + "epoch": 1.5, + "grad_norm": 17.892271041870117, + "learning_rate": 1.0005011526510977e-05, + "loss": 1.2682, + "step": 4987 + }, + { + "epoch": 1.5, + "grad_norm": 14.41032600402832, + "learning_rate": 1.0003006915906586e-05, + "loss": 1.3076, + "step": 4988 + }, + { + "epoch": 1.5, + "grad_norm": 10.802579879760742, + "learning_rate": 1.0001002305302196e-05, + "loss": 1.1787, + "step": 4989 + }, + { + "epoch": 1.5, + "grad_norm": 16.22612762451172, + "learning_rate": 9.998997694697806e-06, + "loss": 1.8036, + "step": 4990 + }, + { + "epoch": 1.5, + "grad_norm": 9.207633018493652, + "learning_rate": 9.996993084093416e-06, + "loss": 1.2887, + "step": 4991 + }, + { + "epoch": 1.5, + "grad_norm": 109.87674713134766, + "learning_rate": 9.994988473489026e-06, + "loss": 2.7811, + "step": 4992 + }, + { + "epoch": 1.5, + "grad_norm": 10.483904838562012, + "learning_rate": 9.992983862884636e-06, + "loss": 2.3548, + "step": 4993 + }, + { + "epoch": 1.5, + "grad_norm": 8.791914939880371, + "learning_rate": 9.990979252280245e-06, + "loss": 1.0692, + "step": 4994 + }, + { + "epoch": 1.5, + "grad_norm": 12.697139739990234, + "learning_rate": 9.988974641675855e-06, + "loss": 1.2917, + "step": 4995 + }, + { + "epoch": 1.5, + "grad_norm": 15.190261840820312, + "learning_rate": 9.986970031071466e-06, + "loss": 1.7266, + "step": 4996 + }, + { + "epoch": 1.5, + "grad_norm": 13.911056518554688, + "learning_rate": 9.984965420467075e-06, + "loss": 1.2805, + "step": 4997 + }, + { + "epoch": 1.5, + "grad_norm": 10.715677261352539, + "learning_rate": 9.982960809862685e-06, + "loss": 2.3035, + "step": 4998 + }, + { + "epoch": 1.5, + "grad_norm": 23.649269104003906, + "learning_rate": 9.980956199258295e-06, + "loss": 1.6413, + "step": 4999 + }, + { + "epoch": 1.5, + "grad_norm": 9.821858406066895, + "learning_rate": 9.978951588653905e-06, + "loss": 1.1268, + "step": 5000 + }, + { + "epoch": 1.5, + "grad_norm": 11.555136680603027, + "learning_rate": 9.976946978049515e-06, + "loss": 1.2356, + "step": 5001 + }, + { + "epoch": 1.5, + "grad_norm": 15.840431213378906, + "learning_rate": 9.974942367445125e-06, + "loss": 1.7111, + "step": 5002 + }, + { + "epoch": 1.5, + "grad_norm": 13.110334396362305, + "learning_rate": 9.972937756840735e-06, + "loss": 1.3668, + "step": 5003 + }, + { + "epoch": 1.5, + "grad_norm": 15.277511596679688, + "learning_rate": 9.970933146236345e-06, + "loss": 1.524, + "step": 5004 + }, + { + "epoch": 1.5, + "grad_norm": 22.332592010498047, + "learning_rate": 9.968928535631954e-06, + "loss": 1.8393, + "step": 5005 + }, + { + "epoch": 1.51, + "grad_norm": 10.41042709350586, + "learning_rate": 9.966923925027564e-06, + "loss": 0.9121, + "step": 5006 + }, + { + "epoch": 1.51, + "grad_norm": 17.307802200317383, + "learning_rate": 9.964919314423174e-06, + "loss": 1.2376, + "step": 5007 + }, + { + "epoch": 1.51, + "grad_norm": 28.895647048950195, + "learning_rate": 9.962914703818784e-06, + "loss": 2.9159, + "step": 5008 + }, + { + "epoch": 1.51, + "grad_norm": 54.32232666015625, + "learning_rate": 9.960910093214394e-06, + "loss": 2.1614, + "step": 5009 + }, + { + "epoch": 1.51, + "grad_norm": 13.556760787963867, + "learning_rate": 9.958905482610004e-06, + "loss": 0.914, + "step": 5010 + }, + { + "epoch": 1.51, + "grad_norm": 16.66999053955078, + "learning_rate": 9.956900872005614e-06, + "loss": 1.303, + "step": 5011 + }, + { + "epoch": 1.51, + "grad_norm": 23.729116439819336, + "learning_rate": 9.954896261401223e-06, + "loss": 1.1319, + "step": 5012 + }, + { + "epoch": 1.51, + "grad_norm": 14.170336723327637, + "learning_rate": 9.952891650796833e-06, + "loss": 1.2095, + "step": 5013 + }, + { + "epoch": 1.51, + "grad_norm": 8.873191833496094, + "learning_rate": 9.950887040192443e-06, + "loss": 1.0473, + "step": 5014 + }, + { + "epoch": 1.51, + "grad_norm": 19.656044006347656, + "learning_rate": 9.948882429588053e-06, + "loss": 1.9538, + "step": 5015 + }, + { + "epoch": 1.51, + "grad_norm": 7.630486011505127, + "learning_rate": 9.946877818983663e-06, + "loss": 1.2515, + "step": 5016 + }, + { + "epoch": 1.51, + "grad_norm": 15.626066207885742, + "learning_rate": 9.944873208379273e-06, + "loss": 1.7024, + "step": 5017 + }, + { + "epoch": 1.51, + "grad_norm": 11.820615768432617, + "learning_rate": 9.942868597774883e-06, + "loss": 1.5467, + "step": 5018 + }, + { + "epoch": 1.51, + "grad_norm": 26.351261138916016, + "learning_rate": 9.940863987170493e-06, + "loss": 1.0137, + "step": 5019 + }, + { + "epoch": 1.51, + "grad_norm": 15.984896659851074, + "learning_rate": 9.938859376566102e-06, + "loss": 1.2989, + "step": 5020 + }, + { + "epoch": 1.51, + "grad_norm": 23.20572853088379, + "learning_rate": 9.936854765961713e-06, + "loss": 1.8128, + "step": 5021 + }, + { + "epoch": 1.51, + "grad_norm": 12.120548248291016, + "learning_rate": 9.934850155357323e-06, + "loss": 1.295, + "step": 5022 + }, + { + "epoch": 1.51, + "grad_norm": 14.345947265625, + "learning_rate": 9.932845544752932e-06, + "loss": 1.0609, + "step": 5023 + }, + { + "epoch": 1.51, + "grad_norm": 10.384604454040527, + "learning_rate": 9.930840934148542e-06, + "loss": 0.8389, + "step": 5024 + }, + { + "epoch": 1.51, + "grad_norm": 50.97624588012695, + "learning_rate": 9.928836323544152e-06, + "loss": 2.2094, + "step": 5025 + }, + { + "epoch": 1.51, + "grad_norm": 15.930320739746094, + "learning_rate": 9.926831712939762e-06, + "loss": 1.0719, + "step": 5026 + }, + { + "epoch": 1.51, + "grad_norm": 15.929656982421875, + "learning_rate": 9.924827102335372e-06, + "loss": 1.4516, + "step": 5027 + }, + { + "epoch": 1.51, + "grad_norm": 20.207595825195312, + "learning_rate": 9.922822491730982e-06, + "loss": 1.122, + "step": 5028 + }, + { + "epoch": 1.51, + "grad_norm": 10.85169506072998, + "learning_rate": 9.920817881126592e-06, + "loss": 1.3796, + "step": 5029 + }, + { + "epoch": 1.51, + "grad_norm": 40.899234771728516, + "learning_rate": 9.918813270522202e-06, + "loss": 2.79, + "step": 5030 + }, + { + "epoch": 1.51, + "grad_norm": 77.47624206542969, + "learning_rate": 9.91680865991781e-06, + "loss": 2.6326, + "step": 5031 + }, + { + "epoch": 1.51, + "grad_norm": 11.343374252319336, + "learning_rate": 9.914804049313421e-06, + "loss": 1.2866, + "step": 5032 + }, + { + "epoch": 1.51, + "grad_norm": 35.656246185302734, + "learning_rate": 9.912799438709033e-06, + "loss": 2.6495, + "step": 5033 + }, + { + "epoch": 1.51, + "grad_norm": 17.106098175048828, + "learning_rate": 9.910794828104641e-06, + "loss": 1.4173, + "step": 5034 + }, + { + "epoch": 1.51, + "grad_norm": 25.03550148010254, + "learning_rate": 9.908790217500251e-06, + "loss": 1.5695, + "step": 5035 + }, + { + "epoch": 1.51, + "grad_norm": 14.62669563293457, + "learning_rate": 9.906785606895861e-06, + "loss": 1.6762, + "step": 5036 + }, + { + "epoch": 1.51, + "grad_norm": 98.68997192382812, + "learning_rate": 9.904780996291471e-06, + "loss": 3.2678, + "step": 5037 + }, + { + "epoch": 1.51, + "grad_norm": 10.68436050415039, + "learning_rate": 9.902776385687081e-06, + "loss": 1.6819, + "step": 5038 + }, + { + "epoch": 1.52, + "grad_norm": 17.10354232788086, + "learning_rate": 9.900771775082692e-06, + "loss": 1.5759, + "step": 5039 + }, + { + "epoch": 1.52, + "grad_norm": 24.183143615722656, + "learning_rate": 9.898767164478302e-06, + "loss": 1.51, + "step": 5040 + }, + { + "epoch": 1.52, + "eval_loss": 0.1983981430530548, + "eval_runtime": 43.7634, + "eval_samples_per_second": 33.795, + "eval_steps_per_second": 33.795, + "step": 5040 + }, + { + "epoch": 1.52, + "grad_norm": 11.218318939208984, + "learning_rate": 9.896762553873912e-06, + "loss": 2.6052, + "step": 5041 + }, + { + "epoch": 1.52, + "grad_norm": 28.35023307800293, + "learning_rate": 9.89475794326952e-06, + "loss": 1.5494, + "step": 5042 + }, + { + "epoch": 1.52, + "grad_norm": 39.44877624511719, + "learning_rate": 9.89275333266513e-06, + "loss": 2.2204, + "step": 5043 + }, + { + "epoch": 1.52, + "grad_norm": 16.99179458618164, + "learning_rate": 9.89074872206074e-06, + "loss": 1.9028, + "step": 5044 + }, + { + "epoch": 1.52, + "grad_norm": 22.564708709716797, + "learning_rate": 9.88874411145635e-06, + "loss": 2.0602, + "step": 5045 + }, + { + "epoch": 1.52, + "grad_norm": 10.778221130371094, + "learning_rate": 9.88673950085196e-06, + "loss": 1.071, + "step": 5046 + }, + { + "epoch": 1.52, + "grad_norm": 25.14896583557129, + "learning_rate": 9.88473489024757e-06, + "loss": 1.4653, + "step": 5047 + }, + { + "epoch": 1.52, + "grad_norm": 24.445661544799805, + "learning_rate": 9.88273027964318e-06, + "loss": 1.2668, + "step": 5048 + }, + { + "epoch": 1.52, + "grad_norm": 19.382781982421875, + "learning_rate": 9.88072566903879e-06, + "loss": 1.6534, + "step": 5049 + }, + { + "epoch": 1.52, + "grad_norm": 48.97084426879883, + "learning_rate": 9.878721058434399e-06, + "loss": 1.6285, + "step": 5050 + }, + { + "epoch": 1.52, + "grad_norm": 23.401174545288086, + "learning_rate": 9.87671644783001e-06, + "loss": 2.24, + "step": 5051 + }, + { + "epoch": 1.52, + "grad_norm": 7.404333591461182, + "learning_rate": 9.874711837225621e-06, + "loss": 0.8004, + "step": 5052 + }, + { + "epoch": 1.52, + "grad_norm": 14.808720588684082, + "learning_rate": 9.87270722662123e-06, + "loss": 1.193, + "step": 5053 + }, + { + "epoch": 1.52, + "grad_norm": 11.823633193969727, + "learning_rate": 9.87070261601684e-06, + "loss": 1.1512, + "step": 5054 + }, + { + "epoch": 1.52, + "grad_norm": 20.415996551513672, + "learning_rate": 9.86869800541245e-06, + "loss": 1.7406, + "step": 5055 + }, + { + "epoch": 1.52, + "grad_norm": 584.0230712890625, + "learning_rate": 9.86669339480806e-06, + "loss": 1.3769, + "step": 5056 + }, + { + "epoch": 1.52, + "grad_norm": 14.067730903625488, + "learning_rate": 9.864688784203668e-06, + "loss": 1.1319, + "step": 5057 + }, + { + "epoch": 1.52, + "grad_norm": 27.009614944458008, + "learning_rate": 9.86268417359928e-06, + "loss": 1.6301, + "step": 5058 + }, + { + "epoch": 1.52, + "grad_norm": 12.77624225616455, + "learning_rate": 9.86067956299489e-06, + "loss": 1.4832, + "step": 5059 + }, + { + "epoch": 1.52, + "grad_norm": 14.48194408416748, + "learning_rate": 9.858674952390498e-06, + "loss": 1.3063, + "step": 5060 + }, + { + "epoch": 1.52, + "grad_norm": 23.552724838256836, + "learning_rate": 9.856670341786108e-06, + "loss": 1.726, + "step": 5061 + }, + { + "epoch": 1.52, + "grad_norm": 35.44766616821289, + "learning_rate": 9.854665731181718e-06, + "loss": 1.7173, + "step": 5062 + }, + { + "epoch": 1.52, + "grad_norm": 17.609161376953125, + "learning_rate": 9.852661120577328e-06, + "loss": 1.2193, + "step": 5063 + }, + { + "epoch": 1.52, + "grad_norm": 13.460494041442871, + "learning_rate": 9.850656509972939e-06, + "loss": 1.2271, + "step": 5064 + }, + { + "epoch": 1.52, + "grad_norm": 19.18136215209961, + "learning_rate": 9.848651899368549e-06, + "loss": 1.0003, + "step": 5065 + }, + { + "epoch": 1.52, + "grad_norm": 33.11497497558594, + "learning_rate": 9.846647288764159e-06, + "loss": 1.9836, + "step": 5066 + }, + { + "epoch": 1.52, + "grad_norm": 17.282258987426758, + "learning_rate": 9.844642678159769e-06, + "loss": 1.7075, + "step": 5067 + }, + { + "epoch": 1.52, + "grad_norm": 12.306963920593262, + "learning_rate": 9.842638067555377e-06, + "loss": 0.9739, + "step": 5068 + }, + { + "epoch": 1.52, + "grad_norm": 18.901031494140625, + "learning_rate": 9.840633456950987e-06, + "loss": 1.4667, + "step": 5069 + }, + { + "epoch": 1.52, + "grad_norm": 22.379085540771484, + "learning_rate": 9.838628846346599e-06, + "loss": 1.1675, + "step": 5070 + }, + { + "epoch": 1.52, + "grad_norm": 14.647932052612305, + "learning_rate": 9.836624235742207e-06, + "loss": 1.809, + "step": 5071 + }, + { + "epoch": 1.52, + "grad_norm": 8.400578498840332, + "learning_rate": 9.834619625137818e-06, + "loss": 1.8459, + "step": 5072 + }, + { + "epoch": 1.53, + "grad_norm": 14.352058410644531, + "learning_rate": 9.832615014533428e-06, + "loss": 1.6325, + "step": 5073 + }, + { + "epoch": 1.53, + "grad_norm": 38.508724212646484, + "learning_rate": 9.830610403929038e-06, + "loss": 1.4109, + "step": 5074 + }, + { + "epoch": 1.53, + "grad_norm": 24.62671661376953, + "learning_rate": 9.828605793324648e-06, + "loss": 2.3143, + "step": 5075 + }, + { + "epoch": 1.53, + "grad_norm": 32.014671325683594, + "learning_rate": 9.826601182720258e-06, + "loss": 2.2224, + "step": 5076 + }, + { + "epoch": 1.53, + "grad_norm": 16.45624351501465, + "learning_rate": 9.824596572115868e-06, + "loss": 1.3855, + "step": 5077 + }, + { + "epoch": 1.53, + "grad_norm": 24.989152908325195, + "learning_rate": 9.822591961511478e-06, + "loss": 1.8901, + "step": 5078 + }, + { + "epoch": 1.53, + "grad_norm": 16.87116241455078, + "learning_rate": 9.820587350907086e-06, + "loss": 2.1003, + "step": 5079 + }, + { + "epoch": 1.53, + "grad_norm": 19.88626480102539, + "learning_rate": 9.818582740302697e-06, + "loss": 0.9223, + "step": 5080 + }, + { + "epoch": 1.53, + "grad_norm": 29.52408218383789, + "learning_rate": 9.816578129698307e-06, + "loss": 2.1227, + "step": 5081 + }, + { + "epoch": 1.53, + "grad_norm": 6.786428451538086, + "learning_rate": 9.814573519093917e-06, + "loss": 1.1044, + "step": 5082 + }, + { + "epoch": 1.53, + "grad_norm": 11.187318801879883, + "learning_rate": 9.812568908489527e-06, + "loss": 1.0142, + "step": 5083 + }, + { + "epoch": 1.53, + "grad_norm": 9.082967758178711, + "learning_rate": 9.810564297885137e-06, + "loss": 0.8634, + "step": 5084 + }, + { + "epoch": 1.53, + "grad_norm": 38.15378189086914, + "learning_rate": 9.808559687280747e-06, + "loss": 1.4613, + "step": 5085 + }, + { + "epoch": 1.53, + "grad_norm": 11.708313941955566, + "learning_rate": 9.806555076676357e-06, + "loss": 2.1042, + "step": 5086 + }, + { + "epoch": 1.53, + "grad_norm": 15.158965110778809, + "learning_rate": 9.804550466071965e-06, + "loss": 1.2576, + "step": 5087 + }, + { + "epoch": 1.53, + "grad_norm": 15.897801399230957, + "learning_rate": 9.802545855467577e-06, + "loss": 1.5028, + "step": 5088 + }, + { + "epoch": 1.53, + "grad_norm": 58.225669860839844, + "learning_rate": 9.800541244863187e-06, + "loss": 1.0376, + "step": 5089 + }, + { + "epoch": 1.53, + "grad_norm": 26.382015228271484, + "learning_rate": 9.798536634258796e-06, + "loss": 1.2807, + "step": 5090 + }, + { + "epoch": 1.53, + "grad_norm": 11.506767272949219, + "learning_rate": 9.796532023654406e-06, + "loss": 1.4462, + "step": 5091 + }, + { + "epoch": 1.53, + "grad_norm": 36.81036376953125, + "learning_rate": 9.794527413050016e-06, + "loss": 2.7945, + "step": 5092 + }, + { + "epoch": 1.53, + "grad_norm": 13.376030921936035, + "learning_rate": 9.792522802445626e-06, + "loss": 1.562, + "step": 5093 + }, + { + "epoch": 1.53, + "grad_norm": 17.786754608154297, + "learning_rate": 9.790518191841234e-06, + "loss": 2.5748, + "step": 5094 + }, + { + "epoch": 1.53, + "grad_norm": 22.214862823486328, + "learning_rate": 9.788513581236846e-06, + "loss": 1.8133, + "step": 5095 + }, + { + "epoch": 1.53, + "grad_norm": 7.965408802032471, + "learning_rate": 9.786508970632456e-06, + "loss": 1.2572, + "step": 5096 + }, + { + "epoch": 1.53, + "grad_norm": 16.189109802246094, + "learning_rate": 9.784504360028065e-06, + "loss": 1.1491, + "step": 5097 + }, + { + "epoch": 1.53, + "grad_norm": 21.910438537597656, + "learning_rate": 9.782499749423675e-06, + "loss": 1.8276, + "step": 5098 + }, + { + "epoch": 1.53, + "grad_norm": 29.36257553100586, + "learning_rate": 9.780495138819285e-06, + "loss": 1.69, + "step": 5099 + }, + { + "epoch": 1.53, + "grad_norm": 15.409547805786133, + "learning_rate": 9.778490528214895e-06, + "loss": 1.83, + "step": 5100 + }, + { + "epoch": 1.53, + "grad_norm": 12.414823532104492, + "learning_rate": 9.776485917610505e-06, + "loss": 1.3998, + "step": 5101 + }, + { + "epoch": 1.53, + "grad_norm": 25.707353591918945, + "learning_rate": 9.774481307006115e-06, + "loss": 2.12, + "step": 5102 + }, + { + "epoch": 1.53, + "grad_norm": 43.882991790771484, + "learning_rate": 9.772476696401725e-06, + "loss": 1.7967, + "step": 5103 + }, + { + "epoch": 1.53, + "grad_norm": 12.999314308166504, + "learning_rate": 9.770472085797335e-06, + "loss": 1.1704, + "step": 5104 + }, + { + "epoch": 1.53, + "grad_norm": 30.975318908691406, + "learning_rate": 9.768467475192944e-06, + "loss": 2.4516, + "step": 5105 + }, + { + "epoch": 1.54, + "grad_norm": 10.727395057678223, + "learning_rate": 9.766462864588554e-06, + "loss": 0.5141, + "step": 5106 + }, + { + "epoch": 1.54, + "grad_norm": 16.37921142578125, + "learning_rate": 9.764458253984165e-06, + "loss": 1.9043, + "step": 5107 + }, + { + "epoch": 1.54, + "grad_norm": 11.777350425720215, + "learning_rate": 9.762453643379774e-06, + "loss": 1.3563, + "step": 5108 + }, + { + "epoch": 1.54, + "grad_norm": 10.583935737609863, + "learning_rate": 9.760449032775384e-06, + "loss": 1.3274, + "step": 5109 + }, + { + "epoch": 1.54, + "grad_norm": 24.881576538085938, + "learning_rate": 9.758444422170994e-06, + "loss": 1.0478, + "step": 5110 + }, + { + "epoch": 1.54, + "grad_norm": 24.082473754882812, + "learning_rate": 9.756439811566604e-06, + "loss": 0.8794, + "step": 5111 + }, + { + "epoch": 1.54, + "grad_norm": 15.496257781982422, + "learning_rate": 9.754435200962214e-06, + "loss": 0.8897, + "step": 5112 + }, + { + "epoch": 1.54, + "grad_norm": 27.82330894470215, + "learning_rate": 9.752430590357824e-06, + "loss": 2.5503, + "step": 5113 + }, + { + "epoch": 1.54, + "grad_norm": 14.794507026672363, + "learning_rate": 9.750425979753434e-06, + "loss": 2.2607, + "step": 5114 + }, + { + "epoch": 1.54, + "grad_norm": 25.15207862854004, + "learning_rate": 9.748421369149044e-06, + "loss": 1.2937, + "step": 5115 + }, + { + "epoch": 1.54, + "grad_norm": 28.609935760498047, + "learning_rate": 9.746416758544653e-06, + "loss": 1.8648, + "step": 5116 + }, + { + "epoch": 1.54, + "grad_norm": 19.665058135986328, + "learning_rate": 9.744412147940263e-06, + "loss": 1.3485, + "step": 5117 + }, + { + "epoch": 1.54, + "grad_norm": 16.362789154052734, + "learning_rate": 9.742407537335873e-06, + "loss": 1.6568, + "step": 5118 + }, + { + "epoch": 1.54, + "grad_norm": 43.66654586791992, + "learning_rate": 9.740402926731483e-06, + "loss": 1.5276, + "step": 5119 + }, + { + "epoch": 1.54, + "grad_norm": 11.123966217041016, + "learning_rate": 9.738398316127093e-06, + "loss": 1.4542, + "step": 5120 + }, + { + "epoch": 1.54, + "grad_norm": 10.916337966918945, + "learning_rate": 9.736393705522703e-06, + "loss": 1.7105, + "step": 5121 + }, + { + "epoch": 1.54, + "grad_norm": 38.372467041015625, + "learning_rate": 9.734389094918313e-06, + "loss": 2.5102, + "step": 5122 + }, + { + "epoch": 1.54, + "grad_norm": 12.8078031539917, + "learning_rate": 9.732384484313923e-06, + "loss": 1.573, + "step": 5123 + }, + { + "epoch": 1.54, + "grad_norm": 22.485212326049805, + "learning_rate": 9.730379873709532e-06, + "loss": 1.986, + "step": 5124 + }, + { + "epoch": 1.54, + "grad_norm": 16.489240646362305, + "learning_rate": 9.728375263105144e-06, + "loss": 1.5365, + "step": 5125 + }, + { + "epoch": 1.54, + "grad_norm": 48.157371520996094, + "learning_rate": 9.726370652500754e-06, + "loss": 2.6245, + "step": 5126 + }, + { + "epoch": 1.54, + "grad_norm": 19.086997985839844, + "learning_rate": 9.724366041896362e-06, + "loss": 2.8709, + "step": 5127 + }, + { + "epoch": 1.54, + "grad_norm": 16.59282112121582, + "learning_rate": 9.722361431291972e-06, + "loss": 1.6568, + "step": 5128 + }, + { + "epoch": 1.54, + "grad_norm": 77.64139556884766, + "learning_rate": 9.720356820687582e-06, + "loss": 2.4053, + "step": 5129 + }, + { + "epoch": 1.54, + "grad_norm": 32.31578063964844, + "learning_rate": 9.718352210083192e-06, + "loss": 2.4112, + "step": 5130 + }, + { + "epoch": 1.54, + "grad_norm": 16.772424697875977, + "learning_rate": 9.716347599478802e-06, + "loss": 1.8791, + "step": 5131 + }, + { + "epoch": 1.54, + "grad_norm": 18.102991104125977, + "learning_rate": 9.714342988874412e-06, + "loss": 1.3216, + "step": 5132 + }, + { + "epoch": 1.54, + "grad_norm": 11.130638122558594, + "learning_rate": 9.712338378270023e-06, + "loss": 1.4781, + "step": 5133 + }, + { + "epoch": 1.54, + "grad_norm": 23.45066261291504, + "learning_rate": 9.710333767665631e-06, + "loss": 2.065, + "step": 5134 + }, + { + "epoch": 1.54, + "grad_norm": 24.560604095458984, + "learning_rate": 9.708329157061241e-06, + "loss": 1.383, + "step": 5135 + }, + { + "epoch": 1.54, + "grad_norm": 12.12047290802002, + "learning_rate": 9.706324546456851e-06, + "loss": 1.5988, + "step": 5136 + }, + { + "epoch": 1.54, + "grad_norm": 10.139289855957031, + "learning_rate": 9.704319935852461e-06, + "loss": 0.8568, + "step": 5137 + }, + { + "epoch": 1.54, + "grad_norm": 10.667194366455078, + "learning_rate": 9.702315325248071e-06, + "loss": 1.2382, + "step": 5138 + }, + { + "epoch": 1.55, + "grad_norm": 14.842784881591797, + "learning_rate": 9.700310714643681e-06, + "loss": 1.474, + "step": 5139 + }, + { + "epoch": 1.55, + "grad_norm": 28.56570816040039, + "learning_rate": 9.698306104039291e-06, + "loss": 1.9052, + "step": 5140 + }, + { + "epoch": 1.55, + "grad_norm": 14.900588035583496, + "learning_rate": 9.696301493434901e-06, + "loss": 1.2868, + "step": 5141 + }, + { + "epoch": 1.55, + "grad_norm": 11.330522537231445, + "learning_rate": 9.69429688283051e-06, + "loss": 1.1221, + "step": 5142 + }, + { + "epoch": 1.55, + "grad_norm": 10.978798866271973, + "learning_rate": 9.69229227222612e-06, + "loss": 1.2924, + "step": 5143 + }, + { + "epoch": 1.55, + "grad_norm": 10.9464111328125, + "learning_rate": 9.690287661621732e-06, + "loss": 0.9553, + "step": 5144 + }, + { + "epoch": 1.55, + "grad_norm": 19.434829711914062, + "learning_rate": 9.68828305101734e-06, + "loss": 1.6625, + "step": 5145 + }, + { + "epoch": 1.55, + "grad_norm": 13.196100234985352, + "learning_rate": 9.68627844041295e-06, + "loss": 1.5143, + "step": 5146 + }, + { + "epoch": 1.55, + "grad_norm": 13.529244422912598, + "learning_rate": 9.68427382980856e-06, + "loss": 1.6467, + "step": 5147 + }, + { + "epoch": 1.55, + "grad_norm": 37.84720993041992, + "learning_rate": 9.68226921920417e-06, + "loss": 3.0862, + "step": 5148 + }, + { + "epoch": 1.55, + "grad_norm": 11.348771095275879, + "learning_rate": 9.68026460859978e-06, + "loss": 1.3557, + "step": 5149 + }, + { + "epoch": 1.55, + "grad_norm": 11.032979965209961, + "learning_rate": 9.67825999799539e-06, + "loss": 0.9113, + "step": 5150 + }, + { + "epoch": 1.55, + "grad_norm": 15.761959075927734, + "learning_rate": 9.676255387391e-06, + "loss": 1.7351, + "step": 5151 + }, + { + "epoch": 1.55, + "grad_norm": 53.275428771972656, + "learning_rate": 9.67425077678661e-06, + "loss": 2.9133, + "step": 5152 + }, + { + "epoch": 1.55, + "grad_norm": 18.882171630859375, + "learning_rate": 9.672246166182219e-06, + "loss": 2.1493, + "step": 5153 + }, + { + "epoch": 1.55, + "grad_norm": 13.768794059753418, + "learning_rate": 9.67024155557783e-06, + "loss": 1.1085, + "step": 5154 + }, + { + "epoch": 1.55, + "grad_norm": 15.78554916381836, + "learning_rate": 9.66823694497344e-06, + "loss": 1.7214, + "step": 5155 + }, + { + "epoch": 1.55, + "grad_norm": 18.663970947265625, + "learning_rate": 9.66623233436905e-06, + "loss": 1.5341, + "step": 5156 + }, + { + "epoch": 1.55, + "grad_norm": 91.2535629272461, + "learning_rate": 9.66422772376466e-06, + "loss": 2.4091, + "step": 5157 + }, + { + "epoch": 1.55, + "grad_norm": 30.0625057220459, + "learning_rate": 9.66222311316027e-06, + "loss": 1.2433, + "step": 5158 + }, + { + "epoch": 1.55, + "grad_norm": 10.793164253234863, + "learning_rate": 9.66021850255588e-06, + "loss": 1.3247, + "step": 5159 + }, + { + "epoch": 1.55, + "grad_norm": 14.865962028503418, + "learning_rate": 9.65821389195149e-06, + "loss": 0.7533, + "step": 5160 + }, + { + "epoch": 1.55, + "eval_loss": 0.20735611021518707, + "eval_runtime": 43.7679, + "eval_samples_per_second": 33.792, + "eval_steps_per_second": 33.792, + "step": 5160 + }, + { + "epoch": 1.55, + "grad_norm": 18.602514266967773, + "learning_rate": 9.656209281347098e-06, + "loss": 1.5636, + "step": 5161 + }, + { + "epoch": 1.55, + "grad_norm": 14.882292747497559, + "learning_rate": 9.65420467074271e-06, + "loss": 1.1038, + "step": 5162 + }, + { + "epoch": 1.55, + "grad_norm": 56.73546600341797, + "learning_rate": 9.65220006013832e-06, + "loss": 1.9049, + "step": 5163 + }, + { + "epoch": 1.55, + "grad_norm": 13.342903137207031, + "learning_rate": 9.650195449533928e-06, + "loss": 1.6425, + "step": 5164 + }, + { + "epoch": 1.55, + "grad_norm": 13.89185905456543, + "learning_rate": 9.648190838929538e-06, + "loss": 1.0887, + "step": 5165 + }, + { + "epoch": 1.55, + "grad_norm": 72.23283386230469, + "learning_rate": 9.646186228325149e-06, + "loss": 2.1655, + "step": 5166 + }, + { + "epoch": 1.55, + "grad_norm": 17.974084854125977, + "learning_rate": 9.644181617720759e-06, + "loss": 2.2678, + "step": 5167 + }, + { + "epoch": 1.55, + "grad_norm": 69.75627136230469, + "learning_rate": 9.642177007116369e-06, + "loss": 1.7682, + "step": 5168 + }, + { + "epoch": 1.55, + "grad_norm": 11.242698669433594, + "learning_rate": 9.640172396511979e-06, + "loss": 1.3168, + "step": 5169 + }, + { + "epoch": 1.55, + "grad_norm": 66.39893341064453, + "learning_rate": 9.638167785907589e-06, + "loss": 2.6081, + "step": 5170 + }, + { + "epoch": 1.55, + "grad_norm": 9.396846771240234, + "learning_rate": 9.636163175303199e-06, + "loss": 0.8918, + "step": 5171 + }, + { + "epoch": 1.56, + "grad_norm": 30.329811096191406, + "learning_rate": 9.634158564698807e-06, + "loss": 2.5263, + "step": 5172 + }, + { + "epoch": 1.56, + "grad_norm": 8.505033493041992, + "learning_rate": 9.632153954094417e-06, + "loss": 0.9187, + "step": 5173 + }, + { + "epoch": 1.56, + "grad_norm": 15.874688148498535, + "learning_rate": 9.63014934349003e-06, + "loss": 1.2034, + "step": 5174 + }, + { + "epoch": 1.56, + "grad_norm": 12.436075210571289, + "learning_rate": 9.628144732885638e-06, + "loss": 1.2702, + "step": 5175 + }, + { + "epoch": 1.56, + "grad_norm": 16.405839920043945, + "learning_rate": 9.626140122281248e-06, + "loss": 1.2595, + "step": 5176 + }, + { + "epoch": 1.56, + "grad_norm": 6.489515781402588, + "learning_rate": 9.624135511676858e-06, + "loss": 0.699, + "step": 5177 + }, + { + "epoch": 1.56, + "grad_norm": 15.752748489379883, + "learning_rate": 9.622130901072468e-06, + "loss": 1.8403, + "step": 5178 + }, + { + "epoch": 1.56, + "grad_norm": 21.755474090576172, + "learning_rate": 9.620126290468076e-06, + "loss": 1.9319, + "step": 5179 + }, + { + "epoch": 1.56, + "grad_norm": 14.760236740112305, + "learning_rate": 9.618121679863686e-06, + "loss": 1.3553, + "step": 5180 + }, + { + "epoch": 1.56, + "grad_norm": 20.644493103027344, + "learning_rate": 9.616117069259298e-06, + "loss": 1.2051, + "step": 5181 + }, + { + "epoch": 1.56, + "grad_norm": 18.111047744750977, + "learning_rate": 9.614112458654906e-06, + "loss": 1.1888, + "step": 5182 + }, + { + "epoch": 1.56, + "grad_norm": 9.958390235900879, + "learning_rate": 9.612107848050517e-06, + "loss": 1.2644, + "step": 5183 + }, + { + "epoch": 1.56, + "grad_norm": 16.58241081237793, + "learning_rate": 9.610103237446127e-06, + "loss": 0.8966, + "step": 5184 + }, + { + "epoch": 1.56, + "grad_norm": 27.23797607421875, + "learning_rate": 9.608098626841737e-06, + "loss": 2.5713, + "step": 5185 + }, + { + "epoch": 1.56, + "grad_norm": 28.486438751220703, + "learning_rate": 9.606094016237347e-06, + "loss": 2.6203, + "step": 5186 + }, + { + "epoch": 1.56, + "grad_norm": 15.266054153442383, + "learning_rate": 9.604089405632957e-06, + "loss": 1.8284, + "step": 5187 + }, + { + "epoch": 1.56, + "grad_norm": 27.08075714111328, + "learning_rate": 9.602084795028567e-06, + "loss": 1.749, + "step": 5188 + }, + { + "epoch": 1.56, + "grad_norm": 15.239933013916016, + "learning_rate": 9.600080184424177e-06, + "loss": 1.4726, + "step": 5189 + }, + { + "epoch": 1.56, + "grad_norm": 29.341596603393555, + "learning_rate": 9.598075573819785e-06, + "loss": 1.697, + "step": 5190 + }, + { + "epoch": 1.56, + "grad_norm": 17.9268856048584, + "learning_rate": 9.596070963215396e-06, + "loss": 1.7262, + "step": 5191 + }, + { + "epoch": 1.56, + "grad_norm": 18.277809143066406, + "learning_rate": 9.594066352611006e-06, + "loss": 2.4779, + "step": 5192 + }, + { + "epoch": 1.56, + "grad_norm": 24.198942184448242, + "learning_rate": 9.592061742006616e-06, + "loss": 1.2228, + "step": 5193 + }, + { + "epoch": 1.56, + "grad_norm": 10.138649940490723, + "learning_rate": 9.590057131402226e-06, + "loss": 2.3378, + "step": 5194 + }, + { + "epoch": 1.56, + "grad_norm": 15.209259986877441, + "learning_rate": 9.588052520797836e-06, + "loss": 1.8108, + "step": 5195 + }, + { + "epoch": 1.56, + "grad_norm": 22.023107528686523, + "learning_rate": 9.586047910193446e-06, + "loss": 1.5428, + "step": 5196 + }, + { + "epoch": 1.56, + "grad_norm": 18.7374210357666, + "learning_rate": 9.584043299589056e-06, + "loss": 1.6044, + "step": 5197 + }, + { + "epoch": 1.56, + "grad_norm": 21.584501266479492, + "learning_rate": 9.582038688984664e-06, + "loss": 2.0156, + "step": 5198 + }, + { + "epoch": 1.56, + "grad_norm": 17.4039363861084, + "learning_rate": 9.580034078380276e-06, + "loss": 1.3307, + "step": 5199 + }, + { + "epoch": 1.56, + "grad_norm": 10.785429000854492, + "learning_rate": 9.578029467775886e-06, + "loss": 1.2009, + "step": 5200 + }, + { + "epoch": 1.56, + "grad_norm": 13.932561874389648, + "learning_rate": 9.576024857171495e-06, + "loss": 1.6818, + "step": 5201 + }, + { + "epoch": 1.56, + "grad_norm": 13.384397506713867, + "learning_rate": 9.574020246567105e-06, + "loss": 1.7232, + "step": 5202 + }, + { + "epoch": 1.56, + "grad_norm": 32.59089279174805, + "learning_rate": 9.572015635962715e-06, + "loss": 2.1975, + "step": 5203 + }, + { + "epoch": 1.56, + "grad_norm": 19.552274703979492, + "learning_rate": 9.570011025358325e-06, + "loss": 2.2128, + "step": 5204 + }, + { + "epoch": 1.56, + "grad_norm": 15.523249626159668, + "learning_rate": 9.568006414753935e-06, + "loss": 1.1143, + "step": 5205 + }, + { + "epoch": 1.57, + "grad_norm": 35.84797286987305, + "learning_rate": 9.566001804149545e-06, + "loss": 2.5374, + "step": 5206 + }, + { + "epoch": 1.57, + "grad_norm": 14.734626770019531, + "learning_rate": 9.563997193545155e-06, + "loss": 1.2287, + "step": 5207 + }, + { + "epoch": 1.57, + "grad_norm": 13.902031898498535, + "learning_rate": 9.561992582940765e-06, + "loss": 1.4145, + "step": 5208 + }, + { + "epoch": 1.57, + "grad_norm": 9.233687400817871, + "learning_rate": 9.559987972336374e-06, + "loss": 1.2157, + "step": 5209 + }, + { + "epoch": 1.57, + "grad_norm": 47.99122619628906, + "learning_rate": 9.557983361731984e-06, + "loss": 1.6352, + "step": 5210 + }, + { + "epoch": 1.57, + "grad_norm": 18.855663299560547, + "learning_rate": 9.555978751127596e-06, + "loss": 2.1963, + "step": 5211 + }, + { + "epoch": 1.57, + "grad_norm": 25.103059768676758, + "learning_rate": 9.553974140523204e-06, + "loss": 2.7753, + "step": 5212 + }, + { + "epoch": 1.57, + "grad_norm": 19.241830825805664, + "learning_rate": 9.551969529918814e-06, + "loss": 1.5551, + "step": 5213 + }, + { + "epoch": 1.57, + "grad_norm": 13.835005760192871, + "learning_rate": 9.549964919314424e-06, + "loss": 1.4738, + "step": 5214 + }, + { + "epoch": 1.57, + "grad_norm": 8.090837478637695, + "learning_rate": 9.547960308710034e-06, + "loss": 1.7791, + "step": 5215 + }, + { + "epoch": 1.57, + "grad_norm": 22.887216567993164, + "learning_rate": 9.545955698105643e-06, + "loss": 1.4865, + "step": 5216 + }, + { + "epoch": 1.57, + "grad_norm": 18.71986961364746, + "learning_rate": 9.543951087501254e-06, + "loss": 1.7043, + "step": 5217 + }, + { + "epoch": 1.57, + "grad_norm": 9.452502250671387, + "learning_rate": 9.541946476896864e-06, + "loss": 1.9759, + "step": 5218 + }, + { + "epoch": 1.57, + "grad_norm": 18.228004455566406, + "learning_rate": 9.539941866292473e-06, + "loss": 1.4368, + "step": 5219 + }, + { + "epoch": 1.57, + "grad_norm": 27.472970962524414, + "learning_rate": 9.537937255688083e-06, + "loss": 1.5878, + "step": 5220 + }, + { + "epoch": 1.57, + "grad_norm": 34.94017028808594, + "learning_rate": 9.535932645083693e-06, + "loss": 1.7284, + "step": 5221 + }, + { + "epoch": 1.57, + "grad_norm": 19.611160278320312, + "learning_rate": 9.533928034479303e-06, + "loss": 1.9294, + "step": 5222 + }, + { + "epoch": 1.57, + "grad_norm": 181.42205810546875, + "learning_rate": 9.531923423874913e-06, + "loss": 1.5287, + "step": 5223 + }, + { + "epoch": 1.57, + "grad_norm": 27.391780853271484, + "learning_rate": 9.529918813270523e-06, + "loss": 1.0979, + "step": 5224 + }, + { + "epoch": 1.57, + "grad_norm": 19.48676872253418, + "learning_rate": 9.527914202666133e-06, + "loss": 1.9428, + "step": 5225 + }, + { + "epoch": 1.57, + "grad_norm": 60.94241714477539, + "learning_rate": 9.525909592061743e-06, + "loss": 2.5457, + "step": 5226 + }, + { + "epoch": 1.57, + "grad_norm": 39.271629333496094, + "learning_rate": 9.523904981457352e-06, + "loss": 1.834, + "step": 5227 + }, + { + "epoch": 1.57, + "grad_norm": 7.752985000610352, + "learning_rate": 9.521900370852962e-06, + "loss": 1.3035, + "step": 5228 + }, + { + "epoch": 1.57, + "grad_norm": 11.235089302062988, + "learning_rate": 9.519895760248572e-06, + "loss": 0.659, + "step": 5229 + }, + { + "epoch": 1.57, + "grad_norm": 22.830162048339844, + "learning_rate": 9.517891149644182e-06, + "loss": 2.1836, + "step": 5230 + }, + { + "epoch": 1.57, + "grad_norm": 40.63211441040039, + "learning_rate": 9.515886539039792e-06, + "loss": 2.0237, + "step": 5231 + }, + { + "epoch": 1.57, + "grad_norm": 15.20790958404541, + "learning_rate": 9.513881928435402e-06, + "loss": 0.9827, + "step": 5232 + }, + { + "epoch": 1.57, + "grad_norm": 24.22981071472168, + "learning_rate": 9.511877317831012e-06, + "loss": 1.6325, + "step": 5233 + }, + { + "epoch": 1.57, + "grad_norm": 69.4743423461914, + "learning_rate": 9.509872707226622e-06, + "loss": 2.8366, + "step": 5234 + }, + { + "epoch": 1.57, + "grad_norm": 16.03273582458496, + "learning_rate": 9.50786809662223e-06, + "loss": 1.8027, + "step": 5235 + }, + { + "epoch": 1.57, + "grad_norm": 27.365671157836914, + "learning_rate": 9.505863486017843e-06, + "loss": 1.7576, + "step": 5236 + }, + { + "epoch": 1.57, + "grad_norm": 24.969091415405273, + "learning_rate": 9.503858875413453e-06, + "loss": 2.0772, + "step": 5237 + }, + { + "epoch": 1.57, + "grad_norm": 17.248416900634766, + "learning_rate": 9.501854264809061e-06, + "loss": 1.3726, + "step": 5238 + }, + { + "epoch": 1.58, + "grad_norm": 15.464237213134766, + "learning_rate": 9.499849654204671e-06, + "loss": 1.4722, + "step": 5239 + }, + { + "epoch": 1.58, + "grad_norm": 24.053791046142578, + "learning_rate": 9.497845043600281e-06, + "loss": 1.4818, + "step": 5240 + }, + { + "epoch": 1.58, + "grad_norm": 21.699386596679688, + "learning_rate": 9.495840432995891e-06, + "loss": 2.1324, + "step": 5241 + }, + { + "epoch": 1.58, + "grad_norm": 32.680335998535156, + "learning_rate": 9.493835822391501e-06, + "loss": 2.5121, + "step": 5242 + }, + { + "epoch": 1.58, + "grad_norm": 9.323308944702148, + "learning_rate": 9.491831211787111e-06, + "loss": 1.7795, + "step": 5243 + }, + { + "epoch": 1.58, + "grad_norm": 9.845460891723633, + "learning_rate": 9.489826601182722e-06, + "loss": 1.5512, + "step": 5244 + }, + { + "epoch": 1.58, + "grad_norm": 12.808528900146484, + "learning_rate": 9.487821990578332e-06, + "loss": 1.6268, + "step": 5245 + }, + { + "epoch": 1.58, + "grad_norm": 30.653648376464844, + "learning_rate": 9.48581737997394e-06, + "loss": 2.2517, + "step": 5246 + }, + { + "epoch": 1.58, + "grad_norm": 37.43178939819336, + "learning_rate": 9.48381276936955e-06, + "loss": 1.6323, + "step": 5247 + }, + { + "epoch": 1.58, + "grad_norm": 23.361366271972656, + "learning_rate": 9.481808158765162e-06, + "loss": 1.6171, + "step": 5248 + }, + { + "epoch": 1.58, + "grad_norm": 43.43690872192383, + "learning_rate": 9.47980354816077e-06, + "loss": 1.8716, + "step": 5249 + }, + { + "epoch": 1.58, + "grad_norm": 16.062522888183594, + "learning_rate": 9.47779893755638e-06, + "loss": 2.9676, + "step": 5250 + }, + { + "epoch": 1.58, + "grad_norm": 34.8533935546875, + "learning_rate": 9.47579432695199e-06, + "loss": 1.6051, + "step": 5251 + }, + { + "epoch": 1.58, + "grad_norm": 20.79990577697754, + "learning_rate": 9.4737897163476e-06, + "loss": 2.5814, + "step": 5252 + }, + { + "epoch": 1.58, + "grad_norm": 94.41461181640625, + "learning_rate": 9.47178510574321e-06, + "loss": 3.8194, + "step": 5253 + }, + { + "epoch": 1.58, + "grad_norm": 9.540099143981934, + "learning_rate": 9.46978049513882e-06, + "loss": 2.1172, + "step": 5254 + }, + { + "epoch": 1.58, + "grad_norm": 18.652847290039062, + "learning_rate": 9.46777588453443e-06, + "loss": 1.3757, + "step": 5255 + }, + { + "epoch": 1.58, + "grad_norm": 6.206528663635254, + "learning_rate": 9.465771273930041e-06, + "loss": 0.92, + "step": 5256 + }, + { + "epoch": 1.58, + "grad_norm": 18.400524139404297, + "learning_rate": 9.46376666332565e-06, + "loss": 1.8978, + "step": 5257 + }, + { + "epoch": 1.58, + "grad_norm": 22.74957275390625, + "learning_rate": 9.46176205272126e-06, + "loss": 1.8973, + "step": 5258 + }, + { + "epoch": 1.58, + "grad_norm": 15.965211868286133, + "learning_rate": 9.45975744211687e-06, + "loss": 1.8043, + "step": 5259 + }, + { + "epoch": 1.58, + "grad_norm": 19.99260139465332, + "learning_rate": 9.45775283151248e-06, + "loss": 1.6596, + "step": 5260 + }, + { + "epoch": 1.58, + "grad_norm": 9.259965896606445, + "learning_rate": 9.45574822090809e-06, + "loss": 1.4668, + "step": 5261 + }, + { + "epoch": 1.58, + "grad_norm": 49.869110107421875, + "learning_rate": 9.4537436103037e-06, + "loss": 3.4404, + "step": 5262 + }, + { + "epoch": 1.58, + "grad_norm": 26.978092193603516, + "learning_rate": 9.45173899969931e-06, + "loss": 2.3785, + "step": 5263 + }, + { + "epoch": 1.58, + "grad_norm": 37.760353088378906, + "learning_rate": 9.449734389094918e-06, + "loss": 1.1802, + "step": 5264 + }, + { + "epoch": 1.58, + "grad_norm": 24.364702224731445, + "learning_rate": 9.447729778490528e-06, + "loss": 2.288, + "step": 5265 + }, + { + "epoch": 1.58, + "grad_norm": 17.613224029541016, + "learning_rate": 9.445725167886138e-06, + "loss": 1.8934, + "step": 5266 + }, + { + "epoch": 1.58, + "grad_norm": 25.8133544921875, + "learning_rate": 9.443720557281748e-06, + "loss": 1.6238, + "step": 5267 + }, + { + "epoch": 1.58, + "grad_norm": 32.17334747314453, + "learning_rate": 9.441715946677358e-06, + "loss": 1.8157, + "step": 5268 + }, + { + "epoch": 1.58, + "grad_norm": 16.442485809326172, + "learning_rate": 9.439711336072969e-06, + "loss": 1.1689, + "step": 5269 + }, + { + "epoch": 1.58, + "grad_norm": 11.316102027893066, + "learning_rate": 9.437706725468579e-06, + "loss": 2.063, + "step": 5270 + }, + { + "epoch": 1.58, + "grad_norm": 11.197502136230469, + "learning_rate": 9.435702114864189e-06, + "loss": 1.4149, + "step": 5271 + }, + { + "epoch": 1.59, + "grad_norm": 15.829837799072266, + "learning_rate": 9.433697504259797e-06, + "loss": 1.6842, + "step": 5272 + }, + { + "epoch": 1.59, + "grad_norm": 44.153804779052734, + "learning_rate": 9.431692893655409e-06, + "loss": 1.8206, + "step": 5273 + }, + { + "epoch": 1.59, + "grad_norm": 8.531728744506836, + "learning_rate": 9.429688283051019e-06, + "loss": 0.9972, + "step": 5274 + }, + { + "epoch": 1.59, + "grad_norm": 16.26192283630371, + "learning_rate": 9.427683672446627e-06, + "loss": 1.8538, + "step": 5275 + }, + { + "epoch": 1.59, + "grad_norm": 10.455742835998535, + "learning_rate": 9.425679061842237e-06, + "loss": 1.0675, + "step": 5276 + }, + { + "epoch": 1.59, + "grad_norm": 19.000938415527344, + "learning_rate": 9.423674451237848e-06, + "loss": 1.9213, + "step": 5277 + }, + { + "epoch": 1.59, + "grad_norm": 17.04540252685547, + "learning_rate": 9.421669840633458e-06, + "loss": 1.4436, + "step": 5278 + }, + { + "epoch": 1.59, + "grad_norm": 15.419578552246094, + "learning_rate": 9.419665230029068e-06, + "loss": 1.9087, + "step": 5279 + }, + { + "epoch": 1.59, + "grad_norm": 18.85577392578125, + "learning_rate": 9.417660619424678e-06, + "loss": 2.2146, + "step": 5280 + }, + { + "epoch": 1.59, + "eval_loss": 0.1917702704668045, + "eval_runtime": 43.7028, + "eval_samples_per_second": 33.842, + "eval_steps_per_second": 33.842, + "step": 5280 + }, + { + "epoch": 1.59, + "grad_norm": 24.73578453063965, + "learning_rate": 9.415656008820288e-06, + "loss": 2.0709, + "step": 5281 + }, + { + "epoch": 1.59, + "grad_norm": 10.795376777648926, + "learning_rate": 9.413651398215898e-06, + "loss": 0.9204, + "step": 5282 + }, + { + "epoch": 1.59, + "grad_norm": 14.635207176208496, + "learning_rate": 9.411646787611506e-06, + "loss": 1.7222, + "step": 5283 + }, + { + "epoch": 1.59, + "grad_norm": 12.989263534545898, + "learning_rate": 9.409642177007116e-06, + "loss": 1.3394, + "step": 5284 + }, + { + "epoch": 1.59, + "grad_norm": 12.602009773254395, + "learning_rate": 9.407637566402728e-06, + "loss": 1.7059, + "step": 5285 + }, + { + "epoch": 1.59, + "grad_norm": 20.175142288208008, + "learning_rate": 9.405632955798337e-06, + "loss": 1.7654, + "step": 5286 + }, + { + "epoch": 1.59, + "grad_norm": 18.197296142578125, + "learning_rate": 9.403628345193947e-06, + "loss": 1.7068, + "step": 5287 + }, + { + "epoch": 1.59, + "grad_norm": 34.1553955078125, + "learning_rate": 9.401623734589557e-06, + "loss": 1.3868, + "step": 5288 + }, + { + "epoch": 1.59, + "grad_norm": 27.550334930419922, + "learning_rate": 9.399619123985167e-06, + "loss": 1.2817, + "step": 5289 + }, + { + "epoch": 1.59, + "grad_norm": 19.8583984375, + "learning_rate": 9.397614513380777e-06, + "loss": 1.9351, + "step": 5290 + }, + { + "epoch": 1.59, + "grad_norm": 6.953277587890625, + "learning_rate": 9.395609902776387e-06, + "loss": 1.1993, + "step": 5291 + }, + { + "epoch": 1.59, + "grad_norm": 15.303847312927246, + "learning_rate": 9.393605292171997e-06, + "loss": 1.6499, + "step": 5292 + }, + { + "epoch": 1.59, + "grad_norm": 28.220230102539062, + "learning_rate": 9.391600681567607e-06, + "loss": 1.7453, + "step": 5293 + }, + { + "epoch": 1.59, + "grad_norm": 40.280029296875, + "learning_rate": 9.389596070963216e-06, + "loss": 1.9758, + "step": 5294 + }, + { + "epoch": 1.59, + "grad_norm": 15.754435539245605, + "learning_rate": 9.387591460358826e-06, + "loss": 1.1859, + "step": 5295 + }, + { + "epoch": 1.59, + "grad_norm": 64.05276489257812, + "learning_rate": 9.385586849754436e-06, + "loss": 2.8985, + "step": 5296 + }, + { + "epoch": 1.59, + "grad_norm": 12.505768775939941, + "learning_rate": 9.383582239150046e-06, + "loss": 1.7193, + "step": 5297 + }, + { + "epoch": 1.59, + "grad_norm": 22.20297622680664, + "learning_rate": 9.381577628545656e-06, + "loss": 1.8241, + "step": 5298 + }, + { + "epoch": 1.59, + "grad_norm": 17.095487594604492, + "learning_rate": 9.379573017941266e-06, + "loss": 1.7652, + "step": 5299 + }, + { + "epoch": 1.59, + "grad_norm": 30.220666885375977, + "learning_rate": 9.377568407336876e-06, + "loss": 2.0137, + "step": 5300 + }, + { + "epoch": 1.59, + "grad_norm": 16.760820388793945, + "learning_rate": 9.375563796732484e-06, + "loss": 1.4244, + "step": 5301 + }, + { + "epoch": 1.59, + "grad_norm": 74.30367279052734, + "learning_rate": 9.373559186128095e-06, + "loss": 1.818, + "step": 5302 + }, + { + "epoch": 1.59, + "grad_norm": 8.43071460723877, + "learning_rate": 9.371554575523705e-06, + "loss": 0.9715, + "step": 5303 + }, + { + "epoch": 1.59, + "grad_norm": 19.98003387451172, + "learning_rate": 9.369549964919315e-06, + "loss": 1.5193, + "step": 5304 + }, + { + "epoch": 1.6, + "grad_norm": 15.74586009979248, + "learning_rate": 9.367545354314925e-06, + "loss": 1.7498, + "step": 5305 + }, + { + "epoch": 1.6, + "grad_norm": 16.798616409301758, + "learning_rate": 9.365540743710535e-06, + "loss": 0.9751, + "step": 5306 + }, + { + "epoch": 1.6, + "grad_norm": 15.125486373901367, + "learning_rate": 9.363536133106145e-06, + "loss": 2.1381, + "step": 5307 + }, + { + "epoch": 1.6, + "grad_norm": 26.87381362915039, + "learning_rate": 9.361531522501755e-06, + "loss": 1.6783, + "step": 5308 + }, + { + "epoch": 1.6, + "grad_norm": 23.354263305664062, + "learning_rate": 9.359526911897363e-06, + "loss": 2.2363, + "step": 5309 + }, + { + "epoch": 1.6, + "grad_norm": 11.7379789352417, + "learning_rate": 9.357522301292975e-06, + "loss": 1.6394, + "step": 5310 + }, + { + "epoch": 1.6, + "grad_norm": 16.918331146240234, + "learning_rate": 9.355517690688585e-06, + "loss": 2.0194, + "step": 5311 + }, + { + "epoch": 1.6, + "grad_norm": 10.654472351074219, + "learning_rate": 9.353513080084194e-06, + "loss": 2.1258, + "step": 5312 + }, + { + "epoch": 1.6, + "grad_norm": 23.184656143188477, + "learning_rate": 9.351508469479804e-06, + "loss": 2.9148, + "step": 5313 + }, + { + "epoch": 1.6, + "grad_norm": 10.799365997314453, + "learning_rate": 9.349503858875414e-06, + "loss": 0.941, + "step": 5314 + }, + { + "epoch": 1.6, + "grad_norm": 12.17577838897705, + "learning_rate": 9.347499248271024e-06, + "loss": 0.8023, + "step": 5315 + }, + { + "epoch": 1.6, + "grad_norm": 25.213882446289062, + "learning_rate": 9.345494637666634e-06, + "loss": 1.6508, + "step": 5316 + }, + { + "epoch": 1.6, + "grad_norm": 11.12057113647461, + "learning_rate": 9.343490027062244e-06, + "loss": 1.589, + "step": 5317 + }, + { + "epoch": 1.6, + "grad_norm": 27.033601760864258, + "learning_rate": 9.341485416457854e-06, + "loss": 1.8537, + "step": 5318 + }, + { + "epoch": 1.6, + "grad_norm": 16.07554054260254, + "learning_rate": 9.339480805853464e-06, + "loss": 1.4078, + "step": 5319 + }, + { + "epoch": 1.6, + "grad_norm": 16.774127960205078, + "learning_rate": 9.337476195249073e-06, + "loss": 1.2248, + "step": 5320 + }, + { + "epoch": 1.6, + "grad_norm": 12.099883079528809, + "learning_rate": 9.335471584644683e-06, + "loss": 0.5273, + "step": 5321 + }, + { + "epoch": 1.6, + "grad_norm": 14.563944816589355, + "learning_rate": 9.333466974040295e-06, + "loss": 1.2972, + "step": 5322 + }, + { + "epoch": 1.6, + "grad_norm": 6.988048553466797, + "learning_rate": 9.331462363435903e-06, + "loss": 1.2299, + "step": 5323 + }, + { + "epoch": 1.6, + "grad_norm": 15.345603942871094, + "learning_rate": 9.329457752831513e-06, + "loss": 1.8004, + "step": 5324 + }, + { + "epoch": 1.6, + "grad_norm": 8.920988082885742, + "learning_rate": 9.327453142227123e-06, + "loss": 1.799, + "step": 5325 + }, + { + "epoch": 1.6, + "grad_norm": 13.415757179260254, + "learning_rate": 9.325448531622733e-06, + "loss": 1.3893, + "step": 5326 + }, + { + "epoch": 1.6, + "grad_norm": 14.817873001098633, + "learning_rate": 9.323443921018343e-06, + "loss": 2.0269, + "step": 5327 + }, + { + "epoch": 1.6, + "grad_norm": 17.05929183959961, + "learning_rate": 9.321439310413953e-06, + "loss": 1.9584, + "step": 5328 + }, + { + "epoch": 1.6, + "grad_norm": 10.261931419372559, + "learning_rate": 9.319434699809563e-06, + "loss": 1.5321, + "step": 5329 + }, + { + "epoch": 1.6, + "grad_norm": 18.25226402282715, + "learning_rate": 9.317430089205174e-06, + "loss": 1.5914, + "step": 5330 + }, + { + "epoch": 1.6, + "grad_norm": 13.496363639831543, + "learning_rate": 9.315425478600782e-06, + "loss": 1.6435, + "step": 5331 + }, + { + "epoch": 1.6, + "grad_norm": 26.062244415283203, + "learning_rate": 9.313420867996392e-06, + "loss": 1.892, + "step": 5332 + }, + { + "epoch": 1.6, + "grad_norm": 28.14862632751465, + "learning_rate": 9.311416257392002e-06, + "loss": 0.9718, + "step": 5333 + }, + { + "epoch": 1.6, + "grad_norm": 9.369063377380371, + "learning_rate": 9.309411646787612e-06, + "loss": 1.6377, + "step": 5334 + }, + { + "epoch": 1.6, + "grad_norm": 15.546137809753418, + "learning_rate": 9.307407036183222e-06, + "loss": 1.9211, + "step": 5335 + }, + { + "epoch": 1.6, + "grad_norm": 32.82224655151367, + "learning_rate": 9.305402425578832e-06, + "loss": 1.2737, + "step": 5336 + }, + { + "epoch": 1.6, + "grad_norm": 58.94491958618164, + "learning_rate": 9.303397814974442e-06, + "loss": 2.3802, + "step": 5337 + }, + { + "epoch": 1.6, + "grad_norm": 17.832128524780273, + "learning_rate": 9.30139320437005e-06, + "loss": 2.0701, + "step": 5338 + }, + { + "epoch": 1.61, + "grad_norm": 25.756505966186523, + "learning_rate": 9.299388593765661e-06, + "loss": 1.6935, + "step": 5339 + }, + { + "epoch": 1.61, + "grad_norm": 12.220959663391113, + "learning_rate": 9.297383983161273e-06, + "loss": 2.3572, + "step": 5340 + }, + { + "epoch": 1.61, + "grad_norm": 28.716758728027344, + "learning_rate": 9.295379372556883e-06, + "loss": 1.8092, + "step": 5341 + }, + { + "epoch": 1.61, + "grad_norm": 17.98668670654297, + "learning_rate": 9.293374761952491e-06, + "loss": 1.61, + "step": 5342 + }, + { + "epoch": 1.61, + "grad_norm": 15.28249454498291, + "learning_rate": 9.291370151348101e-06, + "loss": 2.0471, + "step": 5343 + }, + { + "epoch": 1.61, + "grad_norm": 10.611080169677734, + "learning_rate": 9.289365540743711e-06, + "loss": 0.9532, + "step": 5344 + }, + { + "epoch": 1.61, + "grad_norm": 18.82200813293457, + "learning_rate": 9.287360930139321e-06, + "loss": 1.315, + "step": 5345 + }, + { + "epoch": 1.61, + "grad_norm": 19.420570373535156, + "learning_rate": 9.28535631953493e-06, + "loss": 1.0041, + "step": 5346 + }, + { + "epoch": 1.61, + "grad_norm": 16.25090980529785, + "learning_rate": 9.283351708930542e-06, + "loss": 2.2406, + "step": 5347 + }, + { + "epoch": 1.61, + "grad_norm": 36.150394439697266, + "learning_rate": 9.281347098326152e-06, + "loss": 2.0748, + "step": 5348 + }, + { + "epoch": 1.61, + "grad_norm": 69.37711334228516, + "learning_rate": 9.27934248772176e-06, + "loss": 2.1891, + "step": 5349 + }, + { + "epoch": 1.61, + "grad_norm": 9.345280647277832, + "learning_rate": 9.27733787711737e-06, + "loss": 1.2469, + "step": 5350 + }, + { + "epoch": 1.61, + "grad_norm": 51.229942321777344, + "learning_rate": 9.27533326651298e-06, + "loss": 2.5097, + "step": 5351 + }, + { + "epoch": 1.61, + "grad_norm": 33.678749084472656, + "learning_rate": 9.27332865590859e-06, + "loss": 2.5323, + "step": 5352 + }, + { + "epoch": 1.61, + "grad_norm": 20.67754364013672, + "learning_rate": 9.2713240453042e-06, + "loss": 3.1563, + "step": 5353 + }, + { + "epoch": 1.61, + "grad_norm": 19.53748321533203, + "learning_rate": 9.26931943469981e-06, + "loss": 1.1978, + "step": 5354 + }, + { + "epoch": 1.61, + "grad_norm": 16.690845489501953, + "learning_rate": 9.26731482409542e-06, + "loss": 1.1068, + "step": 5355 + }, + { + "epoch": 1.61, + "grad_norm": 19.17826271057129, + "learning_rate": 9.26531021349103e-06, + "loss": 1.8296, + "step": 5356 + }, + { + "epoch": 1.61, + "grad_norm": 15.507776260375977, + "learning_rate": 9.263305602886639e-06, + "loss": 1.6507, + "step": 5357 + }, + { + "epoch": 1.61, + "grad_norm": 129.35165405273438, + "learning_rate": 9.261300992282249e-06, + "loss": 1.632, + "step": 5358 + }, + { + "epoch": 1.61, + "grad_norm": 9.657642364501953, + "learning_rate": 9.259296381677861e-06, + "loss": 1.315, + "step": 5359 + }, + { + "epoch": 1.61, + "grad_norm": 18.087684631347656, + "learning_rate": 9.25729177107347e-06, + "loss": 1.2448, + "step": 5360 + }, + { + "epoch": 1.61, + "grad_norm": 16.07610511779785, + "learning_rate": 9.25528716046908e-06, + "loss": 1.2188, + "step": 5361 + }, + { + "epoch": 1.61, + "grad_norm": 14.000444412231445, + "learning_rate": 9.25328254986469e-06, + "loss": 1.9099, + "step": 5362 + }, + { + "epoch": 1.61, + "grad_norm": 22.69326400756836, + "learning_rate": 9.2512779392603e-06, + "loss": 2.2418, + "step": 5363 + }, + { + "epoch": 1.61, + "grad_norm": 41.782066345214844, + "learning_rate": 9.24927332865591e-06, + "loss": 1.8902, + "step": 5364 + }, + { + "epoch": 1.61, + "grad_norm": 9.715392112731934, + "learning_rate": 9.24726871805152e-06, + "loss": 1.1494, + "step": 5365 + }, + { + "epoch": 1.61, + "grad_norm": 13.382871627807617, + "learning_rate": 9.24526410744713e-06, + "loss": 1.6138, + "step": 5366 + }, + { + "epoch": 1.61, + "grad_norm": 33.94945526123047, + "learning_rate": 9.24325949684274e-06, + "loss": 1.562, + "step": 5367 + }, + { + "epoch": 1.61, + "grad_norm": 24.688888549804688, + "learning_rate": 9.241254886238348e-06, + "loss": 1.3487, + "step": 5368 + }, + { + "epoch": 1.61, + "grad_norm": 19.392419815063477, + "learning_rate": 9.239250275633958e-06, + "loss": 1.7019, + "step": 5369 + }, + { + "epoch": 1.61, + "grad_norm": 9.527002334594727, + "learning_rate": 9.237245665029568e-06, + "loss": 1.6555, + "step": 5370 + }, + { + "epoch": 1.61, + "grad_norm": 11.718170166015625, + "learning_rate": 9.235241054425179e-06, + "loss": 1.143, + "step": 5371 + }, + { + "epoch": 1.62, + "grad_norm": 14.71898078918457, + "learning_rate": 9.233236443820789e-06, + "loss": 1.3483, + "step": 5372 + }, + { + "epoch": 1.62, + "grad_norm": 12.641324996948242, + "learning_rate": 9.231231833216399e-06, + "loss": 1.3701, + "step": 5373 + }, + { + "epoch": 1.62, + "grad_norm": 26.643348693847656, + "learning_rate": 9.229227222612009e-06, + "loss": 1.8719, + "step": 5374 + }, + { + "epoch": 1.62, + "grad_norm": 11.661134719848633, + "learning_rate": 9.227222612007619e-06, + "loss": 1.5889, + "step": 5375 + }, + { + "epoch": 1.62, + "grad_norm": 14.931244850158691, + "learning_rate": 9.225218001403227e-06, + "loss": 1.7465, + "step": 5376 + }, + { + "epoch": 1.62, + "grad_norm": 11.898720741271973, + "learning_rate": 9.223213390798839e-06, + "loss": 1.6241, + "step": 5377 + }, + { + "epoch": 1.62, + "grad_norm": 16.589155197143555, + "learning_rate": 9.221208780194449e-06, + "loss": 1.4509, + "step": 5378 + }, + { + "epoch": 1.62, + "grad_norm": 23.514259338378906, + "learning_rate": 9.219204169590058e-06, + "loss": 2.5493, + "step": 5379 + }, + { + "epoch": 1.62, + "grad_norm": 17.52058982849121, + "learning_rate": 9.217199558985668e-06, + "loss": 1.2928, + "step": 5380 + }, + { + "epoch": 1.62, + "grad_norm": 28.47098159790039, + "learning_rate": 9.215194948381278e-06, + "loss": 1.2744, + "step": 5381 + }, + { + "epoch": 1.62, + "grad_norm": 15.93428897857666, + "learning_rate": 9.213190337776888e-06, + "loss": 1.5216, + "step": 5382 + }, + { + "epoch": 1.62, + "grad_norm": 73.04627227783203, + "learning_rate": 9.211185727172498e-06, + "loss": 0.9938, + "step": 5383 + }, + { + "epoch": 1.62, + "grad_norm": 26.94357681274414, + "learning_rate": 9.209181116568108e-06, + "loss": 1.0955, + "step": 5384 + }, + { + "epoch": 1.62, + "grad_norm": 15.998095512390137, + "learning_rate": 9.207176505963718e-06, + "loss": 1.2675, + "step": 5385 + }, + { + "epoch": 1.62, + "grad_norm": 6.676560401916504, + "learning_rate": 9.205171895359326e-06, + "loss": 0.8448, + "step": 5386 + }, + { + "epoch": 1.62, + "grad_norm": 17.63335609436035, + "learning_rate": 9.203167284754936e-06, + "loss": 1.771, + "step": 5387 + }, + { + "epoch": 1.62, + "grad_norm": 15.837037086486816, + "learning_rate": 9.201162674150547e-06, + "loss": 1.1493, + "step": 5388 + }, + { + "epoch": 1.62, + "grad_norm": 15.462651252746582, + "learning_rate": 9.199158063546157e-06, + "loss": 1.4158, + "step": 5389 + }, + { + "epoch": 1.62, + "grad_norm": 14.927474975585938, + "learning_rate": 9.197153452941767e-06, + "loss": 2.7956, + "step": 5390 + }, + { + "epoch": 1.62, + "grad_norm": 25.812740325927734, + "learning_rate": 9.195148842337377e-06, + "loss": 2.2199, + "step": 5391 + }, + { + "epoch": 1.62, + "grad_norm": 17.734699249267578, + "learning_rate": 9.193144231732987e-06, + "loss": 2.1131, + "step": 5392 + }, + { + "epoch": 1.62, + "grad_norm": 15.593416213989258, + "learning_rate": 9.191139621128597e-06, + "loss": 1.3466, + "step": 5393 + }, + { + "epoch": 1.62, + "grad_norm": 14.880626678466797, + "learning_rate": 9.189135010524205e-06, + "loss": 1.4476, + "step": 5394 + }, + { + "epoch": 1.62, + "grad_norm": 17.891508102416992, + "learning_rate": 9.187130399919815e-06, + "loss": 1.5905, + "step": 5395 + }, + { + "epoch": 1.62, + "grad_norm": 20.343963623046875, + "learning_rate": 9.185125789315427e-06, + "loss": 2.0189, + "step": 5396 + }, + { + "epoch": 1.62, + "grad_norm": 19.60910415649414, + "learning_rate": 9.183121178711036e-06, + "loss": 2.392, + "step": 5397 + }, + { + "epoch": 1.62, + "grad_norm": 19.910167694091797, + "learning_rate": 9.181116568106646e-06, + "loss": 1.713, + "step": 5398 + }, + { + "epoch": 1.62, + "grad_norm": 13.35189151763916, + "learning_rate": 9.179111957502256e-06, + "loss": 1.2514, + "step": 5399 + }, + { + "epoch": 1.62, + "grad_norm": 27.091657638549805, + "learning_rate": 9.177107346897866e-06, + "loss": 1.4003, + "step": 5400 + }, + { + "epoch": 1.62, + "eval_loss": 0.18608930706977844, + "eval_runtime": 43.4962, + "eval_samples_per_second": 34.003, + "eval_steps_per_second": 34.003, + "step": 5400 + }, + { + "epoch": 1.62, + "grad_norm": 38.99711608886719, + "learning_rate": 9.175102736293476e-06, + "loss": 2.452, + "step": 5401 + }, + { + "epoch": 1.62, + "grad_norm": 17.14046859741211, + "learning_rate": 9.173098125689086e-06, + "loss": 1.0333, + "step": 5402 + }, + { + "epoch": 1.62, + "grad_norm": 14.545239448547363, + "learning_rate": 9.171093515084696e-06, + "loss": 1.1729, + "step": 5403 + }, + { + "epoch": 1.62, + "grad_norm": 21.479055404663086, + "learning_rate": 9.169088904480306e-06, + "loss": 2.0552, + "step": 5404 + }, + { + "epoch": 1.63, + "grad_norm": 52.615394592285156, + "learning_rate": 9.167084293875915e-06, + "loss": 1.6672, + "step": 5405 + }, + { + "epoch": 1.63, + "grad_norm": 12.193754196166992, + "learning_rate": 9.165079683271525e-06, + "loss": 1.3845, + "step": 5406 + }, + { + "epoch": 1.63, + "grad_norm": 10.990750312805176, + "learning_rate": 9.163075072667135e-06, + "loss": 1.3259, + "step": 5407 + }, + { + "epoch": 1.63, + "grad_norm": 14.663102149963379, + "learning_rate": 9.161070462062745e-06, + "loss": 1.432, + "step": 5408 + }, + { + "epoch": 1.63, + "grad_norm": 8.971065521240234, + "learning_rate": 9.159065851458355e-06, + "loss": 0.9139, + "step": 5409 + }, + { + "epoch": 1.63, + "grad_norm": 12.251411437988281, + "learning_rate": 9.157061240853965e-06, + "loss": 1.2786, + "step": 5410 + }, + { + "epoch": 1.63, + "grad_norm": 10.386260032653809, + "learning_rate": 9.155056630249575e-06, + "loss": 1.8492, + "step": 5411 + }, + { + "epoch": 1.63, + "grad_norm": 25.967205047607422, + "learning_rate": 9.153052019645185e-06, + "loss": 1.8648, + "step": 5412 + }, + { + "epoch": 1.63, + "grad_norm": 15.101587295532227, + "learning_rate": 9.151047409040794e-06, + "loss": 1.3361, + "step": 5413 + }, + { + "epoch": 1.63, + "grad_norm": 21.25626564025879, + "learning_rate": 9.149042798436405e-06, + "loss": 1.624, + "step": 5414 + }, + { + "epoch": 1.63, + "grad_norm": 12.912954330444336, + "learning_rate": 9.147038187832015e-06, + "loss": 1.2722, + "step": 5415 + }, + { + "epoch": 1.63, + "grad_norm": 19.924049377441406, + "learning_rate": 9.145033577227624e-06, + "loss": 1.8115, + "step": 5416 + }, + { + "epoch": 1.63, + "grad_norm": 10.341268539428711, + "learning_rate": 9.143028966623234e-06, + "loss": 1.1745, + "step": 5417 + }, + { + "epoch": 1.63, + "grad_norm": 19.03986167907715, + "learning_rate": 9.141024356018844e-06, + "loss": 1.8332, + "step": 5418 + }, + { + "epoch": 1.63, + "grad_norm": 10.237374305725098, + "learning_rate": 9.139019745414454e-06, + "loss": 1.1372, + "step": 5419 + }, + { + "epoch": 1.63, + "grad_norm": 12.311965942382812, + "learning_rate": 9.137015134810064e-06, + "loss": 1.3524, + "step": 5420 + }, + { + "epoch": 1.63, + "grad_norm": 17.118541717529297, + "learning_rate": 9.135010524205674e-06, + "loss": 1.4631, + "step": 5421 + }, + { + "epoch": 1.63, + "grad_norm": 13.466850280761719, + "learning_rate": 9.133005913601284e-06, + "loss": 2.0476, + "step": 5422 + }, + { + "epoch": 1.63, + "grad_norm": 22.777727127075195, + "learning_rate": 9.131001302996893e-06, + "loss": 1.6859, + "step": 5423 + }, + { + "epoch": 1.63, + "grad_norm": 24.398303985595703, + "learning_rate": 9.128996692392503e-06, + "loss": 2.0839, + "step": 5424 + }, + { + "epoch": 1.63, + "grad_norm": 22.156652450561523, + "learning_rate": 9.126992081788113e-06, + "loss": 1.3133, + "step": 5425 + }, + { + "epoch": 1.63, + "grad_norm": 10.810490608215332, + "learning_rate": 9.124987471183725e-06, + "loss": 0.9388, + "step": 5426 + }, + { + "epoch": 1.63, + "grad_norm": 17.22280502319336, + "learning_rate": 9.122982860579333e-06, + "loss": 2.081, + "step": 5427 + }, + { + "epoch": 1.63, + "grad_norm": 11.593786239624023, + "learning_rate": 9.120978249974943e-06, + "loss": 1.6318, + "step": 5428 + }, + { + "epoch": 1.63, + "grad_norm": 111.95779418945312, + "learning_rate": 9.118973639370553e-06, + "loss": 2.2157, + "step": 5429 + }, + { + "epoch": 1.63, + "grad_norm": 10.570384979248047, + "learning_rate": 9.116969028766163e-06, + "loss": 1.8122, + "step": 5430 + }, + { + "epoch": 1.63, + "grad_norm": 47.375701904296875, + "learning_rate": 9.114964418161772e-06, + "loss": 2.1275, + "step": 5431 + }, + { + "epoch": 1.63, + "grad_norm": 15.159667015075684, + "learning_rate": 9.112959807557382e-06, + "loss": 2.2646, + "step": 5432 + }, + { + "epoch": 1.63, + "grad_norm": 10.478726387023926, + "learning_rate": 9.110955196952994e-06, + "loss": 1.4377, + "step": 5433 + }, + { + "epoch": 1.63, + "grad_norm": 27.897777557373047, + "learning_rate": 9.108950586348602e-06, + "loss": 2.1688, + "step": 5434 + }, + { + "epoch": 1.63, + "grad_norm": 26.705507278442383, + "learning_rate": 9.106945975744212e-06, + "loss": 1.7822, + "step": 5435 + }, + { + "epoch": 1.63, + "grad_norm": 23.994443893432617, + "learning_rate": 9.104941365139822e-06, + "loss": 2.0833, + "step": 5436 + }, + { + "epoch": 1.63, + "grad_norm": 66.67716217041016, + "learning_rate": 9.102936754535432e-06, + "loss": 1.3532, + "step": 5437 + }, + { + "epoch": 1.63, + "grad_norm": 30.51035499572754, + "learning_rate": 9.100932143931042e-06, + "loss": 2.0045, + "step": 5438 + }, + { + "epoch": 1.64, + "grad_norm": 10.716803550720215, + "learning_rate": 9.098927533326652e-06, + "loss": 1.6747, + "step": 5439 + }, + { + "epoch": 1.64, + "grad_norm": 13.023651123046875, + "learning_rate": 9.096922922722262e-06, + "loss": 0.7083, + "step": 5440 + }, + { + "epoch": 1.64, + "grad_norm": 25.49052619934082, + "learning_rate": 9.094918312117873e-06, + "loss": 1.0234, + "step": 5441 + }, + { + "epoch": 1.64, + "grad_norm": 33.57606887817383, + "learning_rate": 9.092913701513481e-06, + "loss": 2.2643, + "step": 5442 + }, + { + "epoch": 1.64, + "grad_norm": 11.915518760681152, + "learning_rate": 9.090909090909091e-06, + "loss": 1.0967, + "step": 5443 + }, + { + "epoch": 1.64, + "grad_norm": 15.158368110656738, + "learning_rate": 9.088904480304701e-06, + "loss": 1.534, + "step": 5444 + }, + { + "epoch": 1.64, + "grad_norm": 11.216906547546387, + "learning_rate": 9.086899869700311e-06, + "loss": 1.4261, + "step": 5445 + }, + { + "epoch": 1.64, + "grad_norm": 26.098888397216797, + "learning_rate": 9.084895259095921e-06, + "loss": 2.0426, + "step": 5446 + }, + { + "epoch": 1.64, + "grad_norm": 13.751211166381836, + "learning_rate": 9.082890648491531e-06, + "loss": 1.5604, + "step": 5447 + }, + { + "epoch": 1.64, + "grad_norm": 12.62765884399414, + "learning_rate": 9.080886037887141e-06, + "loss": 1.7308, + "step": 5448 + }, + { + "epoch": 1.64, + "grad_norm": 20.71200942993164, + "learning_rate": 9.078881427282752e-06, + "loss": 0.9927, + "step": 5449 + }, + { + "epoch": 1.64, + "grad_norm": 23.00042152404785, + "learning_rate": 9.07687681667836e-06, + "loss": 2.4304, + "step": 5450 + }, + { + "epoch": 1.64, + "grad_norm": 28.70808982849121, + "learning_rate": 9.074872206073972e-06, + "loss": 1.5417, + "step": 5451 + }, + { + "epoch": 1.64, + "grad_norm": 9.605061531066895, + "learning_rate": 9.072867595469582e-06, + "loss": 1.3111, + "step": 5452 + }, + { + "epoch": 1.64, + "grad_norm": 7.0347490310668945, + "learning_rate": 9.07086298486519e-06, + "loss": 0.7902, + "step": 5453 + }, + { + "epoch": 1.64, + "grad_norm": 7.641110420227051, + "learning_rate": 9.0688583742608e-06, + "loss": 0.5109, + "step": 5454 + }, + { + "epoch": 1.64, + "grad_norm": 39.103248596191406, + "learning_rate": 9.06685376365641e-06, + "loss": 1.863, + "step": 5455 + }, + { + "epoch": 1.64, + "grad_norm": 10.603463172912598, + "learning_rate": 9.06484915305202e-06, + "loss": 1.5076, + "step": 5456 + }, + { + "epoch": 1.64, + "grad_norm": 53.90522766113281, + "learning_rate": 9.06284454244763e-06, + "loss": 3.2397, + "step": 5457 + }, + { + "epoch": 1.64, + "grad_norm": 55.33717727661133, + "learning_rate": 9.06083993184324e-06, + "loss": 2.0653, + "step": 5458 + }, + { + "epoch": 1.64, + "grad_norm": 18.05723762512207, + "learning_rate": 9.05883532123885e-06, + "loss": 1.8223, + "step": 5459 + }, + { + "epoch": 1.64, + "grad_norm": 45.47984313964844, + "learning_rate": 9.05683071063446e-06, + "loss": 1.3857, + "step": 5460 + }, + { + "epoch": 1.64, + "grad_norm": 64.19039916992188, + "learning_rate": 9.05482610003007e-06, + "loss": 1.9055, + "step": 5461 + }, + { + "epoch": 1.64, + "grad_norm": 16.403478622436523, + "learning_rate": 9.05282148942568e-06, + "loss": 1.2061, + "step": 5462 + }, + { + "epoch": 1.64, + "grad_norm": 15.951025009155273, + "learning_rate": 9.050816878821291e-06, + "loss": 2.1111, + "step": 5463 + }, + { + "epoch": 1.64, + "grad_norm": 32.44859313964844, + "learning_rate": 9.0488122682169e-06, + "loss": 1.3111, + "step": 5464 + }, + { + "epoch": 1.64, + "grad_norm": 22.92570686340332, + "learning_rate": 9.04680765761251e-06, + "loss": 1.4222, + "step": 5465 + }, + { + "epoch": 1.64, + "grad_norm": 11.310336112976074, + "learning_rate": 9.04480304700812e-06, + "loss": 1.0347, + "step": 5466 + }, + { + "epoch": 1.64, + "grad_norm": 48.00858688354492, + "learning_rate": 9.04279843640373e-06, + "loss": 1.5265, + "step": 5467 + }, + { + "epoch": 1.64, + "grad_norm": 22.504119873046875, + "learning_rate": 9.040793825799338e-06, + "loss": 1.2273, + "step": 5468 + }, + { + "epoch": 1.64, + "grad_norm": 23.860111236572266, + "learning_rate": 9.03878921519495e-06, + "loss": 1.4192, + "step": 5469 + }, + { + "epoch": 1.64, + "grad_norm": 11.303675651550293, + "learning_rate": 9.03678460459056e-06, + "loss": 1.0257, + "step": 5470 + }, + { + "epoch": 1.64, + "grad_norm": 12.571292877197266, + "learning_rate": 9.034779993986168e-06, + "loss": 1.4753, + "step": 5471 + }, + { + "epoch": 1.65, + "grad_norm": 45.509620666503906, + "learning_rate": 9.032775383381778e-06, + "loss": 2.5814, + "step": 5472 + }, + { + "epoch": 1.65, + "grad_norm": 11.296418190002441, + "learning_rate": 9.030770772777388e-06, + "loss": 0.8345, + "step": 5473 + }, + { + "epoch": 1.65, + "grad_norm": 19.737348556518555, + "learning_rate": 9.028766162172999e-06, + "loss": 2.0533, + "step": 5474 + }, + { + "epoch": 1.65, + "grad_norm": 32.28429412841797, + "learning_rate": 9.026761551568609e-06, + "loss": 1.0527, + "step": 5475 + }, + { + "epoch": 1.65, + "grad_norm": 5.6402106285095215, + "learning_rate": 9.024756940964219e-06, + "loss": 0.7558, + "step": 5476 + }, + { + "epoch": 1.65, + "grad_norm": 11.547632217407227, + "learning_rate": 9.022752330359829e-06, + "loss": 1.089, + "step": 5477 + }, + { + "epoch": 1.65, + "grad_norm": 7.933753967285156, + "learning_rate": 9.020747719755439e-06, + "loss": 1.1976, + "step": 5478 + }, + { + "epoch": 1.65, + "grad_norm": 25.738367080688477, + "learning_rate": 9.018743109151047e-06, + "loss": 2.3856, + "step": 5479 + }, + { + "epoch": 1.65, + "grad_norm": 14.972805976867676, + "learning_rate": 9.016738498546657e-06, + "loss": 2.7914, + "step": 5480 + }, + { + "epoch": 1.65, + "grad_norm": 23.992773056030273, + "learning_rate": 9.014733887942267e-06, + "loss": 2.568, + "step": 5481 + }, + { + "epoch": 1.65, + "grad_norm": 12.318902015686035, + "learning_rate": 9.012729277337878e-06, + "loss": 1.2272, + "step": 5482 + }, + { + "epoch": 1.65, + "grad_norm": 13.769071578979492, + "learning_rate": 9.010724666733488e-06, + "loss": 1.4336, + "step": 5483 + }, + { + "epoch": 1.65, + "grad_norm": 29.345212936401367, + "learning_rate": 9.008720056129098e-06, + "loss": 1.8268, + "step": 5484 + }, + { + "epoch": 1.65, + "grad_norm": 10.752758026123047, + "learning_rate": 9.006715445524708e-06, + "loss": 0.75, + "step": 5485 + }, + { + "epoch": 1.65, + "grad_norm": 11.773067474365234, + "learning_rate": 9.004710834920318e-06, + "loss": 1.5268, + "step": 5486 + }, + { + "epoch": 1.65, + "grad_norm": 20.230098724365234, + "learning_rate": 9.002706224315926e-06, + "loss": 1.2181, + "step": 5487 + }, + { + "epoch": 1.65, + "grad_norm": 19.085262298583984, + "learning_rate": 9.000701613711538e-06, + "loss": 2.0583, + "step": 5488 + }, + { + "epoch": 1.65, + "grad_norm": 15.87958812713623, + "learning_rate": 8.998697003107148e-06, + "loss": 1.2279, + "step": 5489 + }, + { + "epoch": 1.65, + "grad_norm": 10.969903945922852, + "learning_rate": 8.996692392502757e-06, + "loss": 1.4018, + "step": 5490 + }, + { + "epoch": 1.65, + "grad_norm": 16.4309024810791, + "learning_rate": 8.994687781898367e-06, + "loss": 1.7592, + "step": 5491 + }, + { + "epoch": 1.65, + "grad_norm": 17.328758239746094, + "learning_rate": 8.992683171293977e-06, + "loss": 1.3829, + "step": 5492 + }, + { + "epoch": 1.65, + "grad_norm": 23.75958824157715, + "learning_rate": 8.990678560689587e-06, + "loss": 1.7726, + "step": 5493 + }, + { + "epoch": 1.65, + "grad_norm": 7.305182933807373, + "learning_rate": 8.988673950085197e-06, + "loss": 0.9196, + "step": 5494 + }, + { + "epoch": 1.65, + "grad_norm": 23.182376861572266, + "learning_rate": 8.986669339480807e-06, + "loss": 2.1978, + "step": 5495 + }, + { + "epoch": 1.65, + "grad_norm": 16.009490966796875, + "learning_rate": 8.984664728876417e-06, + "loss": 1.3805, + "step": 5496 + }, + { + "epoch": 1.65, + "grad_norm": 25.598642349243164, + "learning_rate": 8.982660118272027e-06, + "loss": 1.7893, + "step": 5497 + }, + { + "epoch": 1.65, + "grad_norm": 14.513004302978516, + "learning_rate": 8.980655507667636e-06, + "loss": 1.0957, + "step": 5498 + }, + { + "epoch": 1.65, + "grad_norm": 33.75980758666992, + "learning_rate": 8.978650897063246e-06, + "loss": 1.9849, + "step": 5499 + }, + { + "epoch": 1.65, + "grad_norm": 17.729082107543945, + "learning_rate": 8.976646286458857e-06, + "loss": 1.2108, + "step": 5500 + }, + { + "epoch": 1.65, + "grad_norm": 15.00300121307373, + "learning_rate": 8.974641675854466e-06, + "loss": 1.9234, + "step": 5501 + }, + { + "epoch": 1.65, + "grad_norm": 37.787193298339844, + "learning_rate": 8.972637065250076e-06, + "loss": 1.5762, + "step": 5502 + }, + { + "epoch": 1.65, + "grad_norm": 18.156917572021484, + "learning_rate": 8.970632454645686e-06, + "loss": 1.2841, + "step": 5503 + }, + { + "epoch": 1.65, + "grad_norm": 17.250186920166016, + "learning_rate": 8.968627844041296e-06, + "loss": 1.4742, + "step": 5504 + }, + { + "epoch": 1.66, + "grad_norm": 27.10199737548828, + "learning_rate": 8.966623233436904e-06, + "loss": 1.9449, + "step": 5505 + }, + { + "epoch": 1.66, + "grad_norm": 17.242645263671875, + "learning_rate": 8.964618622832516e-06, + "loss": 1.3392, + "step": 5506 + }, + { + "epoch": 1.66, + "grad_norm": 12.527267456054688, + "learning_rate": 8.962614012228126e-06, + "loss": 1.559, + "step": 5507 + }, + { + "epoch": 1.66, + "grad_norm": 12.160088539123535, + "learning_rate": 8.960609401623735e-06, + "loss": 1.2504, + "step": 5508 + }, + { + "epoch": 1.66, + "grad_norm": 9.42102336883545, + "learning_rate": 8.958604791019345e-06, + "loss": 0.9488, + "step": 5509 + }, + { + "epoch": 1.66, + "grad_norm": 16.88702964782715, + "learning_rate": 8.956600180414955e-06, + "loss": 1.5164, + "step": 5510 + }, + { + "epoch": 1.66, + "grad_norm": 8.981550216674805, + "learning_rate": 8.954595569810565e-06, + "loss": 1.2683, + "step": 5511 + }, + { + "epoch": 1.66, + "grad_norm": 14.901409149169922, + "learning_rate": 8.952590959206175e-06, + "loss": 1.3492, + "step": 5512 + }, + { + "epoch": 1.66, + "grad_norm": 38.2581787109375, + "learning_rate": 8.950586348601785e-06, + "loss": 1.4274, + "step": 5513 + }, + { + "epoch": 1.66, + "grad_norm": 11.266697883605957, + "learning_rate": 8.948581737997395e-06, + "loss": 1.2673, + "step": 5514 + }, + { + "epoch": 1.66, + "grad_norm": 12.022287368774414, + "learning_rate": 8.946577127393005e-06, + "loss": 1.7178, + "step": 5515 + }, + { + "epoch": 1.66, + "grad_norm": 18.94673728942871, + "learning_rate": 8.944572516788614e-06, + "loss": 1.3636, + "step": 5516 + }, + { + "epoch": 1.66, + "grad_norm": 30.44729232788086, + "learning_rate": 8.942567906184224e-06, + "loss": 1.623, + "step": 5517 + }, + { + "epoch": 1.66, + "grad_norm": 13.971253395080566, + "learning_rate": 8.940563295579834e-06, + "loss": 1.3185, + "step": 5518 + }, + { + "epoch": 1.66, + "grad_norm": 10.269094467163086, + "learning_rate": 8.938558684975444e-06, + "loss": 1.2367, + "step": 5519 + }, + { + "epoch": 1.66, + "grad_norm": 19.084022521972656, + "learning_rate": 8.936554074371054e-06, + "loss": 0.9847, + "step": 5520 + }, + { + "epoch": 1.66, + "eval_loss": 0.1914350539445877, + "eval_runtime": 43.4865, + "eval_samples_per_second": 34.011, + "eval_steps_per_second": 34.011, + "step": 5520 + }, + { + "epoch": 1.66, + "grad_norm": 11.31253719329834, + "learning_rate": 8.934549463766664e-06, + "loss": 1.363, + "step": 5521 + }, + { + "epoch": 1.66, + "grad_norm": 22.407804489135742, + "learning_rate": 8.932544853162274e-06, + "loss": 1.953, + "step": 5522 + }, + { + "epoch": 1.66, + "grad_norm": 36.859561920166016, + "learning_rate": 8.930540242557884e-06, + "loss": 2.4004, + "step": 5523 + }, + { + "epoch": 1.66, + "grad_norm": 28.108509063720703, + "learning_rate": 8.928535631953493e-06, + "loss": 1.4061, + "step": 5524 + }, + { + "epoch": 1.66, + "grad_norm": 20.431781768798828, + "learning_rate": 8.926531021349104e-06, + "loss": 1.4469, + "step": 5525 + }, + { + "epoch": 1.66, + "grad_norm": 20.850435256958008, + "learning_rate": 8.924526410744714e-06, + "loss": 1.789, + "step": 5526 + }, + { + "epoch": 1.66, + "grad_norm": 11.911688804626465, + "learning_rate": 8.922521800140323e-06, + "loss": 2.0367, + "step": 5527 + }, + { + "epoch": 1.66, + "grad_norm": 10.329282760620117, + "learning_rate": 8.920517189535933e-06, + "loss": 1.4678, + "step": 5528 + }, + { + "epoch": 1.66, + "grad_norm": 12.712068557739258, + "learning_rate": 8.918512578931543e-06, + "loss": 1.6763, + "step": 5529 + }, + { + "epoch": 1.66, + "grad_norm": 58.54856872558594, + "learning_rate": 8.916507968327153e-06, + "loss": 2.0221, + "step": 5530 + }, + { + "epoch": 1.66, + "grad_norm": 13.266801834106445, + "learning_rate": 8.914503357722763e-06, + "loss": 1.7743, + "step": 5531 + }, + { + "epoch": 1.66, + "grad_norm": 27.546993255615234, + "learning_rate": 8.912498747118373e-06, + "loss": 2.6575, + "step": 5532 + }, + { + "epoch": 1.66, + "grad_norm": 11.523126602172852, + "learning_rate": 8.910494136513983e-06, + "loss": 1.1144, + "step": 5533 + }, + { + "epoch": 1.66, + "grad_norm": 11.785201072692871, + "learning_rate": 8.908489525909593e-06, + "loss": 1.1003, + "step": 5534 + }, + { + "epoch": 1.66, + "grad_norm": 14.82313346862793, + "learning_rate": 8.906484915305202e-06, + "loss": 1.8222, + "step": 5535 + }, + { + "epoch": 1.66, + "grad_norm": 14.647685050964355, + "learning_rate": 8.904480304700812e-06, + "loss": 0.8845, + "step": 5536 + }, + { + "epoch": 1.66, + "grad_norm": 8.299193382263184, + "learning_rate": 8.902475694096424e-06, + "loss": 1.3426, + "step": 5537 + }, + { + "epoch": 1.67, + "grad_norm": 31.66522979736328, + "learning_rate": 8.900471083492032e-06, + "loss": 1.5596, + "step": 5538 + }, + { + "epoch": 1.67, + "grad_norm": 10.633186340332031, + "learning_rate": 8.898466472887642e-06, + "loss": 0.8703, + "step": 5539 + }, + { + "epoch": 1.67, + "grad_norm": 12.473995208740234, + "learning_rate": 8.896461862283252e-06, + "loss": 1.1009, + "step": 5540 + }, + { + "epoch": 1.67, + "grad_norm": 16.58210563659668, + "learning_rate": 8.894457251678862e-06, + "loss": 0.7829, + "step": 5541 + }, + { + "epoch": 1.67, + "grad_norm": 9.069561004638672, + "learning_rate": 8.89245264107447e-06, + "loss": 0.5217, + "step": 5542 + }, + { + "epoch": 1.67, + "grad_norm": 14.045775413513184, + "learning_rate": 8.890448030470083e-06, + "loss": 1.2927, + "step": 5543 + }, + { + "epoch": 1.67, + "grad_norm": 10.430898666381836, + "learning_rate": 8.888443419865693e-06, + "loss": 1.0182, + "step": 5544 + }, + { + "epoch": 1.67, + "grad_norm": 28.892423629760742, + "learning_rate": 8.886438809261303e-06, + "loss": 2.1135, + "step": 5545 + }, + { + "epoch": 1.67, + "grad_norm": 33.67451095581055, + "learning_rate": 8.884434198656911e-06, + "loss": 2.2505, + "step": 5546 + }, + { + "epoch": 1.67, + "grad_norm": 22.55853271484375, + "learning_rate": 8.882429588052521e-06, + "loss": 1.275, + "step": 5547 + }, + { + "epoch": 1.67, + "grad_norm": 11.746838569641113, + "learning_rate": 8.880424977448131e-06, + "loss": 1.0598, + "step": 5548 + }, + { + "epoch": 1.67, + "grad_norm": 18.632312774658203, + "learning_rate": 8.878420366843741e-06, + "loss": 2.3075, + "step": 5549 + }, + { + "epoch": 1.67, + "grad_norm": 23.912031173706055, + "learning_rate": 8.876415756239351e-06, + "loss": 1.5192, + "step": 5550 + }, + { + "epoch": 1.67, + "grad_norm": 19.950305938720703, + "learning_rate": 8.874411145634962e-06, + "loss": 1.5378, + "step": 5551 + }, + { + "epoch": 1.67, + "grad_norm": 15.393779754638672, + "learning_rate": 8.872406535030572e-06, + "loss": 1.4361, + "step": 5552 + }, + { + "epoch": 1.67, + "grad_norm": 11.668610572814941, + "learning_rate": 8.87040192442618e-06, + "loss": 1.1552, + "step": 5553 + }, + { + "epoch": 1.67, + "grad_norm": 11.19984245300293, + "learning_rate": 8.86839731382179e-06, + "loss": 1.3071, + "step": 5554 + }, + { + "epoch": 1.67, + "grad_norm": 12.745835304260254, + "learning_rate": 8.8663927032174e-06, + "loss": 0.7712, + "step": 5555 + }, + { + "epoch": 1.67, + "grad_norm": 32.49069595336914, + "learning_rate": 8.86438809261301e-06, + "loss": 1.9268, + "step": 5556 + }, + { + "epoch": 1.67, + "grad_norm": 8.485462188720703, + "learning_rate": 8.86238348200862e-06, + "loss": 1.0959, + "step": 5557 + }, + { + "epoch": 1.67, + "grad_norm": 14.367705345153809, + "learning_rate": 8.86037887140423e-06, + "loss": 1.0967, + "step": 5558 + }, + { + "epoch": 1.67, + "grad_norm": 22.95987892150879, + "learning_rate": 8.85837426079984e-06, + "loss": 1.6232, + "step": 5559 + }, + { + "epoch": 1.67, + "grad_norm": 15.968416213989258, + "learning_rate": 8.85636965019545e-06, + "loss": 1.2239, + "step": 5560 + }, + { + "epoch": 1.67, + "grad_norm": 10.333566665649414, + "learning_rate": 8.854365039591059e-06, + "loss": 1.248, + "step": 5561 + }, + { + "epoch": 1.67, + "grad_norm": 45.59640884399414, + "learning_rate": 8.85236042898667e-06, + "loss": 1.6009, + "step": 5562 + }, + { + "epoch": 1.67, + "grad_norm": 28.932910919189453, + "learning_rate": 8.85035581838228e-06, + "loss": 1.88, + "step": 5563 + }, + { + "epoch": 1.67, + "grad_norm": 12.517439842224121, + "learning_rate": 8.84835120777789e-06, + "loss": 1.8985, + "step": 5564 + }, + { + "epoch": 1.67, + "grad_norm": 17.193233489990234, + "learning_rate": 8.8463465971735e-06, + "loss": 1.1834, + "step": 5565 + }, + { + "epoch": 1.67, + "grad_norm": 26.31612205505371, + "learning_rate": 8.84434198656911e-06, + "loss": 2.4826, + "step": 5566 + }, + { + "epoch": 1.67, + "grad_norm": 16.692577362060547, + "learning_rate": 8.84233737596472e-06, + "loss": 1.3551, + "step": 5567 + }, + { + "epoch": 1.67, + "grad_norm": 19.141315460205078, + "learning_rate": 8.84033276536033e-06, + "loss": 1.5564, + "step": 5568 + }, + { + "epoch": 1.67, + "grad_norm": 12.95341682434082, + "learning_rate": 8.83832815475594e-06, + "loss": 1.3491, + "step": 5569 + }, + { + "epoch": 1.67, + "grad_norm": 11.91671085357666, + "learning_rate": 8.83632354415155e-06, + "loss": 1.4844, + "step": 5570 + }, + { + "epoch": 1.67, + "grad_norm": 17.263015747070312, + "learning_rate": 8.83431893354716e-06, + "loss": 1.5285, + "step": 5571 + }, + { + "epoch": 1.68, + "grad_norm": 10.586352348327637, + "learning_rate": 8.832314322942768e-06, + "loss": 1.4477, + "step": 5572 + }, + { + "epoch": 1.68, + "grad_norm": 17.991832733154297, + "learning_rate": 8.830309712338378e-06, + "loss": 1.3879, + "step": 5573 + }, + { + "epoch": 1.68, + "grad_norm": 19.859966278076172, + "learning_rate": 8.82830510173399e-06, + "loss": 1.8104, + "step": 5574 + }, + { + "epoch": 1.68, + "grad_norm": 436.9852294921875, + "learning_rate": 8.826300491129598e-06, + "loss": 2.6175, + "step": 5575 + }, + { + "epoch": 1.68, + "grad_norm": 12.303122520446777, + "learning_rate": 8.824295880525209e-06, + "loss": 0.9817, + "step": 5576 + }, + { + "epoch": 1.68, + "grad_norm": 13.081160545349121, + "learning_rate": 8.822291269920819e-06, + "loss": 2.0876, + "step": 5577 + }, + { + "epoch": 1.68, + "grad_norm": 9.616783142089844, + "learning_rate": 8.820286659316429e-06, + "loss": 1.7439, + "step": 5578 + }, + { + "epoch": 1.68, + "grad_norm": 15.427207946777344, + "learning_rate": 8.818282048712039e-06, + "loss": 1.205, + "step": 5579 + }, + { + "epoch": 1.68, + "grad_norm": 17.407861709594727, + "learning_rate": 8.816277438107649e-06, + "loss": 1.9737, + "step": 5580 + }, + { + "epoch": 1.68, + "grad_norm": 30.608501434326172, + "learning_rate": 8.814272827503259e-06, + "loss": 1.4772, + "step": 5581 + }, + { + "epoch": 1.68, + "grad_norm": 17.050554275512695, + "learning_rate": 8.812268216898869e-06, + "loss": 1.2003, + "step": 5582 + }, + { + "epoch": 1.68, + "grad_norm": 21.977340698242188, + "learning_rate": 8.810263606294477e-06, + "loss": 2.4612, + "step": 5583 + }, + { + "epoch": 1.68, + "grad_norm": 48.15430450439453, + "learning_rate": 8.808258995690088e-06, + "loss": 1.7486, + "step": 5584 + }, + { + "epoch": 1.68, + "grad_norm": 15.752915382385254, + "learning_rate": 8.806254385085698e-06, + "loss": 1.6484, + "step": 5585 + }, + { + "epoch": 1.68, + "grad_norm": 37.53594207763672, + "learning_rate": 8.804249774481308e-06, + "loss": 2.1715, + "step": 5586 + }, + { + "epoch": 1.68, + "grad_norm": 10.964682579040527, + "learning_rate": 8.802245163876918e-06, + "loss": 1.4082, + "step": 5587 + }, + { + "epoch": 1.68, + "grad_norm": 23.61287498474121, + "learning_rate": 8.800240553272528e-06, + "loss": 0.7415, + "step": 5588 + }, + { + "epoch": 1.68, + "grad_norm": 15.340435981750488, + "learning_rate": 8.798235942668138e-06, + "loss": 1.5422, + "step": 5589 + }, + { + "epoch": 1.68, + "grad_norm": 12.990081787109375, + "learning_rate": 8.796231332063746e-06, + "loss": 1.4462, + "step": 5590 + }, + { + "epoch": 1.68, + "grad_norm": 10.152392387390137, + "learning_rate": 8.794226721459356e-06, + "loss": 0.9929, + "step": 5591 + }, + { + "epoch": 1.68, + "grad_norm": 13.210673332214355, + "learning_rate": 8.792222110854968e-06, + "loss": 1.7244, + "step": 5592 + }, + { + "epoch": 1.68, + "grad_norm": 13.106926918029785, + "learning_rate": 8.790217500250577e-06, + "loss": 1.1276, + "step": 5593 + }, + { + "epoch": 1.68, + "grad_norm": 11.7565336227417, + "learning_rate": 8.788212889646187e-06, + "loss": 1.1467, + "step": 5594 + }, + { + "epoch": 1.68, + "grad_norm": 32.24980163574219, + "learning_rate": 8.786208279041797e-06, + "loss": 1.8956, + "step": 5595 + }, + { + "epoch": 1.68, + "grad_norm": 15.186212539672852, + "learning_rate": 8.784203668437407e-06, + "loss": 1.8275, + "step": 5596 + }, + { + "epoch": 1.68, + "grad_norm": 26.510732650756836, + "learning_rate": 8.782199057833017e-06, + "loss": 1.2493, + "step": 5597 + }, + { + "epoch": 1.68, + "grad_norm": 9.903314590454102, + "learning_rate": 8.780194447228625e-06, + "loss": 0.8917, + "step": 5598 + }, + { + "epoch": 1.68, + "grad_norm": 9.444249153137207, + "learning_rate": 8.778189836624237e-06, + "loss": 1.073, + "step": 5599 + }, + { + "epoch": 1.68, + "grad_norm": 17.22249412536621, + "learning_rate": 8.776185226019847e-06, + "loss": 1.2148, + "step": 5600 + }, + { + "epoch": 1.68, + "grad_norm": 9.89676284790039, + "learning_rate": 8.774180615415456e-06, + "loss": 1.2275, + "step": 5601 + }, + { + "epoch": 1.68, + "grad_norm": 12.96465015411377, + "learning_rate": 8.772176004811066e-06, + "loss": 1.0976, + "step": 5602 + }, + { + "epoch": 1.68, + "grad_norm": 15.916191101074219, + "learning_rate": 8.770171394206676e-06, + "loss": 1.7115, + "step": 5603 + }, + { + "epoch": 1.68, + "grad_norm": 18.037708282470703, + "learning_rate": 8.768166783602286e-06, + "loss": 1.7032, + "step": 5604 + }, + { + "epoch": 1.69, + "grad_norm": 15.334985733032227, + "learning_rate": 8.766162172997896e-06, + "loss": 1.3752, + "step": 5605 + }, + { + "epoch": 1.69, + "grad_norm": 9.522956848144531, + "learning_rate": 8.764157562393506e-06, + "loss": 0.623, + "step": 5606 + }, + { + "epoch": 1.69, + "grad_norm": 21.579017639160156, + "learning_rate": 8.762152951789116e-06, + "loss": 1.5285, + "step": 5607 + }, + { + "epoch": 1.69, + "grad_norm": 10.721216201782227, + "learning_rate": 8.760148341184726e-06, + "loss": 0.893, + "step": 5608 + }, + { + "epoch": 1.69, + "grad_norm": 29.23563575744629, + "learning_rate": 8.758143730580335e-06, + "loss": 2.0828, + "step": 5609 + }, + { + "epoch": 1.69, + "grad_norm": 5.085595607757568, + "learning_rate": 8.756139119975945e-06, + "loss": 0.6403, + "step": 5610 + }, + { + "epoch": 1.69, + "grad_norm": 16.123058319091797, + "learning_rate": 8.754134509371556e-06, + "loss": 1.7075, + "step": 5611 + }, + { + "epoch": 1.69, + "grad_norm": 54.834938049316406, + "learning_rate": 8.752129898767165e-06, + "loss": 1.6164, + "step": 5612 + }, + { + "epoch": 1.69, + "grad_norm": 15.420845031738281, + "learning_rate": 8.750125288162775e-06, + "loss": 1.3926, + "step": 5613 + }, + { + "epoch": 1.69, + "grad_norm": 14.784860610961914, + "learning_rate": 8.748120677558385e-06, + "loss": 1.175, + "step": 5614 + }, + { + "epoch": 1.69, + "grad_norm": 32.64680862426758, + "learning_rate": 8.746116066953995e-06, + "loss": 1.5805, + "step": 5615 + }, + { + "epoch": 1.69, + "grad_norm": 28.060972213745117, + "learning_rate": 8.744111456349605e-06, + "loss": 1.803, + "step": 5616 + }, + { + "epoch": 1.69, + "grad_norm": 12.223137855529785, + "learning_rate": 8.742106845745215e-06, + "loss": 1.149, + "step": 5617 + }, + { + "epoch": 1.69, + "grad_norm": 14.413814544677734, + "learning_rate": 8.740102235140825e-06, + "loss": 1.5615, + "step": 5618 + }, + { + "epoch": 1.69, + "grad_norm": 24.743038177490234, + "learning_rate": 8.738097624536435e-06, + "loss": 1.5725, + "step": 5619 + }, + { + "epoch": 1.69, + "grad_norm": 37.43949508666992, + "learning_rate": 8.736093013932044e-06, + "loss": 2.383, + "step": 5620 + }, + { + "epoch": 1.69, + "grad_norm": 19.173534393310547, + "learning_rate": 8.734088403327654e-06, + "loss": 2.1207, + "step": 5621 + }, + { + "epoch": 1.69, + "grad_norm": 17.49724006652832, + "learning_rate": 8.732083792723264e-06, + "loss": 1.7264, + "step": 5622 + }, + { + "epoch": 1.69, + "grad_norm": 9.76134967803955, + "learning_rate": 8.730079182118874e-06, + "loss": 1.5944, + "step": 5623 + }, + { + "epoch": 1.69, + "grad_norm": 11.717310905456543, + "learning_rate": 8.728074571514484e-06, + "loss": 1.756, + "step": 5624 + }, + { + "epoch": 1.69, + "grad_norm": 21.17192840576172, + "learning_rate": 8.726069960910094e-06, + "loss": 2.2556, + "step": 5625 + }, + { + "epoch": 1.69, + "grad_norm": 16.35768699645996, + "learning_rate": 8.724065350305704e-06, + "loss": 1.2782, + "step": 5626 + }, + { + "epoch": 1.69, + "grad_norm": 12.22645378112793, + "learning_rate": 8.722060739701313e-06, + "loss": 1.3418, + "step": 5627 + }, + { + "epoch": 1.69, + "grad_norm": 19.546850204467773, + "learning_rate": 8.720056129096923e-06, + "loss": 1.2375, + "step": 5628 + }, + { + "epoch": 1.69, + "grad_norm": 14.505806922912598, + "learning_rate": 8.718051518492535e-06, + "loss": 1.5247, + "step": 5629 + }, + { + "epoch": 1.69, + "grad_norm": 11.707478523254395, + "learning_rate": 8.716046907888145e-06, + "loss": 1.2686, + "step": 5630 + }, + { + "epoch": 1.69, + "grad_norm": 12.525168418884277, + "learning_rate": 8.714042297283753e-06, + "loss": 2.1298, + "step": 5631 + }, + { + "epoch": 1.69, + "grad_norm": 13.70942497253418, + "learning_rate": 8.712037686679363e-06, + "loss": 0.8167, + "step": 5632 + }, + { + "epoch": 1.69, + "grad_norm": 18.232576370239258, + "learning_rate": 8.710033076074973e-06, + "loss": 1.0261, + "step": 5633 + }, + { + "epoch": 1.69, + "grad_norm": 19.63440704345703, + "learning_rate": 8.708028465470583e-06, + "loss": 2.1018, + "step": 5634 + }, + { + "epoch": 1.69, + "grad_norm": 17.513425827026367, + "learning_rate": 8.706023854866193e-06, + "loss": 1.2959, + "step": 5635 + }, + { + "epoch": 1.69, + "grad_norm": 15.64637565612793, + "learning_rate": 8.704019244261803e-06, + "loss": 1.8533, + "step": 5636 + }, + { + "epoch": 1.69, + "grad_norm": 7.151846885681152, + "learning_rate": 8.702014633657414e-06, + "loss": 0.8085, + "step": 5637 + }, + { + "epoch": 1.7, + "grad_norm": 9.188348770141602, + "learning_rate": 8.700010023053022e-06, + "loss": 1.1322, + "step": 5638 + }, + { + "epoch": 1.7, + "grad_norm": 9.800728797912598, + "learning_rate": 8.698005412448632e-06, + "loss": 0.9624, + "step": 5639 + }, + { + "epoch": 1.7, + "grad_norm": 14.96066951751709, + "learning_rate": 8.696000801844242e-06, + "loss": 1.3465, + "step": 5640 + }, + { + "epoch": 1.7, + "eval_loss": 0.1950412392616272, + "eval_runtime": 43.6602, + "eval_samples_per_second": 33.875, + "eval_steps_per_second": 33.875, + "step": 5640 + }, + { + "epoch": 1.7, + "grad_norm": 21.22214698791504, + "learning_rate": 8.693996191239852e-06, + "loss": 1.9302, + "step": 5641 + }, + { + "epoch": 1.7, + "grad_norm": 11.030116081237793, + "learning_rate": 8.691991580635462e-06, + "loss": 0.9403, + "step": 5642 + }, + { + "epoch": 1.7, + "grad_norm": 10.79010009765625, + "learning_rate": 8.689986970031072e-06, + "loss": 1.2434, + "step": 5643 + }, + { + "epoch": 1.7, + "grad_norm": 17.979541778564453, + "learning_rate": 8.687982359426682e-06, + "loss": 1.4883, + "step": 5644 + }, + { + "epoch": 1.7, + "grad_norm": 11.965887069702148, + "learning_rate": 8.685977748822292e-06, + "loss": 1.381, + "step": 5645 + }, + { + "epoch": 1.7, + "grad_norm": 16.491119384765625, + "learning_rate": 8.683973138217901e-06, + "loss": 1.1538, + "step": 5646 + }, + { + "epoch": 1.7, + "grad_norm": 12.865142822265625, + "learning_rate": 8.681968527613511e-06, + "loss": 1.2, + "step": 5647 + }, + { + "epoch": 1.7, + "grad_norm": 22.270700454711914, + "learning_rate": 8.679963917009123e-06, + "loss": 1.3727, + "step": 5648 + }, + { + "epoch": 1.7, + "grad_norm": 31.27298927307129, + "learning_rate": 8.677959306404731e-06, + "loss": 2.6597, + "step": 5649 + }, + { + "epoch": 1.7, + "grad_norm": 10.323209762573242, + "learning_rate": 8.675954695800341e-06, + "loss": 0.9057, + "step": 5650 + }, + { + "epoch": 1.7, + "grad_norm": 14.9830961227417, + "learning_rate": 8.673950085195951e-06, + "loss": 2.7499, + "step": 5651 + }, + { + "epoch": 1.7, + "grad_norm": 17.780193328857422, + "learning_rate": 8.671945474591561e-06, + "loss": 1.4829, + "step": 5652 + }, + { + "epoch": 1.7, + "grad_norm": 14.081412315368652, + "learning_rate": 8.669940863987171e-06, + "loss": 1.159, + "step": 5653 + }, + { + "epoch": 1.7, + "grad_norm": 16.137582778930664, + "learning_rate": 8.667936253382782e-06, + "loss": 1.0004, + "step": 5654 + }, + { + "epoch": 1.7, + "grad_norm": 21.85881805419922, + "learning_rate": 8.665931642778392e-06, + "loss": 1.4793, + "step": 5655 + }, + { + "epoch": 1.7, + "grad_norm": 21.70241355895996, + "learning_rate": 8.663927032174002e-06, + "loss": 1.6126, + "step": 5656 + }, + { + "epoch": 1.7, + "grad_norm": 16.00739288330078, + "learning_rate": 8.66192242156961e-06, + "loss": 1.7453, + "step": 5657 + }, + { + "epoch": 1.7, + "grad_norm": 41.264095306396484, + "learning_rate": 8.65991781096522e-06, + "loss": 1.8027, + "step": 5658 + }, + { + "epoch": 1.7, + "grad_norm": 59.908199310302734, + "learning_rate": 8.65791320036083e-06, + "loss": 1.4352, + "step": 5659 + }, + { + "epoch": 1.7, + "grad_norm": 26.252933502197266, + "learning_rate": 8.65590858975644e-06, + "loss": 1.1851, + "step": 5660 + }, + { + "epoch": 1.7, + "grad_norm": 14.983129501342773, + "learning_rate": 8.65390397915205e-06, + "loss": 1.1969, + "step": 5661 + }, + { + "epoch": 1.7, + "grad_norm": 7.037642478942871, + "learning_rate": 8.65189936854766e-06, + "loss": 0.6684, + "step": 5662 + }, + { + "epoch": 1.7, + "grad_norm": 12.585198402404785, + "learning_rate": 8.64989475794327e-06, + "loss": 1.4334, + "step": 5663 + }, + { + "epoch": 1.7, + "grad_norm": 21.677915573120117, + "learning_rate": 8.64789014733888e-06, + "loss": 2.5165, + "step": 5664 + }, + { + "epoch": 1.7, + "grad_norm": 23.204702377319336, + "learning_rate": 8.645885536734489e-06, + "loss": 1.5472, + "step": 5665 + }, + { + "epoch": 1.7, + "grad_norm": 10.986197471618652, + "learning_rate": 8.643880926130101e-06, + "loss": 1.6953, + "step": 5666 + }, + { + "epoch": 1.7, + "grad_norm": 11.328323364257812, + "learning_rate": 8.641876315525711e-06, + "loss": 0.5602, + "step": 5667 + }, + { + "epoch": 1.7, + "grad_norm": 9.922806739807129, + "learning_rate": 8.63987170492132e-06, + "loss": 1.3713, + "step": 5668 + }, + { + "epoch": 1.7, + "grad_norm": 8.560861587524414, + "learning_rate": 8.63786709431693e-06, + "loss": 1.0511, + "step": 5669 + }, + { + "epoch": 1.7, + "grad_norm": 38.06193923950195, + "learning_rate": 8.63586248371254e-06, + "loss": 1.5991, + "step": 5670 + }, + { + "epoch": 1.71, + "grad_norm": 17.680063247680664, + "learning_rate": 8.63385787310815e-06, + "loss": 2.7477, + "step": 5671 + }, + { + "epoch": 1.71, + "grad_norm": 29.49445915222168, + "learning_rate": 8.63185326250376e-06, + "loss": 1.7613, + "step": 5672 + }, + { + "epoch": 1.71, + "grad_norm": 32.89970397949219, + "learning_rate": 8.62984865189937e-06, + "loss": 1.6423, + "step": 5673 + }, + { + "epoch": 1.71, + "grad_norm": 40.36264419555664, + "learning_rate": 8.62784404129498e-06, + "loss": 1.2661, + "step": 5674 + }, + { + "epoch": 1.71, + "grad_norm": 15.551639556884766, + "learning_rate": 8.625839430690588e-06, + "loss": 1.6711, + "step": 5675 + }, + { + "epoch": 1.71, + "grad_norm": 30.854095458984375, + "learning_rate": 8.623834820086198e-06, + "loss": 1.0708, + "step": 5676 + }, + { + "epoch": 1.71, + "grad_norm": 30.149341583251953, + "learning_rate": 8.621830209481808e-06, + "loss": 1.7262, + "step": 5677 + }, + { + "epoch": 1.71, + "grad_norm": 17.061527252197266, + "learning_rate": 8.619825598877418e-06, + "loss": 1.4248, + "step": 5678 + }, + { + "epoch": 1.71, + "grad_norm": 78.73602294921875, + "learning_rate": 8.617820988273029e-06, + "loss": 2.1561, + "step": 5679 + }, + { + "epoch": 1.71, + "grad_norm": 51.467491149902344, + "learning_rate": 8.615816377668639e-06, + "loss": 1.7134, + "step": 5680 + }, + { + "epoch": 1.71, + "grad_norm": 17.935325622558594, + "learning_rate": 8.613811767064249e-06, + "loss": 1.576, + "step": 5681 + }, + { + "epoch": 1.71, + "grad_norm": 14.966944694519043, + "learning_rate": 8.611807156459859e-06, + "loss": 2.0311, + "step": 5682 + }, + { + "epoch": 1.71, + "grad_norm": 14.564214706420898, + "learning_rate": 8.609802545855467e-06, + "loss": 1.7562, + "step": 5683 + }, + { + "epoch": 1.71, + "grad_norm": 11.899775505065918, + "learning_rate": 8.607797935251077e-06, + "loss": 1.2177, + "step": 5684 + }, + { + "epoch": 1.71, + "grad_norm": 14.998151779174805, + "learning_rate": 8.605793324646689e-06, + "loss": 1.6503, + "step": 5685 + }, + { + "epoch": 1.71, + "grad_norm": 18.94357681274414, + "learning_rate": 8.603788714042297e-06, + "loss": 1.411, + "step": 5686 + }, + { + "epoch": 1.71, + "grad_norm": 14.1841459274292, + "learning_rate": 8.601784103437908e-06, + "loss": 1.5919, + "step": 5687 + }, + { + "epoch": 1.71, + "grad_norm": 32.609100341796875, + "learning_rate": 8.599779492833518e-06, + "loss": 1.7305, + "step": 5688 + }, + { + "epoch": 1.71, + "grad_norm": 35.52297592163086, + "learning_rate": 8.597774882229128e-06, + "loss": 2.0784, + "step": 5689 + }, + { + "epoch": 1.71, + "grad_norm": 13.570123672485352, + "learning_rate": 8.595770271624738e-06, + "loss": 1.0387, + "step": 5690 + }, + { + "epoch": 1.71, + "grad_norm": 12.182998657226562, + "learning_rate": 8.593765661020348e-06, + "loss": 1.042, + "step": 5691 + }, + { + "epoch": 1.71, + "grad_norm": 56.89901351928711, + "learning_rate": 8.591761050415958e-06, + "loss": 2.2758, + "step": 5692 + }, + { + "epoch": 1.71, + "grad_norm": 16.903789520263672, + "learning_rate": 8.589756439811568e-06, + "loss": 1.0774, + "step": 5693 + }, + { + "epoch": 1.71, + "grad_norm": 36.77326583862305, + "learning_rate": 8.587751829207176e-06, + "loss": 2.6291, + "step": 5694 + }, + { + "epoch": 1.71, + "grad_norm": 24.694618225097656, + "learning_rate": 8.585747218602787e-06, + "loss": 1.4135, + "step": 5695 + }, + { + "epoch": 1.71, + "grad_norm": 44.71474075317383, + "learning_rate": 8.583742607998397e-06, + "loss": 1.5254, + "step": 5696 + }, + { + "epoch": 1.71, + "grad_norm": 27.899707794189453, + "learning_rate": 8.581737997394007e-06, + "loss": 1.8512, + "step": 5697 + }, + { + "epoch": 1.71, + "grad_norm": 8.52084732055664, + "learning_rate": 8.579733386789617e-06, + "loss": 1.3119, + "step": 5698 + }, + { + "epoch": 1.71, + "grad_norm": 12.418741226196289, + "learning_rate": 8.577728776185227e-06, + "loss": 1.7178, + "step": 5699 + }, + { + "epoch": 1.71, + "grad_norm": 11.793305397033691, + "learning_rate": 8.575724165580837e-06, + "loss": 1.0568, + "step": 5700 + }, + { + "epoch": 1.71, + "grad_norm": 9.70165729522705, + "learning_rate": 8.573719554976447e-06, + "loss": 1.6653, + "step": 5701 + }, + { + "epoch": 1.71, + "grad_norm": 15.392131805419922, + "learning_rate": 8.571714944372055e-06, + "loss": 2.2157, + "step": 5702 + }, + { + "epoch": 1.71, + "grad_norm": 35.93635177612305, + "learning_rate": 8.569710333767667e-06, + "loss": 1.2638, + "step": 5703 + }, + { + "epoch": 1.71, + "grad_norm": 12.030016899108887, + "learning_rate": 8.567705723163277e-06, + "loss": 1.3019, + "step": 5704 + }, + { + "epoch": 1.72, + "grad_norm": 39.288814544677734, + "learning_rate": 8.565701112558886e-06, + "loss": 2.3544, + "step": 5705 + }, + { + "epoch": 1.72, + "grad_norm": 23.307493209838867, + "learning_rate": 8.563696501954496e-06, + "loss": 1.5503, + "step": 5706 + }, + { + "epoch": 1.72, + "grad_norm": 26.393465042114258, + "learning_rate": 8.561691891350106e-06, + "loss": 2.5251, + "step": 5707 + }, + { + "epoch": 1.72, + "grad_norm": 10.045159339904785, + "learning_rate": 8.559687280745716e-06, + "loss": 1.1675, + "step": 5708 + }, + { + "epoch": 1.72, + "grad_norm": 35.52949142456055, + "learning_rate": 8.557682670141326e-06, + "loss": 2.1613, + "step": 5709 + }, + { + "epoch": 1.72, + "grad_norm": 19.822158813476562, + "learning_rate": 8.555678059536936e-06, + "loss": 1.5757, + "step": 5710 + }, + { + "epoch": 1.72, + "grad_norm": 41.30690383911133, + "learning_rate": 8.553673448932546e-06, + "loss": 2.7388, + "step": 5711 + }, + { + "epoch": 1.72, + "grad_norm": 20.42682456970215, + "learning_rate": 8.551668838328155e-06, + "loss": 2.3481, + "step": 5712 + }, + { + "epoch": 1.72, + "grad_norm": 9.312350273132324, + "learning_rate": 8.549664227723765e-06, + "loss": 0.9268, + "step": 5713 + }, + { + "epoch": 1.72, + "grad_norm": 11.393967628479004, + "learning_rate": 8.547659617119375e-06, + "loss": 1.6853, + "step": 5714 + }, + { + "epoch": 1.72, + "grad_norm": 22.255510330200195, + "learning_rate": 8.545655006514987e-06, + "loss": 1.0106, + "step": 5715 + }, + { + "epoch": 1.72, + "grad_norm": 12.787086486816406, + "learning_rate": 8.543650395910595e-06, + "loss": 1.6355, + "step": 5716 + }, + { + "epoch": 1.72, + "grad_norm": 39.999839782714844, + "learning_rate": 8.541645785306205e-06, + "loss": 1.8123, + "step": 5717 + }, + { + "epoch": 1.72, + "grad_norm": 7.7507853507995605, + "learning_rate": 8.539641174701815e-06, + "loss": 1.4238, + "step": 5718 + }, + { + "epoch": 1.72, + "grad_norm": 15.439056396484375, + "learning_rate": 8.537636564097425e-06, + "loss": 1.7531, + "step": 5719 + }, + { + "epoch": 1.72, + "grad_norm": 17.616294860839844, + "learning_rate": 8.535631953493034e-06, + "loss": 1.8694, + "step": 5720 + }, + { + "epoch": 1.72, + "grad_norm": 10.567305564880371, + "learning_rate": 8.533627342888644e-06, + "loss": 1.0895, + "step": 5721 + }, + { + "epoch": 1.72, + "grad_norm": 14.490511894226074, + "learning_rate": 8.531622732284255e-06, + "loss": 1.4669, + "step": 5722 + }, + { + "epoch": 1.72, + "grad_norm": 9.218794822692871, + "learning_rate": 8.529618121679864e-06, + "loss": 1.0658, + "step": 5723 + }, + { + "epoch": 1.72, + "grad_norm": 10.850655555725098, + "learning_rate": 8.527613511075474e-06, + "loss": 1.0692, + "step": 5724 + }, + { + "epoch": 1.72, + "grad_norm": 29.35923194885254, + "learning_rate": 8.525608900471084e-06, + "loss": 1.4617, + "step": 5725 + }, + { + "epoch": 1.72, + "grad_norm": 19.219905853271484, + "learning_rate": 8.523604289866694e-06, + "loss": 2.7337, + "step": 5726 + }, + { + "epoch": 1.72, + "grad_norm": 17.148277282714844, + "learning_rate": 8.521599679262304e-06, + "loss": 1.4809, + "step": 5727 + }, + { + "epoch": 1.72, + "grad_norm": 55.859092712402344, + "learning_rate": 8.519595068657914e-06, + "loss": 1.3051, + "step": 5728 + }, + { + "epoch": 1.72, + "grad_norm": 12.13316822052002, + "learning_rate": 8.517590458053524e-06, + "loss": 1.3944, + "step": 5729 + }, + { + "epoch": 1.72, + "grad_norm": 26.604719161987305, + "learning_rate": 8.515585847449134e-06, + "loss": 1.6258, + "step": 5730 + }, + { + "epoch": 1.72, + "grad_norm": 13.429365158081055, + "learning_rate": 8.513581236844743e-06, + "loss": 1.0267, + "step": 5731 + }, + { + "epoch": 1.72, + "grad_norm": 13.195096969604492, + "learning_rate": 8.511576626240353e-06, + "loss": 1.1112, + "step": 5732 + }, + { + "epoch": 1.72, + "grad_norm": 19.855567932128906, + "learning_rate": 8.509572015635963e-06, + "loss": 1.1505, + "step": 5733 + }, + { + "epoch": 1.72, + "grad_norm": 13.332557678222656, + "learning_rate": 8.507567405031573e-06, + "loss": 1.5326, + "step": 5734 + }, + { + "epoch": 1.72, + "grad_norm": 11.699455261230469, + "learning_rate": 8.505562794427183e-06, + "loss": 1.3027, + "step": 5735 + }, + { + "epoch": 1.72, + "grad_norm": 55.0339469909668, + "learning_rate": 8.503558183822793e-06, + "loss": 1.8905, + "step": 5736 + }, + { + "epoch": 1.72, + "grad_norm": 14.223106384277344, + "learning_rate": 8.501553573218403e-06, + "loss": 1.1121, + "step": 5737 + }, + { + "epoch": 1.73, + "grad_norm": 19.943954467773438, + "learning_rate": 8.499548962614013e-06, + "loss": 1.9677, + "step": 5738 + }, + { + "epoch": 1.73, + "grad_norm": 30.25005340576172, + "learning_rate": 8.497544352009622e-06, + "loss": 1.7023, + "step": 5739 + }, + { + "epoch": 1.73, + "grad_norm": 19.0448055267334, + "learning_rate": 8.495539741405234e-06, + "loss": 1.3845, + "step": 5740 + }, + { + "epoch": 1.73, + "grad_norm": 16.908838272094727, + "learning_rate": 8.493535130800844e-06, + "loss": 1.7335, + "step": 5741 + }, + { + "epoch": 1.73, + "grad_norm": 24.786378860473633, + "learning_rate": 8.491530520196452e-06, + "loss": 1.7752, + "step": 5742 + }, + { + "epoch": 1.73, + "grad_norm": 14.871647834777832, + "learning_rate": 8.489525909592062e-06, + "loss": 1.7707, + "step": 5743 + }, + { + "epoch": 1.73, + "grad_norm": 9.237364768981934, + "learning_rate": 8.487521298987672e-06, + "loss": 1.2148, + "step": 5744 + }, + { + "epoch": 1.73, + "grad_norm": 20.291860580444336, + "learning_rate": 8.485516688383282e-06, + "loss": 1.3439, + "step": 5745 + }, + { + "epoch": 1.73, + "grad_norm": 23.027454376220703, + "learning_rate": 8.483512077778892e-06, + "loss": 1.6966, + "step": 5746 + }, + { + "epoch": 1.73, + "grad_norm": 14.931220054626465, + "learning_rate": 8.481507467174502e-06, + "loss": 1.8485, + "step": 5747 + }, + { + "epoch": 1.73, + "grad_norm": 8.827281951904297, + "learning_rate": 8.479502856570113e-06, + "loss": 0.8949, + "step": 5748 + }, + { + "epoch": 1.73, + "grad_norm": 13.313647270202637, + "learning_rate": 8.477498245965723e-06, + "loss": 1.3746, + "step": 5749 + }, + { + "epoch": 1.73, + "grad_norm": 40.448280334472656, + "learning_rate": 8.475493635361331e-06, + "loss": 2.6908, + "step": 5750 + }, + { + "epoch": 1.73, + "grad_norm": 15.871477127075195, + "learning_rate": 8.473489024756941e-06, + "loss": 1.6084, + "step": 5751 + }, + { + "epoch": 1.73, + "grad_norm": 38.03894805908203, + "learning_rate": 8.471484414152553e-06, + "loss": 1.693, + "step": 5752 + }, + { + "epoch": 1.73, + "grad_norm": 53.909751892089844, + "learning_rate": 8.469479803548161e-06, + "loss": 1.3998, + "step": 5753 + }, + { + "epoch": 1.73, + "grad_norm": 23.2731876373291, + "learning_rate": 8.467475192943771e-06, + "loss": 1.1304, + "step": 5754 + }, + { + "epoch": 1.73, + "grad_norm": 16.913698196411133, + "learning_rate": 8.465470582339381e-06, + "loss": 1.2357, + "step": 5755 + }, + { + "epoch": 1.73, + "grad_norm": 25.4580135345459, + "learning_rate": 8.463465971734992e-06, + "loss": 1.669, + "step": 5756 + }, + { + "epoch": 1.73, + "grad_norm": 17.684297561645508, + "learning_rate": 8.4614613611306e-06, + "loss": 1.8284, + "step": 5757 + }, + { + "epoch": 1.73, + "grad_norm": 11.82136058807373, + "learning_rate": 8.459456750526212e-06, + "loss": 1.5129, + "step": 5758 + }, + { + "epoch": 1.73, + "grad_norm": 19.27686882019043, + "learning_rate": 8.457452139921822e-06, + "loss": 2.0577, + "step": 5759 + }, + { + "epoch": 1.73, + "grad_norm": 19.104829788208008, + "learning_rate": 8.45544752931743e-06, + "loss": 1.8845, + "step": 5760 + }, + { + "epoch": 1.73, + "eval_loss": 0.19915775954723358, + "eval_runtime": 43.571, + "eval_samples_per_second": 33.945, + "eval_steps_per_second": 33.945, + "step": 5760 + }, + { + "epoch": 1.73, + "grad_norm": 30.6223201751709, + "learning_rate": 8.45344291871304e-06, + "loss": 1.4588, + "step": 5761 + }, + { + "epoch": 1.73, + "grad_norm": 20.711313247680664, + "learning_rate": 8.45143830810865e-06, + "loss": 1.9552, + "step": 5762 + }, + { + "epoch": 1.73, + "grad_norm": 9.937006950378418, + "learning_rate": 8.44943369750426e-06, + "loss": 1.4769, + "step": 5763 + }, + { + "epoch": 1.73, + "grad_norm": 11.456831932067871, + "learning_rate": 8.44742908689987e-06, + "loss": 1.486, + "step": 5764 + }, + { + "epoch": 1.73, + "grad_norm": 11.295050621032715, + "learning_rate": 8.44542447629548e-06, + "loss": 1.1309, + "step": 5765 + }, + { + "epoch": 1.73, + "grad_norm": 10.123924255371094, + "learning_rate": 8.44341986569109e-06, + "loss": 0.8497, + "step": 5766 + }, + { + "epoch": 1.73, + "grad_norm": 23.44426727294922, + "learning_rate": 8.4414152550867e-06, + "loss": 1.9237, + "step": 5767 + }, + { + "epoch": 1.73, + "grad_norm": 23.85992431640625, + "learning_rate": 8.439410644482309e-06, + "loss": 2.8773, + "step": 5768 + }, + { + "epoch": 1.73, + "grad_norm": 12.136680603027344, + "learning_rate": 8.43740603387792e-06, + "loss": 1.9736, + "step": 5769 + }, + { + "epoch": 1.73, + "grad_norm": 13.099090576171875, + "learning_rate": 8.43540142327353e-06, + "loss": 1.9405, + "step": 5770 + }, + { + "epoch": 1.74, + "grad_norm": 18.678722381591797, + "learning_rate": 8.43339681266914e-06, + "loss": 2.2382, + "step": 5771 + }, + { + "epoch": 1.74, + "grad_norm": 14.240285873413086, + "learning_rate": 8.43139220206475e-06, + "loss": 1.3103, + "step": 5772 + }, + { + "epoch": 1.74, + "grad_norm": 21.930389404296875, + "learning_rate": 8.42938759146036e-06, + "loss": 1.4554, + "step": 5773 + }, + { + "epoch": 1.74, + "grad_norm": 20.750886917114258, + "learning_rate": 8.42738298085597e-06, + "loss": 1.1572, + "step": 5774 + }, + { + "epoch": 1.74, + "grad_norm": 13.55035400390625, + "learning_rate": 8.42537837025158e-06, + "loss": 1.0804, + "step": 5775 + }, + { + "epoch": 1.74, + "grad_norm": 12.210824966430664, + "learning_rate": 8.423373759647188e-06, + "loss": 1.526, + "step": 5776 + }, + { + "epoch": 1.74, + "grad_norm": 9.945907592773438, + "learning_rate": 8.4213691490428e-06, + "loss": 1.1015, + "step": 5777 + }, + { + "epoch": 1.74, + "grad_norm": 16.482894897460938, + "learning_rate": 8.41936453843841e-06, + "loss": 1.9265, + "step": 5778 + }, + { + "epoch": 1.74, + "grad_norm": 16.412376403808594, + "learning_rate": 8.417359927834018e-06, + "loss": 1.9719, + "step": 5779 + }, + { + "epoch": 1.74, + "grad_norm": 49.32164764404297, + "learning_rate": 8.415355317229628e-06, + "loss": 2.7723, + "step": 5780 + }, + { + "epoch": 1.74, + "grad_norm": 27.84273910522461, + "learning_rate": 8.413350706625239e-06, + "loss": 1.1557, + "step": 5781 + }, + { + "epoch": 1.74, + "grad_norm": 25.465152740478516, + "learning_rate": 8.411346096020849e-06, + "loss": 1.241, + "step": 5782 + }, + { + "epoch": 1.74, + "grad_norm": 21.347864151000977, + "learning_rate": 8.409341485416459e-06, + "loss": 2.8817, + "step": 5783 + }, + { + "epoch": 1.74, + "grad_norm": 17.83620834350586, + "learning_rate": 8.407336874812069e-06, + "loss": 1.4303, + "step": 5784 + }, + { + "epoch": 1.74, + "grad_norm": 24.558000564575195, + "learning_rate": 8.405332264207679e-06, + "loss": 0.9856, + "step": 5785 + }, + { + "epoch": 1.74, + "grad_norm": 16.54327392578125, + "learning_rate": 8.403327653603289e-06, + "loss": 0.702, + "step": 5786 + }, + { + "epoch": 1.74, + "grad_norm": 21.651714324951172, + "learning_rate": 8.401323042998897e-06, + "loss": 1.6273, + "step": 5787 + }, + { + "epoch": 1.74, + "grad_norm": 13.498676300048828, + "learning_rate": 8.399318432394507e-06, + "loss": 0.949, + "step": 5788 + }, + { + "epoch": 1.74, + "grad_norm": 10.88339900970459, + "learning_rate": 8.39731382179012e-06, + "loss": 1.7288, + "step": 5789 + }, + { + "epoch": 1.74, + "grad_norm": 10.3096284866333, + "learning_rate": 8.395309211185728e-06, + "loss": 1.1514, + "step": 5790 + }, + { + "epoch": 1.74, + "grad_norm": 17.805374145507812, + "learning_rate": 8.393304600581338e-06, + "loss": 1.5093, + "step": 5791 + }, + { + "epoch": 1.74, + "grad_norm": 16.813570022583008, + "learning_rate": 8.391299989976948e-06, + "loss": 0.8186, + "step": 5792 + }, + { + "epoch": 1.74, + "grad_norm": 20.913787841796875, + "learning_rate": 8.389295379372558e-06, + "loss": 1.7035, + "step": 5793 + }, + { + "epoch": 1.74, + "grad_norm": 52.88319778442383, + "learning_rate": 8.387290768768166e-06, + "loss": 1.7603, + "step": 5794 + }, + { + "epoch": 1.74, + "grad_norm": 15.752192497253418, + "learning_rate": 8.385286158163778e-06, + "loss": 1.2862, + "step": 5795 + }, + { + "epoch": 1.74, + "grad_norm": 8.408364295959473, + "learning_rate": 8.383281547559388e-06, + "loss": 0.9196, + "step": 5796 + }, + { + "epoch": 1.74, + "grad_norm": 18.868669509887695, + "learning_rate": 8.381276936954997e-06, + "loss": 1.5121, + "step": 5797 + }, + { + "epoch": 1.74, + "grad_norm": 20.32513999938965, + "learning_rate": 8.379272326350607e-06, + "loss": 2.3994, + "step": 5798 + }, + { + "epoch": 1.74, + "grad_norm": 19.855003356933594, + "learning_rate": 8.377267715746217e-06, + "loss": 1.8385, + "step": 5799 + }, + { + "epoch": 1.74, + "grad_norm": 15.905160903930664, + "learning_rate": 8.375263105141827e-06, + "loss": 1.1867, + "step": 5800 + }, + { + "epoch": 1.74, + "grad_norm": 8.156031608581543, + "learning_rate": 8.373258494537437e-06, + "loss": 0.9248, + "step": 5801 + }, + { + "epoch": 1.74, + "grad_norm": 20.510618209838867, + "learning_rate": 8.371253883933047e-06, + "loss": 1.6939, + "step": 5802 + }, + { + "epoch": 1.74, + "grad_norm": 39.78818130493164, + "learning_rate": 8.369249273328657e-06, + "loss": 1.3656, + "step": 5803 + }, + { + "epoch": 1.75, + "grad_norm": 33.16683578491211, + "learning_rate": 8.367244662724267e-06, + "loss": 1.5406, + "step": 5804 + }, + { + "epoch": 1.75, + "grad_norm": 21.53314971923828, + "learning_rate": 8.365240052119875e-06, + "loss": 1.0033, + "step": 5805 + }, + { + "epoch": 1.75, + "grad_norm": 24.56745719909668, + "learning_rate": 8.363235441515486e-06, + "loss": 1.7757, + "step": 5806 + }, + { + "epoch": 1.75, + "grad_norm": 15.24342155456543, + "learning_rate": 8.361230830911096e-06, + "loss": 1.292, + "step": 5807 + }, + { + "epoch": 1.75, + "grad_norm": 27.435335159301758, + "learning_rate": 8.359226220306706e-06, + "loss": 1.9828, + "step": 5808 + }, + { + "epoch": 1.75, + "grad_norm": 20.47377586364746, + "learning_rate": 8.357221609702316e-06, + "loss": 1.9325, + "step": 5809 + }, + { + "epoch": 1.75, + "grad_norm": 6.70631742477417, + "learning_rate": 8.355216999097926e-06, + "loss": 1.1601, + "step": 5810 + }, + { + "epoch": 1.75, + "grad_norm": 13.876242637634277, + "learning_rate": 8.353212388493536e-06, + "loss": 1.3469, + "step": 5811 + }, + { + "epoch": 1.75, + "grad_norm": 22.673198699951172, + "learning_rate": 8.351207777889146e-06, + "loss": 2.9192, + "step": 5812 + }, + { + "epoch": 1.75, + "grad_norm": 9.959436416625977, + "learning_rate": 8.349203167284754e-06, + "loss": 1.4124, + "step": 5813 + }, + { + "epoch": 1.75, + "grad_norm": 30.46356773376465, + "learning_rate": 8.347198556680366e-06, + "loss": 2.1221, + "step": 5814 + }, + { + "epoch": 1.75, + "grad_norm": 12.394341468811035, + "learning_rate": 8.345193946075976e-06, + "loss": 1.2795, + "step": 5815 + }, + { + "epoch": 1.75, + "grad_norm": 71.86808013916016, + "learning_rate": 8.343189335471585e-06, + "loss": 3.5364, + "step": 5816 + }, + { + "epoch": 1.75, + "grad_norm": 13.528796195983887, + "learning_rate": 8.341184724867195e-06, + "loss": 1.1612, + "step": 5817 + }, + { + "epoch": 1.75, + "grad_norm": 38.51426696777344, + "learning_rate": 8.339180114262805e-06, + "loss": 1.6762, + "step": 5818 + }, + { + "epoch": 1.75, + "grad_norm": 15.68415355682373, + "learning_rate": 8.337175503658415e-06, + "loss": 2.1607, + "step": 5819 + }, + { + "epoch": 1.75, + "grad_norm": 11.136553764343262, + "learning_rate": 8.335170893054025e-06, + "loss": 0.7781, + "step": 5820 + }, + { + "epoch": 1.75, + "grad_norm": 15.836930274963379, + "learning_rate": 8.333166282449635e-06, + "loss": 1.9072, + "step": 5821 + }, + { + "epoch": 1.75, + "grad_norm": 27.627634048461914, + "learning_rate": 8.331161671845245e-06, + "loss": 1.4755, + "step": 5822 + }, + { + "epoch": 1.75, + "grad_norm": 11.358156204223633, + "learning_rate": 8.329157061240855e-06, + "loss": 1.3935, + "step": 5823 + }, + { + "epoch": 1.75, + "grad_norm": 11.681863784790039, + "learning_rate": 8.327152450636464e-06, + "loss": 1.2521, + "step": 5824 + }, + { + "epoch": 1.75, + "grad_norm": 37.17042541503906, + "learning_rate": 8.325147840032074e-06, + "loss": 1.7879, + "step": 5825 + }, + { + "epoch": 1.75, + "grad_norm": 17.618371963500977, + "learning_rate": 8.323143229427686e-06, + "loss": 1.283, + "step": 5826 + }, + { + "epoch": 1.75, + "grad_norm": 13.247238159179688, + "learning_rate": 8.321138618823294e-06, + "loss": 0.9504, + "step": 5827 + }, + { + "epoch": 1.75, + "grad_norm": 22.750028610229492, + "learning_rate": 8.319134008218904e-06, + "loss": 1.7133, + "step": 5828 + }, + { + "epoch": 1.75, + "grad_norm": 23.75035285949707, + "learning_rate": 8.317129397614514e-06, + "loss": 1.4273, + "step": 5829 + }, + { + "epoch": 1.75, + "grad_norm": 17.772930145263672, + "learning_rate": 8.315124787010124e-06, + "loss": 1.4838, + "step": 5830 + }, + { + "epoch": 1.75, + "grad_norm": 14.174525260925293, + "learning_rate": 8.313120176405733e-06, + "loss": 1.5128, + "step": 5831 + }, + { + "epoch": 1.75, + "grad_norm": 59.5333366394043, + "learning_rate": 8.311115565801344e-06, + "loss": 1.0576, + "step": 5832 + }, + { + "epoch": 1.75, + "grad_norm": 20.53312873840332, + "learning_rate": 8.309110955196954e-06, + "loss": 1.0399, + "step": 5833 + }, + { + "epoch": 1.75, + "grad_norm": 31.05029296875, + "learning_rate": 8.307106344592565e-06, + "loss": 1.5115, + "step": 5834 + }, + { + "epoch": 1.75, + "grad_norm": 12.972631454467773, + "learning_rate": 8.305101733988173e-06, + "loss": 1.2375, + "step": 5835 + }, + { + "epoch": 1.75, + "grad_norm": 13.963759422302246, + "learning_rate": 8.303097123383783e-06, + "loss": 1.5626, + "step": 5836 + }, + { + "epoch": 1.75, + "grad_norm": 11.903511047363281, + "learning_rate": 8.301092512779393e-06, + "loss": 1.4446, + "step": 5837 + }, + { + "epoch": 1.76, + "grad_norm": 18.711076736450195, + "learning_rate": 8.299087902175003e-06, + "loss": 1.3406, + "step": 5838 + }, + { + "epoch": 1.76, + "grad_norm": 84.2789535522461, + "learning_rate": 8.297083291570613e-06, + "loss": 2.1487, + "step": 5839 + }, + { + "epoch": 1.76, + "grad_norm": 22.546630859375, + "learning_rate": 8.295078680966223e-06, + "loss": 1.1753, + "step": 5840 + }, + { + "epoch": 1.76, + "grad_norm": 12.994787216186523, + "learning_rate": 8.293074070361833e-06, + "loss": 1.6728, + "step": 5841 + }, + { + "epoch": 1.76, + "grad_norm": 8.531490325927734, + "learning_rate": 8.291069459757442e-06, + "loss": 0.8257, + "step": 5842 + }, + { + "epoch": 1.76, + "grad_norm": 21.086061477661133, + "learning_rate": 8.289064849153052e-06, + "loss": 1.1679, + "step": 5843 + }, + { + "epoch": 1.76, + "grad_norm": 23.762374877929688, + "learning_rate": 8.287060238548664e-06, + "loss": 3.049, + "step": 5844 + }, + { + "epoch": 1.76, + "grad_norm": 18.857986450195312, + "learning_rate": 8.285055627944272e-06, + "loss": 1.5555, + "step": 5845 + }, + { + "epoch": 1.76, + "grad_norm": 11.890861511230469, + "learning_rate": 8.283051017339882e-06, + "loss": 1.1098, + "step": 5846 + }, + { + "epoch": 1.76, + "grad_norm": 8.958333969116211, + "learning_rate": 8.281046406735492e-06, + "loss": 0.8418, + "step": 5847 + }, + { + "epoch": 1.76, + "grad_norm": 15.934253692626953, + "learning_rate": 8.279041796131102e-06, + "loss": 1.3008, + "step": 5848 + }, + { + "epoch": 1.76, + "grad_norm": 21.336219787597656, + "learning_rate": 8.277037185526712e-06, + "loss": 1.1645, + "step": 5849 + }, + { + "epoch": 1.76, + "grad_norm": 12.621298789978027, + "learning_rate": 8.27503257492232e-06, + "loss": 2.0196, + "step": 5850 + }, + { + "epoch": 1.76, + "grad_norm": 13.761375427246094, + "learning_rate": 8.273027964317933e-06, + "loss": 2.1065, + "step": 5851 + }, + { + "epoch": 1.76, + "grad_norm": 35.57234191894531, + "learning_rate": 8.271023353713543e-06, + "loss": 1.1288, + "step": 5852 + }, + { + "epoch": 1.76, + "grad_norm": 10.00034236907959, + "learning_rate": 8.269018743109151e-06, + "loss": 1.3954, + "step": 5853 + }, + { + "epoch": 1.76, + "grad_norm": 37.05139923095703, + "learning_rate": 8.267014132504761e-06, + "loss": 1.4784, + "step": 5854 + }, + { + "epoch": 1.76, + "grad_norm": 22.268341064453125, + "learning_rate": 8.265009521900371e-06, + "loss": 1.3877, + "step": 5855 + }, + { + "epoch": 1.76, + "grad_norm": 13.753389358520508, + "learning_rate": 8.263004911295981e-06, + "loss": 1.5012, + "step": 5856 + }, + { + "epoch": 1.76, + "grad_norm": 18.250154495239258, + "learning_rate": 8.261000300691591e-06, + "loss": 1.7476, + "step": 5857 + }, + { + "epoch": 1.76, + "grad_norm": 14.758729934692383, + "learning_rate": 8.258995690087201e-06, + "loss": 1.5117, + "step": 5858 + }, + { + "epoch": 1.76, + "grad_norm": 20.81943130493164, + "learning_rate": 8.256991079482812e-06, + "loss": 1.363, + "step": 5859 + }, + { + "epoch": 1.76, + "grad_norm": 26.64165496826172, + "learning_rate": 8.254986468878422e-06, + "loss": 1.7073, + "step": 5860 + }, + { + "epoch": 1.76, + "grad_norm": 14.987895965576172, + "learning_rate": 8.25298185827403e-06, + "loss": 0.7878, + "step": 5861 + }, + { + "epoch": 1.76, + "grad_norm": 17.91753387451172, + "learning_rate": 8.25097724766964e-06, + "loss": 1.4102, + "step": 5862 + }, + { + "epoch": 1.76, + "grad_norm": 6.570464134216309, + "learning_rate": 8.248972637065252e-06, + "loss": 0.7017, + "step": 5863 + }, + { + "epoch": 1.76, + "grad_norm": 21.44278335571289, + "learning_rate": 8.24696802646086e-06, + "loss": 1.4487, + "step": 5864 + }, + { + "epoch": 1.76, + "grad_norm": 15.875109672546387, + "learning_rate": 8.24496341585647e-06, + "loss": 1.178, + "step": 5865 + }, + { + "epoch": 1.76, + "grad_norm": 36.79011154174805, + "learning_rate": 8.24295880525208e-06, + "loss": 1.9915, + "step": 5866 + }, + { + "epoch": 1.76, + "grad_norm": 21.93402862548828, + "learning_rate": 8.24095419464769e-06, + "loss": 1.4122, + "step": 5867 + }, + { + "epoch": 1.76, + "grad_norm": 23.80208969116211, + "learning_rate": 8.2389495840433e-06, + "loss": 2.1488, + "step": 5868 + }, + { + "epoch": 1.76, + "grad_norm": 10.25802993774414, + "learning_rate": 8.23694497343891e-06, + "loss": 1.7158, + "step": 5869 + }, + { + "epoch": 1.76, + "grad_norm": 38.02273178100586, + "learning_rate": 8.23494036283452e-06, + "loss": 1.6496, + "step": 5870 + }, + { + "epoch": 1.77, + "grad_norm": 27.261823654174805, + "learning_rate": 8.232935752230131e-06, + "loss": 2.1812, + "step": 5871 + }, + { + "epoch": 1.77, + "grad_norm": 10.888236999511719, + "learning_rate": 8.23093114162574e-06, + "loss": 1.3974, + "step": 5872 + }, + { + "epoch": 1.77, + "grad_norm": 33.661293029785156, + "learning_rate": 8.22892653102135e-06, + "loss": 2.1975, + "step": 5873 + }, + { + "epoch": 1.77, + "grad_norm": 38.529293060302734, + "learning_rate": 8.22692192041696e-06, + "loss": 1.8204, + "step": 5874 + }, + { + "epoch": 1.77, + "grad_norm": 28.78758430480957, + "learning_rate": 8.22491730981257e-06, + "loss": 2.9932, + "step": 5875 + }, + { + "epoch": 1.77, + "grad_norm": 10.32004165649414, + "learning_rate": 8.22291269920818e-06, + "loss": 2.0646, + "step": 5876 + }, + { + "epoch": 1.77, + "grad_norm": 18.924707412719727, + "learning_rate": 8.22090808860379e-06, + "loss": 2.3191, + "step": 5877 + }, + { + "epoch": 1.77, + "grad_norm": 14.681024551391602, + "learning_rate": 8.2189034779994e-06, + "loss": 1.1271, + "step": 5878 + }, + { + "epoch": 1.77, + "grad_norm": 13.027987480163574, + "learning_rate": 8.216898867395008e-06, + "loss": 1.6397, + "step": 5879 + }, + { + "epoch": 1.77, + "grad_norm": 37.9376335144043, + "learning_rate": 8.214894256790618e-06, + "loss": 1.7215, + "step": 5880 + }, + { + "epoch": 1.77, + "eval_loss": 0.19819287955760956, + "eval_runtime": 43.4174, + "eval_samples_per_second": 34.065, + "eval_steps_per_second": 34.065, + "step": 5880 + }, + { + "epoch": 1.77, + "grad_norm": 37.3797492980957, + "learning_rate": 8.21288964618623e-06, + "loss": 2.3832, + "step": 5881 + }, + { + "epoch": 1.77, + "grad_norm": 20.478801727294922, + "learning_rate": 8.210885035581838e-06, + "loss": 1.8481, + "step": 5882 + }, + { + "epoch": 1.77, + "grad_norm": 22.586971282958984, + "learning_rate": 8.208880424977449e-06, + "loss": 1.8847, + "step": 5883 + }, + { + "epoch": 1.77, + "grad_norm": 25.06772232055664, + "learning_rate": 8.206875814373059e-06, + "loss": 2.011, + "step": 5884 + }, + { + "epoch": 1.77, + "grad_norm": 13.59852123260498, + "learning_rate": 8.204871203768669e-06, + "loss": 1.5316, + "step": 5885 + }, + { + "epoch": 1.77, + "grad_norm": 15.501117706298828, + "learning_rate": 8.202866593164279e-06, + "loss": 1.5476, + "step": 5886 + }, + { + "epoch": 1.77, + "grad_norm": 15.47221851348877, + "learning_rate": 8.200861982559889e-06, + "loss": 1.5741, + "step": 5887 + }, + { + "epoch": 1.77, + "grad_norm": 25.984909057617188, + "learning_rate": 8.198857371955499e-06, + "loss": 1.8725, + "step": 5888 + }, + { + "epoch": 1.77, + "grad_norm": 23.49461555480957, + "learning_rate": 8.196852761351109e-06, + "loss": 2.3657, + "step": 5889 + }, + { + "epoch": 1.77, + "grad_norm": 12.506348609924316, + "learning_rate": 8.194848150746717e-06, + "loss": 1.4631, + "step": 5890 + }, + { + "epoch": 1.77, + "grad_norm": 22.092987060546875, + "learning_rate": 8.192843540142327e-06, + "loss": 1.3425, + "step": 5891 + }, + { + "epoch": 1.77, + "grad_norm": 24.98388671875, + "learning_rate": 8.190838929537938e-06, + "loss": 2.064, + "step": 5892 + }, + { + "epoch": 1.77, + "grad_norm": 15.064332962036133, + "learning_rate": 8.188834318933548e-06, + "loss": 1.257, + "step": 5893 + }, + { + "epoch": 1.77, + "grad_norm": 54.17677307128906, + "learning_rate": 8.186829708329158e-06, + "loss": 2.009, + "step": 5894 + }, + { + "epoch": 1.77, + "grad_norm": 18.08331298828125, + "learning_rate": 8.184825097724768e-06, + "loss": 1.2969, + "step": 5895 + }, + { + "epoch": 1.77, + "grad_norm": 32.95471954345703, + "learning_rate": 8.182820487120378e-06, + "loss": 1.2109, + "step": 5896 + }, + { + "epoch": 1.77, + "grad_norm": 11.22049331665039, + "learning_rate": 8.180815876515988e-06, + "loss": 1.2192, + "step": 5897 + }, + { + "epoch": 1.77, + "grad_norm": 14.255977630615234, + "learning_rate": 8.178811265911596e-06, + "loss": 1.887, + "step": 5898 + }, + { + "epoch": 1.77, + "grad_norm": 12.499604225158691, + "learning_rate": 8.176806655307206e-06, + "loss": 1.3196, + "step": 5899 + }, + { + "epoch": 1.77, + "grad_norm": 15.631196975708008, + "learning_rate": 8.174802044702818e-06, + "loss": 1.3063, + "step": 5900 + }, + { + "epoch": 1.77, + "grad_norm": 13.534256935119629, + "learning_rate": 8.172797434098427e-06, + "loss": 1.6275, + "step": 5901 + }, + { + "epoch": 1.77, + "grad_norm": 10.233455657958984, + "learning_rate": 8.170792823494037e-06, + "loss": 1.1873, + "step": 5902 + }, + { + "epoch": 1.77, + "grad_norm": 11.301042556762695, + "learning_rate": 8.168788212889647e-06, + "loss": 1.2987, + "step": 5903 + }, + { + "epoch": 1.78, + "grad_norm": 46.812076568603516, + "learning_rate": 8.166783602285257e-06, + "loss": 1.7821, + "step": 5904 + }, + { + "epoch": 1.78, + "grad_norm": 34.08024597167969, + "learning_rate": 8.164778991680867e-06, + "loss": 1.3355, + "step": 5905 + }, + { + "epoch": 1.78, + "grad_norm": 12.29112720489502, + "learning_rate": 8.162774381076477e-06, + "loss": 1.3053, + "step": 5906 + }, + { + "epoch": 1.78, + "grad_norm": 16.215862274169922, + "learning_rate": 8.160769770472087e-06, + "loss": 1.6378, + "step": 5907 + }, + { + "epoch": 1.78, + "grad_norm": 33.6343994140625, + "learning_rate": 8.158765159867697e-06, + "loss": 1.9444, + "step": 5908 + }, + { + "epoch": 1.78, + "grad_norm": 27.786399841308594, + "learning_rate": 8.156760549263306e-06, + "loss": 1.1768, + "step": 5909 + }, + { + "epoch": 1.78, + "grad_norm": 22.714481353759766, + "learning_rate": 8.154755938658916e-06, + "loss": 1.5109, + "step": 5910 + }, + { + "epoch": 1.78, + "grad_norm": 14.203557968139648, + "learning_rate": 8.152751328054526e-06, + "loss": 1.2664, + "step": 5911 + }, + { + "epoch": 1.78, + "grad_norm": 25.13650894165039, + "learning_rate": 8.150746717450136e-06, + "loss": 1.6833, + "step": 5912 + }, + { + "epoch": 1.78, + "grad_norm": 8.750547409057617, + "learning_rate": 8.148742106845746e-06, + "loss": 0.7758, + "step": 5913 + }, + { + "epoch": 1.78, + "grad_norm": 13.47951889038086, + "learning_rate": 8.146737496241356e-06, + "loss": 0.9706, + "step": 5914 + }, + { + "epoch": 1.78, + "grad_norm": 25.456212997436523, + "learning_rate": 8.144732885636966e-06, + "loss": 2.0665, + "step": 5915 + }, + { + "epoch": 1.78, + "grad_norm": 26.591684341430664, + "learning_rate": 8.142728275032575e-06, + "loss": 2.694, + "step": 5916 + }, + { + "epoch": 1.78, + "grad_norm": 141.93113708496094, + "learning_rate": 8.140723664428185e-06, + "loss": 2.0094, + "step": 5917 + }, + { + "epoch": 1.78, + "grad_norm": 14.334453582763672, + "learning_rate": 8.138719053823796e-06, + "loss": 1.156, + "step": 5918 + }, + { + "epoch": 1.78, + "grad_norm": 31.279136657714844, + "learning_rate": 8.136714443219406e-06, + "loss": 1.2964, + "step": 5919 + }, + { + "epoch": 1.78, + "grad_norm": 13.835858345031738, + "learning_rate": 8.134709832615015e-06, + "loss": 0.992, + "step": 5920 + }, + { + "epoch": 1.78, + "grad_norm": 20.324710845947266, + "learning_rate": 8.132705222010625e-06, + "loss": 2.0425, + "step": 5921 + }, + { + "epoch": 1.78, + "grad_norm": 9.181856155395508, + "learning_rate": 8.130700611406235e-06, + "loss": 1.2267, + "step": 5922 + }, + { + "epoch": 1.78, + "grad_norm": 12.382397651672363, + "learning_rate": 8.128696000801845e-06, + "loss": 1.4096, + "step": 5923 + }, + { + "epoch": 1.78, + "grad_norm": 19.25230598449707, + "learning_rate": 8.126691390197455e-06, + "loss": 1.86, + "step": 5924 + }, + { + "epoch": 1.78, + "grad_norm": 9.224576950073242, + "learning_rate": 8.124686779593065e-06, + "loss": 1.291, + "step": 5925 + }, + { + "epoch": 1.78, + "grad_norm": 26.03525161743164, + "learning_rate": 8.122682168988675e-06, + "loss": 2.0772, + "step": 5926 + }, + { + "epoch": 1.78, + "grad_norm": 12.974356651306152, + "learning_rate": 8.120677558384284e-06, + "loss": 1.524, + "step": 5927 + }, + { + "epoch": 1.78, + "grad_norm": 9.397273063659668, + "learning_rate": 8.118672947779894e-06, + "loss": 1.2373, + "step": 5928 + }, + { + "epoch": 1.78, + "grad_norm": 12.543724060058594, + "learning_rate": 8.116668337175504e-06, + "loss": 1.4601, + "step": 5929 + }, + { + "epoch": 1.78, + "grad_norm": 10.495887756347656, + "learning_rate": 8.114663726571114e-06, + "loss": 1.3694, + "step": 5930 + }, + { + "epoch": 1.78, + "grad_norm": 12.478328704833984, + "learning_rate": 8.112659115966724e-06, + "loss": 1.8051, + "step": 5931 + }, + { + "epoch": 1.78, + "grad_norm": 18.770479202270508, + "learning_rate": 8.110654505362334e-06, + "loss": 1.3905, + "step": 5932 + }, + { + "epoch": 1.78, + "grad_norm": 14.52056884765625, + "learning_rate": 8.108649894757944e-06, + "loss": 1.4971, + "step": 5933 + }, + { + "epoch": 1.78, + "grad_norm": 25.12417221069336, + "learning_rate": 8.106645284153554e-06, + "loss": 1.9117, + "step": 5934 + }, + { + "epoch": 1.78, + "grad_norm": 40.989715576171875, + "learning_rate": 8.104640673549163e-06, + "loss": 2.4517, + "step": 5935 + }, + { + "epoch": 1.78, + "grad_norm": 7.442935943603516, + "learning_rate": 8.102636062944773e-06, + "loss": 1.0286, + "step": 5936 + }, + { + "epoch": 1.79, + "grad_norm": 13.013799667358398, + "learning_rate": 8.100631452340385e-06, + "loss": 1.1536, + "step": 5937 + }, + { + "epoch": 1.79, + "grad_norm": 10.014057159423828, + "learning_rate": 8.098626841735993e-06, + "loss": 1.193, + "step": 5938 + }, + { + "epoch": 1.79, + "grad_norm": 24.215864181518555, + "learning_rate": 8.096622231131603e-06, + "loss": 1.3133, + "step": 5939 + }, + { + "epoch": 1.79, + "grad_norm": 14.128569602966309, + "learning_rate": 8.094617620527213e-06, + "loss": 1.0683, + "step": 5940 + }, + { + "epoch": 1.79, + "grad_norm": 18.90635108947754, + "learning_rate": 8.092613009922823e-06, + "loss": 1.3165, + "step": 5941 + }, + { + "epoch": 1.79, + "grad_norm": 7.926889896392822, + "learning_rate": 8.090608399318433e-06, + "loss": 1.3542, + "step": 5942 + }, + { + "epoch": 1.79, + "grad_norm": 15.462095260620117, + "learning_rate": 8.088603788714043e-06, + "loss": 1.7081, + "step": 5943 + }, + { + "epoch": 1.79, + "grad_norm": 13.095417976379395, + "learning_rate": 8.086599178109653e-06, + "loss": 1.3562, + "step": 5944 + }, + { + "epoch": 1.79, + "grad_norm": 16.457101821899414, + "learning_rate": 8.084594567505264e-06, + "loss": 1.1891, + "step": 5945 + }, + { + "epoch": 1.79, + "grad_norm": 9.37852668762207, + "learning_rate": 8.082589956900872e-06, + "loss": 1.3149, + "step": 5946 + }, + { + "epoch": 1.79, + "grad_norm": 10.058578491210938, + "learning_rate": 8.080585346296482e-06, + "loss": 1.2129, + "step": 5947 + }, + { + "epoch": 1.79, + "grad_norm": 15.420865058898926, + "learning_rate": 8.078580735692092e-06, + "loss": 2.2392, + "step": 5948 + }, + { + "epoch": 1.79, + "grad_norm": 17.6695556640625, + "learning_rate": 8.076576125087702e-06, + "loss": 1.1624, + "step": 5949 + }, + { + "epoch": 1.79, + "grad_norm": 12.30250072479248, + "learning_rate": 8.074571514483312e-06, + "loss": 1.4646, + "step": 5950 + }, + { + "epoch": 1.79, + "grad_norm": 26.107072830200195, + "learning_rate": 8.072566903878922e-06, + "loss": 1.4681, + "step": 5951 + }, + { + "epoch": 1.79, + "grad_norm": 16.788949966430664, + "learning_rate": 8.070562293274532e-06, + "loss": 1.3324, + "step": 5952 + }, + { + "epoch": 1.79, + "grad_norm": 43.306427001953125, + "learning_rate": 8.068557682670143e-06, + "loss": 1.6477, + "step": 5953 + }, + { + "epoch": 1.79, + "grad_norm": 14.007320404052734, + "learning_rate": 8.066553072065751e-06, + "loss": 0.9753, + "step": 5954 + }, + { + "epoch": 1.79, + "grad_norm": 9.967177391052246, + "learning_rate": 8.064548461461363e-06, + "loss": 0.8905, + "step": 5955 + }, + { + "epoch": 1.79, + "grad_norm": 63.55514144897461, + "learning_rate": 8.062543850856973e-06, + "loss": 2.9485, + "step": 5956 + }, + { + "epoch": 1.79, + "grad_norm": 18.12586212158203, + "learning_rate": 8.060539240252581e-06, + "loss": 1.9415, + "step": 5957 + }, + { + "epoch": 1.79, + "grad_norm": 9.148773193359375, + "learning_rate": 8.058534629648191e-06, + "loss": 1.476, + "step": 5958 + }, + { + "epoch": 1.79, + "grad_norm": 10.708128929138184, + "learning_rate": 8.056530019043801e-06, + "loss": 1.3661, + "step": 5959 + }, + { + "epoch": 1.79, + "grad_norm": 20.42496681213379, + "learning_rate": 8.054525408439411e-06, + "loss": 1.4746, + "step": 5960 + }, + { + "epoch": 1.79, + "grad_norm": 15.104199409484863, + "learning_rate": 8.052520797835022e-06, + "loss": 1.8083, + "step": 5961 + }, + { + "epoch": 1.79, + "grad_norm": 20.05451011657715, + "learning_rate": 8.050516187230632e-06, + "loss": 1.6234, + "step": 5962 + }, + { + "epoch": 1.79, + "grad_norm": 23.32365608215332, + "learning_rate": 8.048511576626242e-06, + "loss": 1.5622, + "step": 5963 + }, + { + "epoch": 1.79, + "grad_norm": 17.466800689697266, + "learning_rate": 8.04650696602185e-06, + "loss": 0.6051, + "step": 5964 + }, + { + "epoch": 1.79, + "grad_norm": 26.45103645324707, + "learning_rate": 8.04450235541746e-06, + "loss": 1.5636, + "step": 5965 + }, + { + "epoch": 1.79, + "grad_norm": 25.677671432495117, + "learning_rate": 8.04249774481307e-06, + "loss": 1.1426, + "step": 5966 + }, + { + "epoch": 1.79, + "grad_norm": 17.478527069091797, + "learning_rate": 8.04049313420868e-06, + "loss": 0.8924, + "step": 5967 + }, + { + "epoch": 1.79, + "grad_norm": 15.59758472442627, + "learning_rate": 8.03848852360429e-06, + "loss": 1.4224, + "step": 5968 + }, + { + "epoch": 1.79, + "grad_norm": 20.508508682250977, + "learning_rate": 8.0364839129999e-06, + "loss": 2.0134, + "step": 5969 + }, + { + "epoch": 1.79, + "grad_norm": 14.481962203979492, + "learning_rate": 8.03447930239551e-06, + "loss": 1.176, + "step": 5970 + }, + { + "epoch": 1.8, + "grad_norm": 17.673538208007812, + "learning_rate": 8.03247469179112e-06, + "loss": 1.5274, + "step": 5971 + }, + { + "epoch": 1.8, + "grad_norm": 16.05086898803711, + "learning_rate": 8.030470081186729e-06, + "loss": 1.2769, + "step": 5972 + }, + { + "epoch": 1.8, + "grad_norm": 26.023082733154297, + "learning_rate": 8.028465470582339e-06, + "loss": 0.8421, + "step": 5973 + }, + { + "epoch": 1.8, + "grad_norm": 19.35530662536621, + "learning_rate": 8.026460859977951e-06, + "loss": 2.123, + "step": 5974 + }, + { + "epoch": 1.8, + "grad_norm": 45.65658950805664, + "learning_rate": 8.02445624937356e-06, + "loss": 2.4255, + "step": 5975 + }, + { + "epoch": 1.8, + "grad_norm": 50.98988342285156, + "learning_rate": 8.02245163876917e-06, + "loss": 2.1818, + "step": 5976 + }, + { + "epoch": 1.8, + "grad_norm": 25.256816864013672, + "learning_rate": 8.02044702816478e-06, + "loss": 2.182, + "step": 5977 + }, + { + "epoch": 1.8, + "grad_norm": 18.960912704467773, + "learning_rate": 8.01844241756039e-06, + "loss": 1.4124, + "step": 5978 + }, + { + "epoch": 1.8, + "grad_norm": 20.172710418701172, + "learning_rate": 8.016437806956e-06, + "loss": 0.64, + "step": 5979 + }, + { + "epoch": 1.8, + "grad_norm": 16.025949478149414, + "learning_rate": 8.01443319635161e-06, + "loss": 1.1168, + "step": 5980 + }, + { + "epoch": 1.8, + "grad_norm": 19.96694564819336, + "learning_rate": 8.01242858574722e-06, + "loss": 0.9764, + "step": 5981 + }, + { + "epoch": 1.8, + "grad_norm": 28.706392288208008, + "learning_rate": 8.01042397514283e-06, + "loss": 1.6412, + "step": 5982 + }, + { + "epoch": 1.8, + "grad_norm": 9.621232032775879, + "learning_rate": 8.008419364538438e-06, + "loss": 0.9458, + "step": 5983 + }, + { + "epoch": 1.8, + "grad_norm": 50.64955520629883, + "learning_rate": 8.006414753934048e-06, + "loss": 1.558, + "step": 5984 + }, + { + "epoch": 1.8, + "grad_norm": 54.46828842163086, + "learning_rate": 8.004410143329658e-06, + "loss": 2.4981, + "step": 5985 + }, + { + "epoch": 1.8, + "grad_norm": 41.00409698486328, + "learning_rate": 8.002405532725269e-06, + "loss": 1.5329, + "step": 5986 + }, + { + "epoch": 1.8, + "grad_norm": 14.016818046569824, + "learning_rate": 8.000400922120879e-06, + "loss": 1.4779, + "step": 5987 + }, + { + "epoch": 1.8, + "grad_norm": 46.308258056640625, + "learning_rate": 7.998396311516489e-06, + "loss": 1.1857, + "step": 5988 + }, + { + "epoch": 1.8, + "grad_norm": 10.816703796386719, + "learning_rate": 7.996391700912099e-06, + "loss": 1.2238, + "step": 5989 + }, + { + "epoch": 1.8, + "grad_norm": 14.56143856048584, + "learning_rate": 7.994387090307709e-06, + "loss": 1.5197, + "step": 5990 + }, + { + "epoch": 1.8, + "grad_norm": 13.148183822631836, + "learning_rate": 7.992382479703317e-06, + "loss": 1.0046, + "step": 5991 + }, + { + "epoch": 1.8, + "grad_norm": 54.134552001953125, + "learning_rate": 7.990377869098929e-06, + "loss": 1.8494, + "step": 5992 + }, + { + "epoch": 1.8, + "grad_norm": 12.82621955871582, + "learning_rate": 7.988373258494539e-06, + "loss": 1.4167, + "step": 5993 + }, + { + "epoch": 1.8, + "grad_norm": 14.938492774963379, + "learning_rate": 7.986368647890148e-06, + "loss": 0.6812, + "step": 5994 + }, + { + "epoch": 1.8, + "grad_norm": 39.79289627075195, + "learning_rate": 7.984364037285758e-06, + "loss": 4.1951, + "step": 5995 + }, + { + "epoch": 1.8, + "grad_norm": 12.317481994628906, + "learning_rate": 7.982359426681368e-06, + "loss": 1.0172, + "step": 5996 + }, + { + "epoch": 1.8, + "grad_norm": 28.569238662719727, + "learning_rate": 7.980354816076978e-06, + "loss": 2.539, + "step": 5997 + }, + { + "epoch": 1.8, + "grad_norm": 28.877849578857422, + "learning_rate": 7.978350205472588e-06, + "loss": 1.4473, + "step": 5998 + }, + { + "epoch": 1.8, + "grad_norm": 42.884979248046875, + "learning_rate": 7.976345594868198e-06, + "loss": 0.6649, + "step": 5999 + }, + { + "epoch": 1.8, + "grad_norm": 8.961115837097168, + "learning_rate": 7.974340984263808e-06, + "loss": 1.1251, + "step": 6000 + }, + { + "epoch": 1.8, + "eval_loss": 0.19368119537830353, + "eval_runtime": 43.5079, + "eval_samples_per_second": 33.994, + "eval_steps_per_second": 33.994, + "step": 6000 + }, + { + "epoch": 1.8, + "grad_norm": 72.2662124633789, + "learning_rate": 7.972336373659416e-06, + "loss": 2.015, + "step": 6001 + }, + { + "epoch": 1.8, + "grad_norm": 16.20361328125, + "learning_rate": 7.970331763055027e-06, + "loss": 1.2574, + "step": 6002 + }, + { + "epoch": 1.8, + "grad_norm": 20.717201232910156, + "learning_rate": 7.968327152450637e-06, + "loss": 1.5776, + "step": 6003 + }, + { + "epoch": 1.81, + "grad_norm": 23.648340225219727, + "learning_rate": 7.966322541846248e-06, + "loss": 1.8967, + "step": 6004 + }, + { + "epoch": 1.81, + "grad_norm": 28.407695770263672, + "learning_rate": 7.964317931241857e-06, + "loss": 2.1085, + "step": 6005 + }, + { + "epoch": 1.81, + "grad_norm": 13.047331809997559, + "learning_rate": 7.962313320637467e-06, + "loss": 1.9436, + "step": 6006 + }, + { + "epoch": 1.81, + "grad_norm": 12.131548881530762, + "learning_rate": 7.960308710033077e-06, + "loss": 1.3458, + "step": 6007 + }, + { + "epoch": 1.81, + "grad_norm": 15.92016315460205, + "learning_rate": 7.958304099428687e-06, + "loss": 0.8177, + "step": 6008 + }, + { + "epoch": 1.81, + "grad_norm": 9.055414199829102, + "learning_rate": 7.956299488824295e-06, + "loss": 1.3297, + "step": 6009 + }, + { + "epoch": 1.81, + "grad_norm": 13.94730281829834, + "learning_rate": 7.954294878219907e-06, + "loss": 1.4445, + "step": 6010 + }, + { + "epoch": 1.81, + "grad_norm": 17.391616821289062, + "learning_rate": 7.952290267615517e-06, + "loss": 0.9038, + "step": 6011 + }, + { + "epoch": 1.81, + "grad_norm": 198.21731567382812, + "learning_rate": 7.950285657011126e-06, + "loss": 2.3957, + "step": 6012 + }, + { + "epoch": 1.81, + "grad_norm": 12.243670463562012, + "learning_rate": 7.948281046406736e-06, + "loss": 1.4155, + "step": 6013 + }, + { + "epoch": 1.81, + "grad_norm": 13.951066017150879, + "learning_rate": 7.946276435802346e-06, + "loss": 1.4097, + "step": 6014 + }, + { + "epoch": 1.81, + "grad_norm": 24.333097457885742, + "learning_rate": 7.944271825197956e-06, + "loss": 2.4542, + "step": 6015 + }, + { + "epoch": 1.81, + "grad_norm": 17.491724014282227, + "learning_rate": 7.942267214593566e-06, + "loss": 1.649, + "step": 6016 + }, + { + "epoch": 1.81, + "grad_norm": 11.039308547973633, + "learning_rate": 7.940262603989176e-06, + "loss": 1.1686, + "step": 6017 + }, + { + "epoch": 1.81, + "grad_norm": 18.201019287109375, + "learning_rate": 7.938257993384786e-06, + "loss": 1.5321, + "step": 6018 + }, + { + "epoch": 1.81, + "grad_norm": 29.47193717956543, + "learning_rate": 7.936253382780396e-06, + "loss": 1.5978, + "step": 6019 + }, + { + "epoch": 1.81, + "grad_norm": 24.67177391052246, + "learning_rate": 7.934248772176005e-06, + "loss": 2.0138, + "step": 6020 + }, + { + "epoch": 1.81, + "grad_norm": 25.735088348388672, + "learning_rate": 7.932244161571615e-06, + "loss": 3.0204, + "step": 6021 + }, + { + "epoch": 1.81, + "grad_norm": 63.19715118408203, + "learning_rate": 7.930239550967225e-06, + "loss": 1.5556, + "step": 6022 + }, + { + "epoch": 1.81, + "grad_norm": 12.563130378723145, + "learning_rate": 7.928234940362835e-06, + "loss": 1.3196, + "step": 6023 + }, + { + "epoch": 1.81, + "grad_norm": 21.176742553710938, + "learning_rate": 7.926230329758445e-06, + "loss": 2.416, + "step": 6024 + }, + { + "epoch": 1.81, + "grad_norm": 14.44190502166748, + "learning_rate": 7.924225719154055e-06, + "loss": 1.14, + "step": 6025 + }, + { + "epoch": 1.81, + "grad_norm": 10.227465629577637, + "learning_rate": 7.922221108549665e-06, + "loss": 0.9925, + "step": 6026 + }, + { + "epoch": 1.81, + "grad_norm": 16.822582244873047, + "learning_rate": 7.920216497945275e-06, + "loss": 1.413, + "step": 6027 + }, + { + "epoch": 1.81, + "grad_norm": 23.78660774230957, + "learning_rate": 7.918211887340884e-06, + "loss": 2.2229, + "step": 6028 + }, + { + "epoch": 1.81, + "grad_norm": 22.946805953979492, + "learning_rate": 7.916207276736495e-06, + "loss": 1.9366, + "step": 6029 + }, + { + "epoch": 1.81, + "grad_norm": 11.842130661010742, + "learning_rate": 7.914202666132105e-06, + "loss": 2.1797, + "step": 6030 + }, + { + "epoch": 1.81, + "grad_norm": 11.936902046203613, + "learning_rate": 7.912198055527714e-06, + "loss": 0.9288, + "step": 6031 + }, + { + "epoch": 1.81, + "grad_norm": 12.97545337677002, + "learning_rate": 7.910193444923324e-06, + "loss": 0.8792, + "step": 6032 + }, + { + "epoch": 1.81, + "grad_norm": 22.72028160095215, + "learning_rate": 7.908188834318934e-06, + "loss": 1.8991, + "step": 6033 + }, + { + "epoch": 1.81, + "grad_norm": 68.23728942871094, + "learning_rate": 7.906184223714544e-06, + "loss": 2.0705, + "step": 6034 + }, + { + "epoch": 1.81, + "grad_norm": 17.14826202392578, + "learning_rate": 7.904179613110154e-06, + "loss": 1.9903, + "step": 6035 + }, + { + "epoch": 1.81, + "grad_norm": 33.424381256103516, + "learning_rate": 7.902175002505764e-06, + "loss": 1.5851, + "step": 6036 + }, + { + "epoch": 1.82, + "grad_norm": 21.694814682006836, + "learning_rate": 7.900170391901374e-06, + "loss": 2.1811, + "step": 6037 + }, + { + "epoch": 1.82, + "grad_norm": 11.936399459838867, + "learning_rate": 7.898165781296984e-06, + "loss": 1.4652, + "step": 6038 + }, + { + "epoch": 1.82, + "grad_norm": 10.102071762084961, + "learning_rate": 7.896161170692593e-06, + "loss": 1.9421, + "step": 6039 + }, + { + "epoch": 1.82, + "grad_norm": 32.03966522216797, + "learning_rate": 7.894156560088203e-06, + "loss": 1.4121, + "step": 6040 + }, + { + "epoch": 1.82, + "grad_norm": 31.259422302246094, + "learning_rate": 7.892151949483815e-06, + "loss": 1.255, + "step": 6041 + }, + { + "epoch": 1.82, + "grad_norm": 11.89094352722168, + "learning_rate": 7.890147338879423e-06, + "loss": 1.1638, + "step": 6042 + }, + { + "epoch": 1.82, + "grad_norm": 16.84495735168457, + "learning_rate": 7.888142728275033e-06, + "loss": 1.4586, + "step": 6043 + }, + { + "epoch": 1.82, + "grad_norm": 9.196840286254883, + "learning_rate": 7.886138117670643e-06, + "loss": 2.2115, + "step": 6044 + }, + { + "epoch": 1.82, + "grad_norm": 20.001394271850586, + "learning_rate": 7.884133507066253e-06, + "loss": 1.0957, + "step": 6045 + }, + { + "epoch": 1.82, + "grad_norm": 30.092775344848633, + "learning_rate": 7.882128896461862e-06, + "loss": 1.6151, + "step": 6046 + }, + { + "epoch": 1.82, + "grad_norm": 10.02807331085205, + "learning_rate": 7.880124285857474e-06, + "loss": 0.872, + "step": 6047 + }, + { + "epoch": 1.82, + "grad_norm": 21.552358627319336, + "learning_rate": 7.878119675253084e-06, + "loss": 1.4557, + "step": 6048 + }, + { + "epoch": 1.82, + "grad_norm": 21.00843620300293, + "learning_rate": 7.876115064648692e-06, + "loss": 1.379, + "step": 6049 + }, + { + "epoch": 1.82, + "grad_norm": 16.021032333374023, + "learning_rate": 7.874110454044302e-06, + "loss": 1.8451, + "step": 6050 + }, + { + "epoch": 1.82, + "grad_norm": 14.853699684143066, + "learning_rate": 7.872105843439912e-06, + "loss": 1.125, + "step": 6051 + }, + { + "epoch": 1.82, + "grad_norm": 20.046295166015625, + "learning_rate": 7.870101232835522e-06, + "loss": 1.9366, + "step": 6052 + }, + { + "epoch": 1.82, + "grad_norm": 17.23594856262207, + "learning_rate": 7.868096622231132e-06, + "loss": 1.6949, + "step": 6053 + }, + { + "epoch": 1.82, + "grad_norm": 16.25902557373047, + "learning_rate": 7.866092011626742e-06, + "loss": 1.9581, + "step": 6054 + }, + { + "epoch": 1.82, + "grad_norm": 25.339801788330078, + "learning_rate": 7.864087401022353e-06, + "loss": 1.4696, + "step": 6055 + }, + { + "epoch": 1.82, + "grad_norm": 45.478515625, + "learning_rate": 7.862082790417963e-06, + "loss": 2.0867, + "step": 6056 + }, + { + "epoch": 1.82, + "grad_norm": 19.254344940185547, + "learning_rate": 7.860078179813571e-06, + "loss": 1.6101, + "step": 6057 + }, + { + "epoch": 1.82, + "grad_norm": 23.987213134765625, + "learning_rate": 7.858073569209181e-06, + "loss": 1.1053, + "step": 6058 + }, + { + "epoch": 1.82, + "grad_norm": 33.420997619628906, + "learning_rate": 7.856068958604791e-06, + "loss": 1.3957, + "step": 6059 + }, + { + "epoch": 1.82, + "grad_norm": 53.56259536743164, + "learning_rate": 7.854064348000401e-06, + "loss": 2.8107, + "step": 6060 + }, + { + "epoch": 1.82, + "grad_norm": 14.887017250061035, + "learning_rate": 7.852059737396011e-06, + "loss": 2.4886, + "step": 6061 + }, + { + "epoch": 1.82, + "grad_norm": 15.285018920898438, + "learning_rate": 7.850055126791621e-06, + "loss": 1.1363, + "step": 6062 + }, + { + "epoch": 1.82, + "grad_norm": 16.956491470336914, + "learning_rate": 7.848050516187231e-06, + "loss": 1.4113, + "step": 6063 + }, + { + "epoch": 1.82, + "grad_norm": 22.227479934692383, + "learning_rate": 7.846045905582842e-06, + "loss": 1.2002, + "step": 6064 + }, + { + "epoch": 1.82, + "grad_norm": 44.9412727355957, + "learning_rate": 7.84404129497845e-06, + "loss": 1.583, + "step": 6065 + }, + { + "epoch": 1.82, + "grad_norm": 37.56648254394531, + "learning_rate": 7.842036684374062e-06, + "loss": 2.022, + "step": 6066 + }, + { + "epoch": 1.82, + "grad_norm": 15.846146583557129, + "learning_rate": 7.840032073769672e-06, + "loss": 0.9523, + "step": 6067 + }, + { + "epoch": 1.82, + "grad_norm": 18.317182540893555, + "learning_rate": 7.83802746316528e-06, + "loss": 1.5857, + "step": 6068 + }, + { + "epoch": 1.82, + "grad_norm": 14.531113624572754, + "learning_rate": 7.83602285256089e-06, + "loss": 1.2722, + "step": 6069 + }, + { + "epoch": 1.83, + "grad_norm": 9.223918914794922, + "learning_rate": 7.8340182419565e-06, + "loss": 1.6124, + "step": 6070 + }, + { + "epoch": 1.83, + "grad_norm": 12.137374877929688, + "learning_rate": 7.83201363135211e-06, + "loss": 1.0094, + "step": 6071 + }, + { + "epoch": 1.83, + "grad_norm": 8.65196418762207, + "learning_rate": 7.83000902074772e-06, + "loss": 0.9413, + "step": 6072 + }, + { + "epoch": 1.83, + "grad_norm": 16.220958709716797, + "learning_rate": 7.82800441014333e-06, + "loss": 2.1056, + "step": 6073 + }, + { + "epoch": 1.83, + "grad_norm": 19.515766143798828, + "learning_rate": 7.82599979953894e-06, + "loss": 1.763, + "step": 6074 + }, + { + "epoch": 1.83, + "grad_norm": 11.028536796569824, + "learning_rate": 7.82399518893455e-06, + "loss": 1.577, + "step": 6075 + }, + { + "epoch": 1.83, + "grad_norm": 14.080721855163574, + "learning_rate": 7.82199057833016e-06, + "loss": 1.2928, + "step": 6076 + }, + { + "epoch": 1.83, + "grad_norm": 22.41744041442871, + "learning_rate": 7.81998596772577e-06, + "loss": 1.6717, + "step": 6077 + }, + { + "epoch": 1.83, + "grad_norm": 29.117725372314453, + "learning_rate": 7.817981357121381e-06, + "loss": 2.0723, + "step": 6078 + }, + { + "epoch": 1.83, + "grad_norm": 10.062914848327637, + "learning_rate": 7.81597674651699e-06, + "loss": 0.9723, + "step": 6079 + }, + { + "epoch": 1.83, + "grad_norm": 19.473957061767578, + "learning_rate": 7.8139721359126e-06, + "loss": 1.6513, + "step": 6080 + }, + { + "epoch": 1.83, + "grad_norm": 13.241780281066895, + "learning_rate": 7.81196752530821e-06, + "loss": 1.4396, + "step": 6081 + }, + { + "epoch": 1.83, + "grad_norm": 50.16336441040039, + "learning_rate": 7.80996291470382e-06, + "loss": 1.4343, + "step": 6082 + }, + { + "epoch": 1.83, + "grad_norm": 8.716567039489746, + "learning_rate": 7.807958304099428e-06, + "loss": 0.7952, + "step": 6083 + }, + { + "epoch": 1.83, + "grad_norm": 17.933069229125977, + "learning_rate": 7.80595369349504e-06, + "loss": 2.706, + "step": 6084 + }, + { + "epoch": 1.83, + "grad_norm": 30.77072525024414, + "learning_rate": 7.80394908289065e-06, + "loss": 1.9985, + "step": 6085 + }, + { + "epoch": 1.83, + "grad_norm": 13.682772636413574, + "learning_rate": 7.801944472286258e-06, + "loss": 0.9191, + "step": 6086 + }, + { + "epoch": 1.83, + "grad_norm": 17.457841873168945, + "learning_rate": 7.799939861681868e-06, + "loss": 1.518, + "step": 6087 + }, + { + "epoch": 1.83, + "grad_norm": 41.92538833618164, + "learning_rate": 7.797935251077479e-06, + "loss": 2.1136, + "step": 6088 + }, + { + "epoch": 1.83, + "grad_norm": 22.720664978027344, + "learning_rate": 7.795930640473089e-06, + "loss": 1.8965, + "step": 6089 + }, + { + "epoch": 1.83, + "grad_norm": 13.168417930603027, + "learning_rate": 7.793926029868699e-06, + "loss": 1.3237, + "step": 6090 + }, + { + "epoch": 1.83, + "grad_norm": 22.606231689453125, + "learning_rate": 7.791921419264309e-06, + "loss": 1.6433, + "step": 6091 + }, + { + "epoch": 1.83, + "grad_norm": 22.04924964904785, + "learning_rate": 7.789916808659919e-06, + "loss": 1.9051, + "step": 6092 + }, + { + "epoch": 1.83, + "grad_norm": 12.943755149841309, + "learning_rate": 7.787912198055529e-06, + "loss": 1.3566, + "step": 6093 + }, + { + "epoch": 1.83, + "grad_norm": 8.388080596923828, + "learning_rate": 7.785907587451137e-06, + "loss": 1.0533, + "step": 6094 + }, + { + "epoch": 1.83, + "grad_norm": 39.58984375, + "learning_rate": 7.783902976846747e-06, + "loss": 2.1753, + "step": 6095 + }, + { + "epoch": 1.83, + "grad_norm": 17.842021942138672, + "learning_rate": 7.781898366242357e-06, + "loss": 0.9146, + "step": 6096 + }, + { + "epoch": 1.83, + "grad_norm": 12.273473739624023, + "learning_rate": 7.779893755637968e-06, + "loss": 1.0507, + "step": 6097 + }, + { + "epoch": 1.83, + "grad_norm": 34.619178771972656, + "learning_rate": 7.777889145033578e-06, + "loss": 2.2011, + "step": 6098 + }, + { + "epoch": 1.83, + "grad_norm": 87.12923431396484, + "learning_rate": 7.775884534429188e-06, + "loss": 1.4831, + "step": 6099 + }, + { + "epoch": 1.83, + "grad_norm": 9.995388984680176, + "learning_rate": 7.773879923824798e-06, + "loss": 1.2102, + "step": 6100 + }, + { + "epoch": 1.83, + "grad_norm": 13.057961463928223, + "learning_rate": 7.771875313220408e-06, + "loss": 1.3872, + "step": 6101 + }, + { + "epoch": 1.83, + "grad_norm": 8.191656112670898, + "learning_rate": 7.769870702616016e-06, + "loss": 1.3387, + "step": 6102 + }, + { + "epoch": 1.83, + "grad_norm": 25.005573272705078, + "learning_rate": 7.767866092011628e-06, + "loss": 1.393, + "step": 6103 + }, + { + "epoch": 1.84, + "grad_norm": 11.448406219482422, + "learning_rate": 7.765861481407238e-06, + "loss": 1.2107, + "step": 6104 + }, + { + "epoch": 1.84, + "grad_norm": 10.444814682006836, + "learning_rate": 7.763856870802847e-06, + "loss": 1.1085, + "step": 6105 + }, + { + "epoch": 1.84, + "grad_norm": 22.36345863342285, + "learning_rate": 7.761852260198457e-06, + "loss": 1.4649, + "step": 6106 + }, + { + "epoch": 1.84, + "grad_norm": 12.638916015625, + "learning_rate": 7.759847649594067e-06, + "loss": 1.0798, + "step": 6107 + }, + { + "epoch": 1.84, + "grad_norm": 30.909666061401367, + "learning_rate": 7.757843038989677e-06, + "loss": 1.3542, + "step": 6108 + }, + { + "epoch": 1.84, + "grad_norm": 21.984508514404297, + "learning_rate": 7.755838428385287e-06, + "loss": 1.5787, + "step": 6109 + }, + { + "epoch": 1.84, + "grad_norm": 28.869102478027344, + "learning_rate": 7.753833817780897e-06, + "loss": 1.8047, + "step": 6110 + }, + { + "epoch": 1.84, + "grad_norm": 8.794825553894043, + "learning_rate": 7.751829207176507e-06, + "loss": 0.4721, + "step": 6111 + }, + { + "epoch": 1.84, + "grad_norm": 20.192941665649414, + "learning_rate": 7.749824596572117e-06, + "loss": 1.8928, + "step": 6112 + }, + { + "epoch": 1.84, + "grad_norm": 8.28946304321289, + "learning_rate": 7.747819985967726e-06, + "loss": 0.8887, + "step": 6113 + }, + { + "epoch": 1.84, + "grad_norm": 7.44697380065918, + "learning_rate": 7.745815375363336e-06, + "loss": 1.0449, + "step": 6114 + }, + { + "epoch": 1.84, + "grad_norm": 47.45841598510742, + "learning_rate": 7.743810764758947e-06, + "loss": 1.8076, + "step": 6115 + }, + { + "epoch": 1.84, + "grad_norm": 22.896045684814453, + "learning_rate": 7.741806154154556e-06, + "loss": 1.3109, + "step": 6116 + }, + { + "epoch": 1.84, + "grad_norm": 51.84198760986328, + "learning_rate": 7.739801543550166e-06, + "loss": 1.9647, + "step": 6117 + }, + { + "epoch": 1.84, + "grad_norm": 16.364362716674805, + "learning_rate": 7.737796932945776e-06, + "loss": 1.6399, + "step": 6118 + }, + { + "epoch": 1.84, + "grad_norm": 14.368910789489746, + "learning_rate": 7.735792322341386e-06, + "loss": 1.0631, + "step": 6119 + }, + { + "epoch": 1.84, + "grad_norm": 11.560124397277832, + "learning_rate": 7.733787711736994e-06, + "loss": 1.6401, + "step": 6120 + }, + { + "epoch": 1.84, + "eval_loss": 0.1900317370891571, + "eval_runtime": 43.601, + "eval_samples_per_second": 33.921, + "eval_steps_per_second": 33.921, + "step": 6120 + }, + { + "epoch": 1.84, + "grad_norm": 11.354382514953613, + "learning_rate": 7.731783101132606e-06, + "loss": 1.3231, + "step": 6121 + }, + { + "epoch": 1.84, + "grad_norm": 8.842697143554688, + "learning_rate": 7.729778490528216e-06, + "loss": 0.5822, + "step": 6122 + }, + { + "epoch": 1.84, + "grad_norm": 44.98017501831055, + "learning_rate": 7.727773879923826e-06, + "loss": 1.6967, + "step": 6123 + }, + { + "epoch": 1.84, + "grad_norm": 29.9891414642334, + "learning_rate": 7.725769269319435e-06, + "loss": 1.5777, + "step": 6124 + }, + { + "epoch": 1.84, + "grad_norm": 25.591136932373047, + "learning_rate": 7.723764658715045e-06, + "loss": 0.9665, + "step": 6125 + }, + { + "epoch": 1.84, + "grad_norm": 13.819255828857422, + "learning_rate": 7.721760048110655e-06, + "loss": 2.2739, + "step": 6126 + }, + { + "epoch": 1.84, + "grad_norm": 28.475740432739258, + "learning_rate": 7.719755437506265e-06, + "loss": 2.247, + "step": 6127 + }, + { + "epoch": 1.84, + "grad_norm": 12.238512992858887, + "learning_rate": 7.717750826901875e-06, + "loss": 0.8953, + "step": 6128 + }, + { + "epoch": 1.84, + "grad_norm": 35.80490493774414, + "learning_rate": 7.715746216297485e-06, + "loss": 3.4695, + "step": 6129 + }, + { + "epoch": 1.84, + "grad_norm": 7.781877040863037, + "learning_rate": 7.713741605693095e-06, + "loss": 1.8642, + "step": 6130 + }, + { + "epoch": 1.84, + "grad_norm": 13.301283836364746, + "learning_rate": 7.711736995088704e-06, + "loss": 1.3696, + "step": 6131 + }, + { + "epoch": 1.84, + "grad_norm": 10.512940406799316, + "learning_rate": 7.709732384484314e-06, + "loss": 0.6859, + "step": 6132 + }, + { + "epoch": 1.84, + "grad_norm": 25.800260543823242, + "learning_rate": 7.707727773879926e-06, + "loss": 1.9089, + "step": 6133 + }, + { + "epoch": 1.84, + "grad_norm": 15.625998497009277, + "learning_rate": 7.705723163275534e-06, + "loss": 1.8234, + "step": 6134 + }, + { + "epoch": 1.84, + "grad_norm": 42.901615142822266, + "learning_rate": 7.703718552671144e-06, + "loss": 2.0355, + "step": 6135 + }, + { + "epoch": 1.84, + "grad_norm": 15.231833457946777, + "learning_rate": 7.701713942066754e-06, + "loss": 1.1297, + "step": 6136 + }, + { + "epoch": 1.85, + "grad_norm": 21.943153381347656, + "learning_rate": 7.699709331462364e-06, + "loss": 1.2517, + "step": 6137 + }, + { + "epoch": 1.85, + "grad_norm": 22.251258850097656, + "learning_rate": 7.697704720857974e-06, + "loss": 1.5025, + "step": 6138 + }, + { + "epoch": 1.85, + "grad_norm": 15.252081871032715, + "learning_rate": 7.695700110253583e-06, + "loss": 1.0183, + "step": 6139 + }, + { + "epoch": 1.85, + "grad_norm": 26.634531021118164, + "learning_rate": 7.693695499649194e-06, + "loss": 1.5842, + "step": 6140 + }, + { + "epoch": 1.85, + "grad_norm": 29.84535789489746, + "learning_rate": 7.691690889044805e-06, + "loss": 1.862, + "step": 6141 + }, + { + "epoch": 1.85, + "grad_norm": 24.699621200561523, + "learning_rate": 7.689686278440413e-06, + "loss": 1.8952, + "step": 6142 + }, + { + "epoch": 1.85, + "grad_norm": 26.514081954956055, + "learning_rate": 7.687681667836023e-06, + "loss": 1.948, + "step": 6143 + }, + { + "epoch": 1.85, + "grad_norm": 11.536233901977539, + "learning_rate": 7.685677057231633e-06, + "loss": 2.2692, + "step": 6144 + }, + { + "epoch": 1.85, + "grad_norm": 30.69923973083496, + "learning_rate": 7.683672446627243e-06, + "loss": 1.2475, + "step": 6145 + }, + { + "epoch": 1.85, + "grad_norm": 19.856365203857422, + "learning_rate": 7.681667836022853e-06, + "loss": 1.361, + "step": 6146 + }, + { + "epoch": 1.85, + "grad_norm": 10.169954299926758, + "learning_rate": 7.679663225418463e-06, + "loss": 1.3548, + "step": 6147 + }, + { + "epoch": 1.85, + "grad_norm": 36.704524993896484, + "learning_rate": 7.677658614814073e-06, + "loss": 1.8048, + "step": 6148 + }, + { + "epoch": 1.85, + "grad_norm": 14.586971282958984, + "learning_rate": 7.675654004209683e-06, + "loss": 1.6924, + "step": 6149 + }, + { + "epoch": 1.85, + "grad_norm": 31.322877883911133, + "learning_rate": 7.673649393605292e-06, + "loss": 2.1292, + "step": 6150 + }, + { + "epoch": 1.85, + "grad_norm": 13.903470993041992, + "learning_rate": 7.671644783000902e-06, + "loss": 1.5199, + "step": 6151 + }, + { + "epoch": 1.85, + "grad_norm": 206.5603485107422, + "learning_rate": 7.669640172396514e-06, + "loss": 2.1998, + "step": 6152 + }, + { + "epoch": 1.85, + "grad_norm": 42.4091682434082, + "learning_rate": 7.667635561792122e-06, + "loss": 1.8714, + "step": 6153 + }, + { + "epoch": 1.85, + "grad_norm": 10.952982902526855, + "learning_rate": 7.665630951187732e-06, + "loss": 1.7223, + "step": 6154 + }, + { + "epoch": 1.85, + "grad_norm": 12.478805541992188, + "learning_rate": 7.663626340583342e-06, + "loss": 0.9292, + "step": 6155 + }, + { + "epoch": 1.85, + "grad_norm": 45.16120910644531, + "learning_rate": 7.661621729978952e-06, + "loss": 1.1461, + "step": 6156 + }, + { + "epoch": 1.85, + "grad_norm": 11.00290298461914, + "learning_rate": 7.659617119374562e-06, + "loss": 1.3505, + "step": 6157 + }, + { + "epoch": 1.85, + "grad_norm": 12.987632751464844, + "learning_rate": 7.657612508770173e-06, + "loss": 1.1632, + "step": 6158 + }, + { + "epoch": 1.85, + "grad_norm": 23.716419219970703, + "learning_rate": 7.655607898165783e-06, + "loss": 1.9068, + "step": 6159 + }, + { + "epoch": 1.85, + "grad_norm": 30.917993545532227, + "learning_rate": 7.653603287561393e-06, + "loss": 1.9522, + "step": 6160 + }, + { + "epoch": 1.85, + "grad_norm": 9.036660194396973, + "learning_rate": 7.651598676957001e-06, + "loss": 1.1854, + "step": 6161 + }, + { + "epoch": 1.85, + "grad_norm": 53.36625289916992, + "learning_rate": 7.649594066352611e-06, + "loss": 1.6606, + "step": 6162 + }, + { + "epoch": 1.85, + "grad_norm": 23.467453002929688, + "learning_rate": 7.647589455748221e-06, + "loss": 1.7637, + "step": 6163 + }, + { + "epoch": 1.85, + "grad_norm": 16.767295837402344, + "learning_rate": 7.645584845143831e-06, + "loss": 1.3892, + "step": 6164 + }, + { + "epoch": 1.85, + "grad_norm": 21.95707893371582, + "learning_rate": 7.643580234539441e-06, + "loss": 2.415, + "step": 6165 + }, + { + "epoch": 1.85, + "grad_norm": 14.439309120178223, + "learning_rate": 7.641575623935052e-06, + "loss": 1.575, + "step": 6166 + }, + { + "epoch": 1.85, + "grad_norm": 8.983049392700195, + "learning_rate": 7.639571013330662e-06, + "loss": 1.2169, + "step": 6167 + }, + { + "epoch": 1.85, + "grad_norm": 15.929500579833984, + "learning_rate": 7.63756640272627e-06, + "loss": 1.3968, + "step": 6168 + }, + { + "epoch": 1.85, + "grad_norm": 16.895946502685547, + "learning_rate": 7.63556179212188e-06, + "loss": 1.773, + "step": 6169 + }, + { + "epoch": 1.86, + "grad_norm": 11.67673110961914, + "learning_rate": 7.633557181517492e-06, + "loss": 0.8359, + "step": 6170 + }, + { + "epoch": 1.86, + "grad_norm": 12.90839958190918, + "learning_rate": 7.6315525709131e-06, + "loss": 1.2435, + "step": 6171 + }, + { + "epoch": 1.86, + "grad_norm": 50.556400299072266, + "learning_rate": 7.62954796030871e-06, + "loss": 3.8311, + "step": 6172 + }, + { + "epoch": 1.86, + "grad_norm": 9.137083053588867, + "learning_rate": 7.6275433497043204e-06, + "loss": 0.9147, + "step": 6173 + }, + { + "epoch": 1.86, + "grad_norm": 39.34935760498047, + "learning_rate": 7.6255387390999305e-06, + "loss": 2.6975, + "step": 6174 + }, + { + "epoch": 1.86, + "grad_norm": 14.056702613830566, + "learning_rate": 7.62353412849554e-06, + "loss": 1.3314, + "step": 6175 + }, + { + "epoch": 1.86, + "grad_norm": 14.793180465698242, + "learning_rate": 7.621529517891151e-06, + "loss": 1.8193, + "step": 6176 + }, + { + "epoch": 1.86, + "grad_norm": 6.99417781829834, + "learning_rate": 7.619524907286761e-06, + "loss": 0.9592, + "step": 6177 + }, + { + "epoch": 1.86, + "grad_norm": 8.927657127380371, + "learning_rate": 7.61752029668237e-06, + "loss": 0.7251, + "step": 6178 + }, + { + "epoch": 1.86, + "grad_norm": 22.89257049560547, + "learning_rate": 7.61551568607798e-06, + "loss": 2.0944, + "step": 6179 + }, + { + "epoch": 1.86, + "grad_norm": 33.927913665771484, + "learning_rate": 7.613511075473589e-06, + "loss": 2.6198, + "step": 6180 + }, + { + "epoch": 1.86, + "grad_norm": 18.200057983398438, + "learning_rate": 7.611506464869199e-06, + "loss": 1.3679, + "step": 6181 + }, + { + "epoch": 1.86, + "grad_norm": 17.662506103515625, + "learning_rate": 7.6095018542648095e-06, + "loss": 2.0992, + "step": 6182 + }, + { + "epoch": 1.86, + "grad_norm": 11.235939979553223, + "learning_rate": 7.60749724366042e-06, + "loss": 1.4116, + "step": 6183 + }, + { + "epoch": 1.86, + "grad_norm": 19.397539138793945, + "learning_rate": 7.60549263305603e-06, + "loss": 1.3618, + "step": 6184 + }, + { + "epoch": 1.86, + "grad_norm": 10.240893363952637, + "learning_rate": 7.60348802245164e-06, + "loss": 1.0369, + "step": 6185 + }, + { + "epoch": 1.86, + "grad_norm": 14.981084823608398, + "learning_rate": 7.601483411847249e-06, + "loss": 0.8672, + "step": 6186 + }, + { + "epoch": 1.86, + "grad_norm": 12.943142890930176, + "learning_rate": 7.599478801242859e-06, + "loss": 1.19, + "step": 6187 + }, + { + "epoch": 1.86, + "grad_norm": 15.80017375946045, + "learning_rate": 7.597474190638468e-06, + "loss": 1.2749, + "step": 6188 + }, + { + "epoch": 1.86, + "grad_norm": 15.71901798248291, + "learning_rate": 7.595469580034079e-06, + "loss": 1.0749, + "step": 6189 + }, + { + "epoch": 1.86, + "grad_norm": 13.598134994506836, + "learning_rate": 7.593464969429689e-06, + "loss": 1.2334, + "step": 6190 + }, + { + "epoch": 1.86, + "grad_norm": 13.935446739196777, + "learning_rate": 7.5914603588252986e-06, + "loss": 1.2212, + "step": 6191 + }, + { + "epoch": 1.86, + "grad_norm": 28.57442855834961, + "learning_rate": 7.589455748220909e-06, + "loss": 2.0672, + "step": 6192 + }, + { + "epoch": 1.86, + "grad_norm": 35.023197174072266, + "learning_rate": 7.587451137616518e-06, + "loss": 1.8612, + "step": 6193 + }, + { + "epoch": 1.86, + "grad_norm": 11.907183647155762, + "learning_rate": 7.585446527012128e-06, + "loss": 1.6789, + "step": 6194 + }, + { + "epoch": 1.86, + "grad_norm": 18.430017471313477, + "learning_rate": 7.583441916407739e-06, + "loss": 1.3619, + "step": 6195 + }, + { + "epoch": 1.86, + "grad_norm": 29.030160903930664, + "learning_rate": 7.581437305803349e-06, + "loss": 1.116, + "step": 6196 + }, + { + "epoch": 1.86, + "grad_norm": 18.958419799804688, + "learning_rate": 7.579432695198958e-06, + "loss": 1.6912, + "step": 6197 + }, + { + "epoch": 1.86, + "grad_norm": 19.51006317138672, + "learning_rate": 7.577428084594568e-06, + "loss": 1.6718, + "step": 6198 + }, + { + "epoch": 1.86, + "grad_norm": 13.113887786865234, + "learning_rate": 7.5754234739901775e-06, + "loss": 1.2158, + "step": 6199 + }, + { + "epoch": 1.86, + "grad_norm": 13.66335678100586, + "learning_rate": 7.573418863385788e-06, + "loss": 0.8829, + "step": 6200 + }, + { + "epoch": 1.86, + "grad_norm": 12.062017440795898, + "learning_rate": 7.5714142527813986e-06, + "loss": 1.7451, + "step": 6201 + }, + { + "epoch": 1.86, + "grad_norm": 12.591232299804688, + "learning_rate": 7.569409642177008e-06, + "loss": 1.4194, + "step": 6202 + }, + { + "epoch": 1.87, + "grad_norm": 9.007003784179688, + "learning_rate": 7.567405031572618e-06, + "loss": 0.6593, + "step": 6203 + }, + { + "epoch": 1.87, + "grad_norm": 15.050545692443848, + "learning_rate": 7.565400420968227e-06, + "loss": 1.2698, + "step": 6204 + }, + { + "epoch": 1.87, + "grad_norm": 53.996299743652344, + "learning_rate": 7.563395810363837e-06, + "loss": 2.5506, + "step": 6205 + }, + { + "epoch": 1.87, + "grad_norm": 16.08456039428711, + "learning_rate": 7.561391199759447e-06, + "loss": 0.7814, + "step": 6206 + }, + { + "epoch": 1.87, + "grad_norm": 19.180896759033203, + "learning_rate": 7.559386589155057e-06, + "loss": 2.2722, + "step": 6207 + }, + { + "epoch": 1.87, + "grad_norm": 12.53034782409668, + "learning_rate": 7.5573819785506675e-06, + "loss": 1.113, + "step": 6208 + }, + { + "epoch": 1.87, + "grad_norm": 12.597771644592285, + "learning_rate": 7.5553773679462775e-06, + "loss": 1.7598, + "step": 6209 + }, + { + "epoch": 1.87, + "grad_norm": 8.483002662658691, + "learning_rate": 7.553372757341887e-06, + "loss": 0.8353, + "step": 6210 + }, + { + "epoch": 1.87, + "grad_norm": 9.132856369018555, + "learning_rate": 7.551368146737497e-06, + "loss": 1.0596, + "step": 6211 + }, + { + "epoch": 1.87, + "grad_norm": 12.06258773803711, + "learning_rate": 7.549363536133106e-06, + "loss": 1.3816, + "step": 6212 + }, + { + "epoch": 1.87, + "grad_norm": 12.495207786560059, + "learning_rate": 7.547358925528717e-06, + "loss": 2.0172, + "step": 6213 + }, + { + "epoch": 1.87, + "grad_norm": 15.568368911743164, + "learning_rate": 7.545354314924327e-06, + "loss": 2.3213, + "step": 6214 + }, + { + "epoch": 1.87, + "grad_norm": 57.913516998291016, + "learning_rate": 7.543349704319936e-06, + "loss": 1.5673, + "step": 6215 + }, + { + "epoch": 1.87, + "grad_norm": 11.138136863708496, + "learning_rate": 7.5413450937155464e-06, + "loss": 1.1155, + "step": 6216 + }, + { + "epoch": 1.87, + "grad_norm": 36.35742950439453, + "learning_rate": 7.539340483111156e-06, + "loss": 0.877, + "step": 6217 + }, + { + "epoch": 1.87, + "grad_norm": 11.55097484588623, + "learning_rate": 7.537335872506766e-06, + "loss": 0.539, + "step": 6218 + }, + { + "epoch": 1.87, + "grad_norm": 18.280834197998047, + "learning_rate": 7.535331261902377e-06, + "loss": 1.5513, + "step": 6219 + }, + { + "epoch": 1.87, + "grad_norm": 30.530216217041016, + "learning_rate": 7.533326651297986e-06, + "loss": 1.9069, + "step": 6220 + }, + { + "epoch": 1.87, + "grad_norm": 28.866329193115234, + "learning_rate": 7.531322040693596e-06, + "loss": 1.4386, + "step": 6221 + }, + { + "epoch": 1.87, + "grad_norm": 22.6090145111084, + "learning_rate": 7.529317430089206e-06, + "loss": 2.3103, + "step": 6222 + }, + { + "epoch": 1.87, + "grad_norm": 9.455753326416016, + "learning_rate": 7.527312819484815e-06, + "loss": 1.0053, + "step": 6223 + }, + { + "epoch": 1.87, + "grad_norm": 47.22819519042969, + "learning_rate": 7.525308208880425e-06, + "loss": 2.0746, + "step": 6224 + }, + { + "epoch": 1.87, + "grad_norm": 16.05484962463379, + "learning_rate": 7.523303598276035e-06, + "loss": 1.4145, + "step": 6225 + }, + { + "epoch": 1.87, + "grad_norm": 23.540828704833984, + "learning_rate": 7.521298987671646e-06, + "loss": 1.1681, + "step": 6226 + }, + { + "epoch": 1.87, + "grad_norm": 14.86224365234375, + "learning_rate": 7.519294377067256e-06, + "loss": 1.6844, + "step": 6227 + }, + { + "epoch": 1.87, + "grad_norm": 76.27838134765625, + "learning_rate": 7.517289766462865e-06, + "loss": 1.9908, + "step": 6228 + }, + { + "epoch": 1.87, + "grad_norm": 18.715179443359375, + "learning_rate": 7.515285155858475e-06, + "loss": 1.2129, + "step": 6229 + }, + { + "epoch": 1.87, + "grad_norm": 16.809297561645508, + "learning_rate": 7.513280545254085e-06, + "loss": 1.1011, + "step": 6230 + }, + { + "epoch": 1.87, + "grad_norm": 8.646817207336426, + "learning_rate": 7.511275934649694e-06, + "loss": 1.5495, + "step": 6231 + }, + { + "epoch": 1.87, + "grad_norm": 35.9256591796875, + "learning_rate": 7.509271324045305e-06, + "loss": 1.7407, + "step": 6232 + }, + { + "epoch": 1.87, + "grad_norm": 8.104480743408203, + "learning_rate": 7.507266713440915e-06, + "loss": 0.5633, + "step": 6233 + }, + { + "epoch": 1.87, + "grad_norm": 10.297518730163574, + "learning_rate": 7.5052621028365246e-06, + "loss": 1.2832, + "step": 6234 + }, + { + "epoch": 1.87, + "grad_norm": 15.735064506530762, + "learning_rate": 7.503257492232135e-06, + "loss": 1.4141, + "step": 6235 + }, + { + "epoch": 1.87, + "grad_norm": 12.001105308532715, + "learning_rate": 7.501252881627744e-06, + "loss": 1.4383, + "step": 6236 + }, + { + "epoch": 1.88, + "grad_norm": 15.454205513000488, + "learning_rate": 7.499248271023354e-06, + "loss": 1.0, + "step": 6237 + }, + { + "epoch": 1.88, + "grad_norm": 19.108205795288086, + "learning_rate": 7.497243660418965e-06, + "loss": 1.6183, + "step": 6238 + }, + { + "epoch": 1.88, + "grad_norm": 38.10220718383789, + "learning_rate": 7.495239049814574e-06, + "loss": 2.5108, + "step": 6239 + }, + { + "epoch": 1.88, + "grad_norm": 10.768503189086914, + "learning_rate": 7.493234439210184e-06, + "loss": 1.4355, + "step": 6240 + }, + { + "epoch": 1.88, + "eval_loss": 0.19893887639045715, + "eval_runtime": 43.6319, + "eval_samples_per_second": 33.897, + "eval_steps_per_second": 33.897, + "step": 6240 + }, + { + "epoch": 1.88, + "grad_norm": 7.937902450561523, + "learning_rate": 7.4912298286057935e-06, + "loss": 0.9729, + "step": 6241 + }, + { + "epoch": 1.88, + "grad_norm": 10.797032356262207, + "learning_rate": 7.4892252180014035e-06, + "loss": 0.9792, + "step": 6242 + }, + { + "epoch": 1.88, + "grad_norm": 8.380309104919434, + "learning_rate": 7.487220607397014e-06, + "loss": 0.7776, + "step": 6243 + }, + { + "epoch": 1.88, + "grad_norm": 10.550333023071289, + "learning_rate": 7.485215996792624e-06, + "loss": 1.1951, + "step": 6244 + }, + { + "epoch": 1.88, + "grad_norm": 16.15055274963379, + "learning_rate": 7.483211386188234e-06, + "loss": 0.9995, + "step": 6245 + }, + { + "epoch": 1.88, + "grad_norm": 18.002609252929688, + "learning_rate": 7.481206775583844e-06, + "loss": 1.6159, + "step": 6246 + }, + { + "epoch": 1.88, + "grad_norm": 19.96839714050293, + "learning_rate": 7.479202164979453e-06, + "loss": 1.9925, + "step": 6247 + }, + { + "epoch": 1.88, + "grad_norm": 22.932233810424805, + "learning_rate": 7.477197554375063e-06, + "loss": 1.5709, + "step": 6248 + }, + { + "epoch": 1.88, + "grad_norm": 10.494919776916504, + "learning_rate": 7.4751929437706724e-06, + "loss": 1.1168, + "step": 6249 + }, + { + "epoch": 1.88, + "grad_norm": 37.70262145996094, + "learning_rate": 7.473188333166283e-06, + "loss": 1.6439, + "step": 6250 + }, + { + "epoch": 1.88, + "grad_norm": 14.26046085357666, + "learning_rate": 7.4711837225618935e-06, + "loss": 1.4012, + "step": 6251 + }, + { + "epoch": 1.88, + "grad_norm": 19.355712890625, + "learning_rate": 7.469179111957503e-06, + "loss": 1.3238, + "step": 6252 + }, + { + "epoch": 1.88, + "grad_norm": 43.30799865722656, + "learning_rate": 7.467174501353113e-06, + "loss": 2.3852, + "step": 6253 + }, + { + "epoch": 1.88, + "grad_norm": 17.540157318115234, + "learning_rate": 7.465169890748723e-06, + "loss": 0.9667, + "step": 6254 + }, + { + "epoch": 1.88, + "grad_norm": 14.730655670166016, + "learning_rate": 7.463165280144332e-06, + "loss": 1.454, + "step": 6255 + }, + { + "epoch": 1.88, + "grad_norm": 11.64047908782959, + "learning_rate": 7.461160669539943e-06, + "loss": 1.209, + "step": 6256 + }, + { + "epoch": 1.88, + "grad_norm": 8.219613075256348, + "learning_rate": 7.459156058935553e-06, + "loss": 1.3621, + "step": 6257 + }, + { + "epoch": 1.88, + "grad_norm": 15.764402389526367, + "learning_rate": 7.457151448331162e-06, + "loss": 1.7374, + "step": 6258 + }, + { + "epoch": 1.88, + "grad_norm": 17.738845825195312, + "learning_rate": 7.4551468377267724e-06, + "loss": 1.5129, + "step": 6259 + }, + { + "epoch": 1.88, + "grad_norm": 18.939716339111328, + "learning_rate": 7.453142227122382e-06, + "loss": 1.6715, + "step": 6260 + }, + { + "epoch": 1.88, + "grad_norm": 25.509105682373047, + "learning_rate": 7.451137616517992e-06, + "loss": 1.6497, + "step": 6261 + }, + { + "epoch": 1.88, + "grad_norm": 30.341272354125977, + "learning_rate": 7.449133005913603e-06, + "loss": 2.2224, + "step": 6262 + }, + { + "epoch": 1.88, + "grad_norm": 29.086957931518555, + "learning_rate": 7.447128395309212e-06, + "loss": 2.018, + "step": 6263 + }, + { + "epoch": 1.88, + "grad_norm": 13.672849655151367, + "learning_rate": 7.445123784704822e-06, + "loss": 1.4355, + "step": 6264 + }, + { + "epoch": 1.88, + "grad_norm": 27.003616333007812, + "learning_rate": 7.443119174100431e-06, + "loss": 1.4442, + "step": 6265 + }, + { + "epoch": 1.88, + "grad_norm": 19.808364868164062, + "learning_rate": 7.441114563496041e-06, + "loss": 1.2619, + "step": 6266 + }, + { + "epoch": 1.88, + "grad_norm": 17.011503219604492, + "learning_rate": 7.439109952891651e-06, + "loss": 1.2087, + "step": 6267 + }, + { + "epoch": 1.88, + "grad_norm": 16.227067947387695, + "learning_rate": 7.437105342287261e-06, + "loss": 2.4985, + "step": 6268 + }, + { + "epoch": 1.88, + "grad_norm": 15.978217124938965, + "learning_rate": 7.435100731682872e-06, + "loss": 1.3265, + "step": 6269 + }, + { + "epoch": 1.89, + "grad_norm": 16.302669525146484, + "learning_rate": 7.433096121078482e-06, + "loss": 0.8265, + "step": 6270 + }, + { + "epoch": 1.89, + "grad_norm": 20.792295455932617, + "learning_rate": 7.431091510474091e-06, + "loss": 1.4503, + "step": 6271 + }, + { + "epoch": 1.89, + "grad_norm": 9.446883201599121, + "learning_rate": 7.429086899869701e-06, + "loss": 1.8997, + "step": 6272 + }, + { + "epoch": 1.89, + "grad_norm": 24.59245491027832, + "learning_rate": 7.42708228926531e-06, + "loss": 1.1568, + "step": 6273 + }, + { + "epoch": 1.89, + "grad_norm": 13.666716575622559, + "learning_rate": 7.42507767866092e-06, + "loss": 0.9083, + "step": 6274 + }, + { + "epoch": 1.89, + "grad_norm": 17.118921279907227, + "learning_rate": 7.423073068056531e-06, + "loss": 1.5059, + "step": 6275 + }, + { + "epoch": 1.89, + "grad_norm": 56.16904067993164, + "learning_rate": 7.4210684574521405e-06, + "loss": 3.6439, + "step": 6276 + }, + { + "epoch": 1.89, + "grad_norm": 13.676959037780762, + "learning_rate": 7.4190638468477506e-06, + "loss": 1.6671, + "step": 6277 + }, + { + "epoch": 1.89, + "grad_norm": 10.493898391723633, + "learning_rate": 7.41705923624336e-06, + "loss": 1.4543, + "step": 6278 + }, + { + "epoch": 1.89, + "grad_norm": 22.156248092651367, + "learning_rate": 7.41505462563897e-06, + "loss": 1.5765, + "step": 6279 + }, + { + "epoch": 1.89, + "grad_norm": 15.999430656433105, + "learning_rate": 7.41305001503458e-06, + "loss": 1.7229, + "step": 6280 + }, + { + "epoch": 1.89, + "grad_norm": 10.846489906311035, + "learning_rate": 7.411045404430191e-06, + "loss": 1.9086, + "step": 6281 + }, + { + "epoch": 1.89, + "grad_norm": 17.6227970123291, + "learning_rate": 7.4090407938258e-06, + "loss": 1.2638, + "step": 6282 + }, + { + "epoch": 1.89, + "grad_norm": 16.64601707458496, + "learning_rate": 7.40703618322141e-06, + "loss": 1.2801, + "step": 6283 + }, + { + "epoch": 1.89, + "grad_norm": 18.028026580810547, + "learning_rate": 7.4050315726170195e-06, + "loss": 1.4945, + "step": 6284 + }, + { + "epoch": 1.89, + "grad_norm": 35.858795166015625, + "learning_rate": 7.4030269620126295e-06, + "loss": 1.969, + "step": 6285 + }, + { + "epoch": 1.89, + "grad_norm": 9.260512351989746, + "learning_rate": 7.401022351408239e-06, + "loss": 1.0171, + "step": 6286 + }, + { + "epoch": 1.89, + "grad_norm": 28.248775482177734, + "learning_rate": 7.39901774080385e-06, + "loss": 1.2457, + "step": 6287 + }, + { + "epoch": 1.89, + "grad_norm": 23.339683532714844, + "learning_rate": 7.39701313019946e-06, + "loss": 1.8959, + "step": 6288 + }, + { + "epoch": 1.89, + "grad_norm": 16.68250274658203, + "learning_rate": 7.395008519595069e-06, + "loss": 1.1061, + "step": 6289 + }, + { + "epoch": 1.89, + "grad_norm": 22.26504898071289, + "learning_rate": 7.393003908990679e-06, + "loss": 1.7896, + "step": 6290 + }, + { + "epoch": 1.89, + "grad_norm": 16.31560707092285, + "learning_rate": 7.390999298386289e-06, + "loss": 0.9241, + "step": 6291 + }, + { + "epoch": 1.89, + "grad_norm": 132.97479248046875, + "learning_rate": 7.3889946877818984e-06, + "loss": 2.0392, + "step": 6292 + }, + { + "epoch": 1.89, + "grad_norm": 26.72028350830078, + "learning_rate": 7.386990077177509e-06, + "loss": 1.2421, + "step": 6293 + }, + { + "epoch": 1.89, + "grad_norm": 7.662388324737549, + "learning_rate": 7.3849854665731195e-06, + "loss": 1.324, + "step": 6294 + }, + { + "epoch": 1.89, + "grad_norm": 17.365476608276367, + "learning_rate": 7.382980855968729e-06, + "loss": 1.5321, + "step": 6295 + }, + { + "epoch": 1.89, + "grad_norm": 12.46912956237793, + "learning_rate": 7.380976245364339e-06, + "loss": 1.4461, + "step": 6296 + }, + { + "epoch": 1.89, + "grad_norm": 11.624770164489746, + "learning_rate": 7.378971634759948e-06, + "loss": 1.7608, + "step": 6297 + }, + { + "epoch": 1.89, + "grad_norm": 18.62898826599121, + "learning_rate": 7.376967024155558e-06, + "loss": 1.0102, + "step": 6298 + }, + { + "epoch": 1.89, + "grad_norm": 7.544929504394531, + "learning_rate": 7.374962413551169e-06, + "loss": 0.8058, + "step": 6299 + }, + { + "epoch": 1.89, + "grad_norm": 8.430646896362305, + "learning_rate": 7.372957802946778e-06, + "loss": 1.2943, + "step": 6300 + }, + { + "epoch": 1.89, + "grad_norm": 16.084999084472656, + "learning_rate": 7.370953192342388e-06, + "loss": 1.6322, + "step": 6301 + }, + { + "epoch": 1.89, + "grad_norm": 26.333581924438477, + "learning_rate": 7.368948581737998e-06, + "loss": 1.3346, + "step": 6302 + }, + { + "epoch": 1.9, + "grad_norm": 12.999780654907227, + "learning_rate": 7.366943971133608e-06, + "loss": 1.312, + "step": 6303 + }, + { + "epoch": 1.9, + "grad_norm": 20.878276824951172, + "learning_rate": 7.364939360529218e-06, + "loss": 1.235, + "step": 6304 + }, + { + "epoch": 1.9, + "grad_norm": 39.3919677734375, + "learning_rate": 7.362934749924827e-06, + "loss": 1.3573, + "step": 6305 + }, + { + "epoch": 1.9, + "grad_norm": 42.52452087402344, + "learning_rate": 7.360930139320438e-06, + "loss": 2.1336, + "step": 6306 + }, + { + "epoch": 1.9, + "grad_norm": 10.164920806884766, + "learning_rate": 7.358925528716048e-06, + "loss": 1.1969, + "step": 6307 + }, + { + "epoch": 1.9, + "grad_norm": 14.353740692138672, + "learning_rate": 7.356920918111657e-06, + "loss": 1.6327, + "step": 6308 + }, + { + "epoch": 1.9, + "grad_norm": 11.760319709777832, + "learning_rate": 7.354916307507267e-06, + "loss": 1.8274, + "step": 6309 + }, + { + "epoch": 1.9, + "grad_norm": 65.9858169555664, + "learning_rate": 7.3529116969028766e-06, + "loss": 1.0124, + "step": 6310 + }, + { + "epoch": 1.9, + "grad_norm": 13.734026908874512, + "learning_rate": 7.350907086298487e-06, + "loss": 1.2509, + "step": 6311 + }, + { + "epoch": 1.9, + "grad_norm": 21.561870574951172, + "learning_rate": 7.348902475694098e-06, + "loss": 1.3681, + "step": 6312 + }, + { + "epoch": 1.9, + "grad_norm": 51.21876525878906, + "learning_rate": 7.346897865089707e-06, + "loss": 2.3598, + "step": 6313 + }, + { + "epoch": 1.9, + "grad_norm": 35.55440139770508, + "learning_rate": 7.344893254485317e-06, + "loss": 1.6387, + "step": 6314 + }, + { + "epoch": 1.9, + "grad_norm": 9.630316734313965, + "learning_rate": 7.342888643880927e-06, + "loss": 1.0267, + "step": 6315 + }, + { + "epoch": 1.9, + "grad_norm": 17.24791717529297, + "learning_rate": 7.340884033276536e-06, + "loss": 2.4284, + "step": 6316 + }, + { + "epoch": 1.9, + "grad_norm": 12.127941131591797, + "learning_rate": 7.338879422672146e-06, + "loss": 1.3257, + "step": 6317 + }, + { + "epoch": 1.9, + "grad_norm": 11.26683235168457, + "learning_rate": 7.336874812067757e-06, + "loss": 0.946, + "step": 6318 + }, + { + "epoch": 1.9, + "grad_norm": 7.805086612701416, + "learning_rate": 7.3348702014633665e-06, + "loss": 0.6828, + "step": 6319 + }, + { + "epoch": 1.9, + "grad_norm": 124.16524505615234, + "learning_rate": 7.3328655908589766e-06, + "loss": 2.6128, + "step": 6320 + }, + { + "epoch": 1.9, + "grad_norm": 26.658418655395508, + "learning_rate": 7.330860980254586e-06, + "loss": 1.359, + "step": 6321 + }, + { + "epoch": 1.9, + "grad_norm": 63.66525650024414, + "learning_rate": 7.328856369650196e-06, + "loss": 2.1401, + "step": 6322 + }, + { + "epoch": 1.9, + "grad_norm": 19.522897720336914, + "learning_rate": 7.326851759045805e-06, + "loss": 1.6017, + "step": 6323 + }, + { + "epoch": 1.9, + "grad_norm": 18.54653549194336, + "learning_rate": 7.324847148441416e-06, + "loss": 1.902, + "step": 6324 + }, + { + "epoch": 1.9, + "grad_norm": 40.539337158203125, + "learning_rate": 7.322842537837026e-06, + "loss": 1.6362, + "step": 6325 + }, + { + "epoch": 1.9, + "grad_norm": 24.200485229492188, + "learning_rate": 7.320837927232635e-06, + "loss": 2.5706, + "step": 6326 + }, + { + "epoch": 1.9, + "grad_norm": 38.600364685058594, + "learning_rate": 7.3188333166282455e-06, + "loss": 2.5773, + "step": 6327 + }, + { + "epoch": 1.9, + "grad_norm": 13.316827774047852, + "learning_rate": 7.3168287060238555e-06, + "loss": 1.6111, + "step": 6328 + }, + { + "epoch": 1.9, + "grad_norm": 20.68040657043457, + "learning_rate": 7.314824095419465e-06, + "loss": 1.8822, + "step": 6329 + }, + { + "epoch": 1.9, + "grad_norm": 20.918405532836914, + "learning_rate": 7.312819484815076e-06, + "loss": 0.873, + "step": 6330 + }, + { + "epoch": 1.9, + "grad_norm": 19.907466888427734, + "learning_rate": 7.310814874210686e-06, + "loss": 1.2545, + "step": 6331 + }, + { + "epoch": 1.9, + "grad_norm": 16.205663681030273, + "learning_rate": 7.308810263606295e-06, + "loss": 1.2967, + "step": 6332 + }, + { + "epoch": 1.9, + "grad_norm": 13.830738067626953, + "learning_rate": 7.306805653001905e-06, + "loss": 0.9852, + "step": 6333 + }, + { + "epoch": 1.9, + "grad_norm": 12.994755744934082, + "learning_rate": 7.304801042397514e-06, + "loss": 1.0101, + "step": 6334 + }, + { + "epoch": 1.9, + "grad_norm": 10.922076225280762, + "learning_rate": 7.3027964317931244e-06, + "loss": 1.1132, + "step": 6335 + }, + { + "epoch": 1.9, + "grad_norm": 14.2009859085083, + "learning_rate": 7.300791821188735e-06, + "loss": 1.1271, + "step": 6336 + }, + { + "epoch": 1.91, + "grad_norm": 13.78223705291748, + "learning_rate": 7.298787210584345e-06, + "loss": 1.4629, + "step": 6337 + }, + { + "epoch": 1.91, + "grad_norm": 47.76295852661133, + "learning_rate": 7.296782599979955e-06, + "loss": 1.6983, + "step": 6338 + }, + { + "epoch": 1.91, + "grad_norm": 26.619237899780273, + "learning_rate": 7.294777989375565e-06, + "loss": 2.0114, + "step": 6339 + }, + { + "epoch": 1.91, + "grad_norm": 10.558842658996582, + "learning_rate": 7.292773378771174e-06, + "loss": 0.7375, + "step": 6340 + }, + { + "epoch": 1.91, + "grad_norm": 11.236355781555176, + "learning_rate": 7.290768768166784e-06, + "loss": 1.406, + "step": 6341 + }, + { + "epoch": 1.91, + "grad_norm": 10.12108325958252, + "learning_rate": 7.288764157562395e-06, + "loss": 1.4366, + "step": 6342 + }, + { + "epoch": 1.91, + "grad_norm": 23.887067794799805, + "learning_rate": 7.286759546958004e-06, + "loss": 1.2171, + "step": 6343 + }, + { + "epoch": 1.91, + "grad_norm": 11.277857780456543, + "learning_rate": 7.284754936353614e-06, + "loss": 1.1178, + "step": 6344 + }, + { + "epoch": 1.91, + "grad_norm": 8.617938041687012, + "learning_rate": 7.282750325749224e-06, + "loss": 0.9352, + "step": 6345 + }, + { + "epoch": 1.91, + "grad_norm": 7.352804660797119, + "learning_rate": 7.280745715144834e-06, + "loss": 1.1608, + "step": 6346 + }, + { + "epoch": 1.91, + "grad_norm": 19.099529266357422, + "learning_rate": 7.278741104540443e-06, + "loss": 1.3358, + "step": 6347 + }, + { + "epoch": 1.91, + "grad_norm": 7.905605792999268, + "learning_rate": 7.276736493936053e-06, + "loss": 1.8386, + "step": 6348 + }, + { + "epoch": 1.91, + "grad_norm": 23.328462600708008, + "learning_rate": 7.274731883331664e-06, + "loss": 1.5516, + "step": 6349 + }, + { + "epoch": 1.91, + "grad_norm": 55.610599517822266, + "learning_rate": 7.272727272727273e-06, + "loss": 1.2527, + "step": 6350 + }, + { + "epoch": 1.91, + "grad_norm": 14.633689880371094, + "learning_rate": 7.270722662122883e-06, + "loss": 1.4997, + "step": 6351 + }, + { + "epoch": 1.91, + "grad_norm": 15.966309547424316, + "learning_rate": 7.268718051518493e-06, + "loss": 1.4803, + "step": 6352 + }, + { + "epoch": 1.91, + "grad_norm": 15.084650039672852, + "learning_rate": 7.2667134409141026e-06, + "loss": 0.8466, + "step": 6353 + }, + { + "epoch": 1.91, + "grad_norm": 24.249235153198242, + "learning_rate": 7.264708830309713e-06, + "loss": 1.2289, + "step": 6354 + }, + { + "epoch": 1.91, + "grad_norm": 30.721338272094727, + "learning_rate": 7.262704219705324e-06, + "loss": 1.5891, + "step": 6355 + }, + { + "epoch": 1.91, + "grad_norm": 16.246719360351562, + "learning_rate": 7.260699609100933e-06, + "loss": 1.4035, + "step": 6356 + }, + { + "epoch": 1.91, + "grad_norm": 21.17243194580078, + "learning_rate": 7.258694998496543e-06, + "loss": 1.8486, + "step": 6357 + }, + { + "epoch": 1.91, + "grad_norm": 16.293087005615234, + "learning_rate": 7.256690387892152e-06, + "loss": 0.9408, + "step": 6358 + }, + { + "epoch": 1.91, + "grad_norm": 96.68749237060547, + "learning_rate": 7.254685777287762e-06, + "loss": 2.8577, + "step": 6359 + }, + { + "epoch": 1.91, + "grad_norm": 17.761423110961914, + "learning_rate": 7.2526811666833715e-06, + "loss": 2.8659, + "step": 6360 + }, + { + "epoch": 1.91, + "eval_loss": 0.17964056134223938, + "eval_runtime": 43.926, + "eval_samples_per_second": 33.67, + "eval_steps_per_second": 33.67, + "step": 6360 + }, + { + "epoch": 1.91, + "grad_norm": 23.259288787841797, + "learning_rate": 7.250676556078982e-06, + "loss": 1.1937, + "step": 6361 + }, + { + "epoch": 1.91, + "grad_norm": 28.89937400817871, + "learning_rate": 7.2486719454745925e-06, + "loss": 1.9101, + "step": 6362 + }, + { + "epoch": 1.91, + "grad_norm": 13.6145601272583, + "learning_rate": 7.246667334870202e-06, + "loss": 1.4405, + "step": 6363 + }, + { + "epoch": 1.91, + "grad_norm": 28.463932037353516, + "learning_rate": 7.244662724265812e-06, + "loss": 1.7418, + "step": 6364 + }, + { + "epoch": 1.91, + "grad_norm": 29.748435974121094, + "learning_rate": 7.242658113661422e-06, + "loss": 1.7554, + "step": 6365 + }, + { + "epoch": 1.91, + "grad_norm": 10.47834300994873, + "learning_rate": 7.240653503057031e-06, + "loss": 0.9418, + "step": 6366 + }, + { + "epoch": 1.91, + "grad_norm": 66.0386962890625, + "learning_rate": 7.238648892452642e-06, + "loss": 1.4975, + "step": 6367 + }, + { + "epoch": 1.91, + "grad_norm": 17.965469360351562, + "learning_rate": 7.236644281848252e-06, + "loss": 1.2229, + "step": 6368 + }, + { + "epoch": 1.91, + "grad_norm": 12.599654197692871, + "learning_rate": 7.234639671243861e-06, + "loss": 1.606, + "step": 6369 + }, + { + "epoch": 1.92, + "grad_norm": 17.390832901000977, + "learning_rate": 7.2326350606394715e-06, + "loss": 1.387, + "step": 6370 + }, + { + "epoch": 1.92, + "grad_norm": 15.174606323242188, + "learning_rate": 7.230630450035081e-06, + "loss": 1.3947, + "step": 6371 + }, + { + "epoch": 1.92, + "grad_norm": 16.567323684692383, + "learning_rate": 7.228625839430691e-06, + "loss": 1.5866, + "step": 6372 + }, + { + "epoch": 1.92, + "grad_norm": 33.17914962768555, + "learning_rate": 7.226621228826302e-06, + "loss": 1.8752, + "step": 6373 + }, + { + "epoch": 1.92, + "grad_norm": 18.26946258544922, + "learning_rate": 7.224616618221911e-06, + "loss": 1.1493, + "step": 6374 + }, + { + "epoch": 1.92, + "grad_norm": 26.841148376464844, + "learning_rate": 7.222612007617521e-06, + "loss": 1.86, + "step": 6375 + }, + { + "epoch": 1.92, + "grad_norm": 15.99163818359375, + "learning_rate": 7.220607397013131e-06, + "loss": 1.7793, + "step": 6376 + }, + { + "epoch": 1.92, + "grad_norm": 85.62297821044922, + "learning_rate": 7.21860278640874e-06, + "loss": 2.2498, + "step": 6377 + }, + { + "epoch": 1.92, + "grad_norm": 14.444759368896484, + "learning_rate": 7.2165981758043504e-06, + "loss": 1.0914, + "step": 6378 + }, + { + "epoch": 1.92, + "grad_norm": 14.665372848510742, + "learning_rate": 7.214593565199961e-06, + "loss": 2.3208, + "step": 6379 + }, + { + "epoch": 1.92, + "grad_norm": 6.8967604637146, + "learning_rate": 7.212588954595571e-06, + "loss": 1.0709, + "step": 6380 + }, + { + "epoch": 1.92, + "grad_norm": 27.484949111938477, + "learning_rate": 7.210584343991181e-06, + "loss": 1.6652, + "step": 6381 + }, + { + "epoch": 1.92, + "grad_norm": 36.965274810791016, + "learning_rate": 7.20857973338679e-06, + "loss": 2.1163, + "step": 6382 + }, + { + "epoch": 1.92, + "grad_norm": 16.792247772216797, + "learning_rate": 7.2065751227824e-06, + "loss": 1.4545, + "step": 6383 + }, + { + "epoch": 1.92, + "grad_norm": 18.809303283691406, + "learning_rate": 7.204570512178009e-06, + "loss": 1.9265, + "step": 6384 + }, + { + "epoch": 1.92, + "grad_norm": 23.61911964416504, + "learning_rate": 7.20256590157362e-06, + "loss": 1.4629, + "step": 6385 + }, + { + "epoch": 1.92, + "grad_norm": 15.787665367126465, + "learning_rate": 7.20056129096923e-06, + "loss": 1.5397, + "step": 6386 + }, + { + "epoch": 1.92, + "grad_norm": 25.38530158996582, + "learning_rate": 7.1985566803648395e-06, + "loss": 1.9278, + "step": 6387 + }, + { + "epoch": 1.92, + "grad_norm": 52.69976806640625, + "learning_rate": 7.19655206976045e-06, + "loss": 1.0921, + "step": 6388 + }, + { + "epoch": 1.92, + "grad_norm": 12.26889419555664, + "learning_rate": 7.19454745915606e-06, + "loss": 0.9164, + "step": 6389 + }, + { + "epoch": 1.92, + "grad_norm": 11.600250244140625, + "learning_rate": 7.192542848551669e-06, + "loss": 0.8845, + "step": 6390 + }, + { + "epoch": 1.92, + "grad_norm": 21.00747299194336, + "learning_rate": 7.190538237947279e-06, + "loss": 1.6517, + "step": 6391 + }, + { + "epoch": 1.92, + "grad_norm": 34.60345458984375, + "learning_rate": 7.18853362734289e-06, + "loss": 1.5433, + "step": 6392 + }, + { + "epoch": 1.92, + "grad_norm": 11.045711517333984, + "learning_rate": 7.186529016738499e-06, + "loss": 1.2821, + "step": 6393 + }, + { + "epoch": 1.92, + "grad_norm": 15.181732177734375, + "learning_rate": 7.184524406134109e-06, + "loss": 1.382, + "step": 6394 + }, + { + "epoch": 1.92, + "grad_norm": 15.159256935119629, + "learning_rate": 7.1825197955297185e-06, + "loss": 1.1485, + "step": 6395 + }, + { + "epoch": 1.92, + "grad_norm": 18.364330291748047, + "learning_rate": 7.1805151849253286e-06, + "loss": 1.5439, + "step": 6396 + }, + { + "epoch": 1.92, + "grad_norm": 25.671911239624023, + "learning_rate": 7.178510574320938e-06, + "loss": 1.1944, + "step": 6397 + }, + { + "epoch": 1.92, + "grad_norm": 20.186847686767578, + "learning_rate": 7.176505963716549e-06, + "loss": 1.7509, + "step": 6398 + }, + { + "epoch": 1.92, + "grad_norm": 13.25575065612793, + "learning_rate": 7.174501353112159e-06, + "loss": 0.9348, + "step": 6399 + }, + { + "epoch": 1.92, + "grad_norm": 16.35478401184082, + "learning_rate": 7.172496742507769e-06, + "loss": 1.4031, + "step": 6400 + }, + { + "epoch": 1.92, + "grad_norm": 13.498659133911133, + "learning_rate": 7.170492131903378e-06, + "loss": 1.4828, + "step": 6401 + }, + { + "epoch": 1.92, + "grad_norm": 88.43494415283203, + "learning_rate": 7.168487521298988e-06, + "loss": 1.9878, + "step": 6402 + }, + { + "epoch": 1.93, + "grad_norm": 11.417867660522461, + "learning_rate": 7.1664829106945975e-06, + "loss": 1.5075, + "step": 6403 + }, + { + "epoch": 1.93, + "grad_norm": 12.581781387329102, + "learning_rate": 7.164478300090208e-06, + "loss": 1.1652, + "step": 6404 + }, + { + "epoch": 1.93, + "grad_norm": 19.474443435668945, + "learning_rate": 7.1624736894858185e-06, + "loss": 1.7053, + "step": 6405 + }, + { + "epoch": 1.93, + "grad_norm": 21.434553146362305, + "learning_rate": 7.160469078881428e-06, + "loss": 2.3571, + "step": 6406 + }, + { + "epoch": 1.93, + "grad_norm": 17.52594566345215, + "learning_rate": 7.158464468277038e-06, + "loss": 2.2389, + "step": 6407 + }, + { + "epoch": 1.93, + "grad_norm": 26.64896583557129, + "learning_rate": 7.156459857672647e-06, + "loss": 1.8134, + "step": 6408 + }, + { + "epoch": 1.93, + "grad_norm": 13.084494590759277, + "learning_rate": 7.154455247068257e-06, + "loss": 1.584, + "step": 6409 + }, + { + "epoch": 1.93, + "grad_norm": 12.483121871948242, + "learning_rate": 7.152450636463868e-06, + "loss": 1.8767, + "step": 6410 + }, + { + "epoch": 1.93, + "grad_norm": 8.219230651855469, + "learning_rate": 7.150446025859477e-06, + "loss": 1.177, + "step": 6411 + }, + { + "epoch": 1.93, + "grad_norm": 31.770601272583008, + "learning_rate": 7.148441415255087e-06, + "loss": 1.9082, + "step": 6412 + }, + { + "epoch": 1.93, + "grad_norm": 18.06419563293457, + "learning_rate": 7.1464368046506975e-06, + "loss": 1.2848, + "step": 6413 + }, + { + "epoch": 1.93, + "grad_norm": 6.39808464050293, + "learning_rate": 7.144432194046307e-06, + "loss": 0.4952, + "step": 6414 + }, + { + "epoch": 1.93, + "grad_norm": 16.47151756286621, + "learning_rate": 7.142427583441917e-06, + "loss": 1.4156, + "step": 6415 + }, + { + "epoch": 1.93, + "grad_norm": 35.63877487182617, + "learning_rate": 7.140422972837528e-06, + "loss": 1.6205, + "step": 6416 + }, + { + "epoch": 1.93, + "grad_norm": 16.693328857421875, + "learning_rate": 7.138418362233137e-06, + "loss": 1.1112, + "step": 6417 + }, + { + "epoch": 1.93, + "grad_norm": 34.685909271240234, + "learning_rate": 7.136413751628747e-06, + "loss": 1.704, + "step": 6418 + }, + { + "epoch": 1.93, + "grad_norm": 17.206161499023438, + "learning_rate": 7.134409141024356e-06, + "loss": 1.1891, + "step": 6419 + }, + { + "epoch": 1.93, + "grad_norm": 19.83222770690918, + "learning_rate": 7.132404530419966e-06, + "loss": 2.2121, + "step": 6420 + }, + { + "epoch": 1.93, + "grad_norm": 26.431730270385742, + "learning_rate": 7.130399919815576e-06, + "loss": 1.7221, + "step": 6421 + }, + { + "epoch": 1.93, + "grad_norm": 9.490403175354004, + "learning_rate": 7.1283953092111865e-06, + "loss": 1.1032, + "step": 6422 + }, + { + "epoch": 1.93, + "grad_norm": 52.717281341552734, + "learning_rate": 7.126390698606797e-06, + "loss": 1.5446, + "step": 6423 + }, + { + "epoch": 1.93, + "grad_norm": 14.396468162536621, + "learning_rate": 7.124386088002406e-06, + "loss": 0.762, + "step": 6424 + }, + { + "epoch": 1.93, + "grad_norm": 34.430179595947266, + "learning_rate": 7.122381477398016e-06, + "loss": 2.3349, + "step": 6425 + }, + { + "epoch": 1.93, + "grad_norm": 7.51977014541626, + "learning_rate": 7.120376866793626e-06, + "loss": 0.1446, + "step": 6426 + }, + { + "epoch": 1.93, + "grad_norm": 11.881677627563477, + "learning_rate": 7.118372256189235e-06, + "loss": 1.2101, + "step": 6427 + }, + { + "epoch": 1.93, + "grad_norm": 17.82876968383789, + "learning_rate": 7.116367645584846e-06, + "loss": 1.2283, + "step": 6428 + }, + { + "epoch": 1.93, + "grad_norm": 16.561769485473633, + "learning_rate": 7.114363034980456e-06, + "loss": 2.0483, + "step": 6429 + }, + { + "epoch": 1.93, + "grad_norm": 35.203155517578125, + "learning_rate": 7.1123584243760655e-06, + "loss": 1.7524, + "step": 6430 + }, + { + "epoch": 1.93, + "grad_norm": 24.895706176757812, + "learning_rate": 7.110353813771676e-06, + "loss": 1.6033, + "step": 6431 + }, + { + "epoch": 1.93, + "grad_norm": 23.870237350463867, + "learning_rate": 7.108349203167285e-06, + "loss": 1.4187, + "step": 6432 + }, + { + "epoch": 1.93, + "grad_norm": 24.825237274169922, + "learning_rate": 7.106344592562895e-06, + "loss": 1.594, + "step": 6433 + }, + { + "epoch": 1.93, + "grad_norm": 8.913065910339355, + "learning_rate": 7.104339981958505e-06, + "loss": 1.0598, + "step": 6434 + }, + { + "epoch": 1.93, + "grad_norm": 41.398555755615234, + "learning_rate": 7.102335371354115e-06, + "loss": 2.3157, + "step": 6435 + }, + { + "epoch": 1.94, + "grad_norm": 18.028615951538086, + "learning_rate": 7.100330760749725e-06, + "loss": 2.0904, + "step": 6436 + }, + { + "epoch": 1.94, + "grad_norm": 11.3342866897583, + "learning_rate": 7.098326150145335e-06, + "loss": 1.7827, + "step": 6437 + }, + { + "epoch": 1.94, + "grad_norm": 20.189563751220703, + "learning_rate": 7.0963215395409445e-06, + "loss": 1.6553, + "step": 6438 + }, + { + "epoch": 1.94, + "grad_norm": 12.127889633178711, + "learning_rate": 7.094316928936555e-06, + "loss": 1.4503, + "step": 6439 + }, + { + "epoch": 1.94, + "grad_norm": 37.8315315246582, + "learning_rate": 7.092312318332164e-06, + "loss": 1.7977, + "step": 6440 + }, + { + "epoch": 1.94, + "grad_norm": 13.1519775390625, + "learning_rate": 7.090307707727775e-06, + "loss": 1.6393, + "step": 6441 + }, + { + "epoch": 1.94, + "grad_norm": 35.46371841430664, + "learning_rate": 7.088303097123385e-06, + "loss": 2.3399, + "step": 6442 + }, + { + "epoch": 1.94, + "grad_norm": 12.697428703308105, + "learning_rate": 7.086298486518994e-06, + "loss": 2.5277, + "step": 6443 + }, + { + "epoch": 1.94, + "grad_norm": 26.87334632873535, + "learning_rate": 7.084293875914604e-06, + "loss": 2.0531, + "step": 6444 + }, + { + "epoch": 1.94, + "grad_norm": 14.13779354095459, + "learning_rate": 7.082289265310213e-06, + "loss": 1.8136, + "step": 6445 + }, + { + "epoch": 1.94, + "grad_norm": 12.27436637878418, + "learning_rate": 7.0802846547058235e-06, + "loss": 1.2326, + "step": 6446 + }, + { + "epoch": 1.94, + "grad_norm": 19.918701171875, + "learning_rate": 7.078280044101434e-06, + "loss": 1.4257, + "step": 6447 + }, + { + "epoch": 1.94, + "grad_norm": 13.029349327087402, + "learning_rate": 7.076275433497044e-06, + "loss": 1.7537, + "step": 6448 + }, + { + "epoch": 1.94, + "grad_norm": 9.116921424865723, + "learning_rate": 7.074270822892654e-06, + "loss": 0.7983, + "step": 6449 + }, + { + "epoch": 1.94, + "grad_norm": 30.727689743041992, + "learning_rate": 7.072266212288264e-06, + "loss": 1.2988, + "step": 6450 + }, + { + "epoch": 1.94, + "grad_norm": 15.236931800842285, + "learning_rate": 7.070261601683873e-06, + "loss": 1.4535, + "step": 6451 + }, + { + "epoch": 1.94, + "grad_norm": 38.2266845703125, + "learning_rate": 7.068256991079483e-06, + "loss": 0.8669, + "step": 6452 + }, + { + "epoch": 1.94, + "grad_norm": 42.05986785888672, + "learning_rate": 7.066252380475094e-06, + "loss": 1.7596, + "step": 6453 + }, + { + "epoch": 1.94, + "grad_norm": 28.586145401000977, + "learning_rate": 7.064247769870703e-06, + "loss": 1.823, + "step": 6454 + }, + { + "epoch": 1.94, + "grad_norm": 6.259302616119385, + "learning_rate": 7.062243159266313e-06, + "loss": 0.3312, + "step": 6455 + }, + { + "epoch": 1.94, + "grad_norm": 8.961036682128906, + "learning_rate": 7.060238548661923e-06, + "loss": 0.999, + "step": 6456 + }, + { + "epoch": 1.94, + "grad_norm": 19.617019653320312, + "learning_rate": 7.058233938057533e-06, + "loss": 1.9528, + "step": 6457 + }, + { + "epoch": 1.94, + "grad_norm": 30.003917694091797, + "learning_rate": 7.056229327453143e-06, + "loss": 1.5707, + "step": 6458 + }, + { + "epoch": 1.94, + "grad_norm": 16.13441276550293, + "learning_rate": 7.054224716848753e-06, + "loss": 2.1332, + "step": 6459 + }, + { + "epoch": 1.94, + "grad_norm": 11.956790924072266, + "learning_rate": 7.052220106244363e-06, + "loss": 1.4419, + "step": 6460 + }, + { + "epoch": 1.94, + "grad_norm": 17.357301712036133, + "learning_rate": 7.050215495639973e-06, + "loss": 1.1238, + "step": 6461 + }, + { + "epoch": 1.94, + "grad_norm": 16.655643463134766, + "learning_rate": 7.048210885035582e-06, + "loss": 2.1496, + "step": 6462 + }, + { + "epoch": 1.94, + "grad_norm": 17.952760696411133, + "learning_rate": 7.046206274431192e-06, + "loss": 1.5901, + "step": 6463 + }, + { + "epoch": 1.94, + "grad_norm": 20.259384155273438, + "learning_rate": 7.044201663826802e-06, + "loss": 1.2221, + "step": 6464 + }, + { + "epoch": 1.94, + "grad_norm": 25.712038040161133, + "learning_rate": 7.0421970532224125e-06, + "loss": 2.141, + "step": 6465 + }, + { + "epoch": 1.94, + "grad_norm": 17.299470901489258, + "learning_rate": 7.040192442618023e-06, + "loss": 0.9474, + "step": 6466 + }, + { + "epoch": 1.94, + "grad_norm": 18.08493423461914, + "learning_rate": 7.038187832013632e-06, + "loss": 1.711, + "step": 6467 + }, + { + "epoch": 1.94, + "grad_norm": 54.114418029785156, + "learning_rate": 7.036183221409242e-06, + "loss": 0.6773, + "step": 6468 + }, + { + "epoch": 1.94, + "grad_norm": 13.001557350158691, + "learning_rate": 7.034178610804851e-06, + "loss": 1.1428, + "step": 6469 + }, + { + "epoch": 1.95, + "grad_norm": 8.948216438293457, + "learning_rate": 7.032174000200461e-06, + "loss": 1.2129, + "step": 6470 + }, + { + "epoch": 1.95, + "grad_norm": 96.37139892578125, + "learning_rate": 7.030169389596072e-06, + "loss": 1.3139, + "step": 6471 + }, + { + "epoch": 1.95, + "grad_norm": 14.782844543457031, + "learning_rate": 7.0281647789916814e-06, + "loss": 1.4799, + "step": 6472 + }, + { + "epoch": 1.95, + "grad_norm": 14.976436614990234, + "learning_rate": 7.0261601683872915e-06, + "loss": 1.4801, + "step": 6473 + }, + { + "epoch": 1.95, + "grad_norm": 26.140769958496094, + "learning_rate": 7.024155557782902e-06, + "loss": 1.9828, + "step": 6474 + }, + { + "epoch": 1.95, + "grad_norm": 14.462868690490723, + "learning_rate": 7.022150947178511e-06, + "loss": 1.3346, + "step": 6475 + }, + { + "epoch": 1.95, + "grad_norm": 12.547101974487305, + "learning_rate": 7.020146336574121e-06, + "loss": 1.8225, + "step": 6476 + }, + { + "epoch": 1.95, + "grad_norm": 13.108290672302246, + "learning_rate": 7.01814172596973e-06, + "loss": 1.7796, + "step": 6477 + }, + { + "epoch": 1.95, + "grad_norm": 20.673439025878906, + "learning_rate": 7.016137115365341e-06, + "loss": 1.7685, + "step": 6478 + }, + { + "epoch": 1.95, + "grad_norm": 9.68248462677002, + "learning_rate": 7.014132504760951e-06, + "loss": 1.1545, + "step": 6479 + }, + { + "epoch": 1.95, + "grad_norm": 11.824162483215332, + "learning_rate": 7.01212789415656e-06, + "loss": 1.1257, + "step": 6480 + }, + { + "epoch": 1.95, + "eval_loss": 0.17668664455413818, + "eval_runtime": 43.6307, + "eval_samples_per_second": 33.898, + "eval_steps_per_second": 33.898, + "step": 6480 + }, + { + "epoch": 1.95, + "grad_norm": 19.132976531982422, + "learning_rate": 7.0101232835521705e-06, + "loss": 1.4466, + "step": 6481 + }, + { + "epoch": 1.95, + "grad_norm": 13.578184127807617, + "learning_rate": 7.00811867294778e-06, + "loss": 1.8283, + "step": 6482 + }, + { + "epoch": 1.95, + "grad_norm": 14.636981010437012, + "learning_rate": 7.00611406234339e-06, + "loss": 1.5467, + "step": 6483 + }, + { + "epoch": 1.95, + "grad_norm": 14.252835273742676, + "learning_rate": 7.004109451739001e-06, + "loss": 1.0799, + "step": 6484 + }, + { + "epoch": 1.95, + "grad_norm": 7.7530837059021, + "learning_rate": 7.002104841134611e-06, + "loss": 1.7159, + "step": 6485 + }, + { + "epoch": 1.95, + "grad_norm": 19.219745635986328, + "learning_rate": 7.00010023053022e-06, + "loss": 1.3202, + "step": 6486 + }, + { + "epoch": 1.95, + "grad_norm": 10.907150268554688, + "learning_rate": 6.99809561992583e-06, + "loss": 1.3228, + "step": 6487 + }, + { + "epoch": 1.95, + "grad_norm": 7.514638423919678, + "learning_rate": 6.996091009321439e-06, + "loss": 1.1397, + "step": 6488 + }, + { + "epoch": 1.95, + "grad_norm": 21.773658752441406, + "learning_rate": 6.9940863987170495e-06, + "loss": 0.9278, + "step": 6489 + }, + { + "epoch": 1.95, + "grad_norm": 10.506806373596191, + "learning_rate": 6.99208178811266e-06, + "loss": 1.709, + "step": 6490 + }, + { + "epoch": 1.95, + "grad_norm": 15.891654014587402, + "learning_rate": 6.99007717750827e-06, + "loss": 1.4618, + "step": 6491 + }, + { + "epoch": 1.95, + "grad_norm": 31.406612396240234, + "learning_rate": 6.98807256690388e-06, + "loss": 1.977, + "step": 6492 + }, + { + "epoch": 1.95, + "grad_norm": 20.738195419311523, + "learning_rate": 6.986067956299489e-06, + "loss": 1.4744, + "step": 6493 + }, + { + "epoch": 1.95, + "grad_norm": 17.87335968017578, + "learning_rate": 6.984063345695099e-06, + "loss": 1.8068, + "step": 6494 + }, + { + "epoch": 1.95, + "grad_norm": 13.646010398864746, + "learning_rate": 6.982058735090709e-06, + "loss": 0.9828, + "step": 6495 + }, + { + "epoch": 1.95, + "grad_norm": 12.331052780151367, + "learning_rate": 6.980054124486319e-06, + "loss": 1.6284, + "step": 6496 + }, + { + "epoch": 1.95, + "grad_norm": 52.41801452636719, + "learning_rate": 6.978049513881929e-06, + "loss": 0.973, + "step": 6497 + }, + { + "epoch": 1.95, + "grad_norm": 27.965364456176758, + "learning_rate": 6.976044903277539e-06, + "loss": 1.0861, + "step": 6498 + }, + { + "epoch": 1.95, + "grad_norm": 25.636484146118164, + "learning_rate": 6.974040292673149e-06, + "loss": 1.8763, + "step": 6499 + }, + { + "epoch": 1.95, + "grad_norm": 15.626718521118164, + "learning_rate": 6.972035682068759e-06, + "loss": 1.2224, + "step": 6500 + }, + { + "epoch": 1.95, + "grad_norm": 12.968523979187012, + "learning_rate": 6.970031071464368e-06, + "loss": 1.4936, + "step": 6501 + }, + { + "epoch": 1.95, + "grad_norm": 38.640384674072266, + "learning_rate": 6.968026460859979e-06, + "loss": 1.847, + "step": 6502 + }, + { + "epoch": 1.96, + "grad_norm": 35.005760192871094, + "learning_rate": 6.966021850255589e-06, + "loss": 1.8344, + "step": 6503 + }, + { + "epoch": 1.96, + "grad_norm": 59.90297317504883, + "learning_rate": 6.964017239651198e-06, + "loss": 1.5804, + "step": 6504 + }, + { + "epoch": 1.96, + "grad_norm": 8.664277076721191, + "learning_rate": 6.962012629046808e-06, + "loss": 1.1554, + "step": 6505 + }, + { + "epoch": 1.96, + "grad_norm": 15.182535171508789, + "learning_rate": 6.9600080184424175e-06, + "loss": 1.107, + "step": 6506 + }, + { + "epoch": 1.96, + "grad_norm": 25.827924728393555, + "learning_rate": 6.958003407838028e-06, + "loss": 1.1151, + "step": 6507 + }, + { + "epoch": 1.96, + "grad_norm": 9.57219409942627, + "learning_rate": 6.9559987972336385e-06, + "loss": 1.3565, + "step": 6508 + }, + { + "epoch": 1.96, + "grad_norm": 113.87895202636719, + "learning_rate": 6.953994186629248e-06, + "loss": 1.4153, + "step": 6509 + }, + { + "epoch": 1.96, + "grad_norm": 17.828977584838867, + "learning_rate": 6.951989576024858e-06, + "loss": 1.8861, + "step": 6510 + }, + { + "epoch": 1.96, + "grad_norm": 27.323060989379883, + "learning_rate": 6.949984965420468e-06, + "loss": 1.4276, + "step": 6511 + }, + { + "epoch": 1.96, + "grad_norm": 11.81863021850586, + "learning_rate": 6.947980354816077e-06, + "loss": 0.785, + "step": 6512 + }, + { + "epoch": 1.96, + "grad_norm": 10.938015937805176, + "learning_rate": 6.945975744211687e-06, + "loss": 1.096, + "step": 6513 + }, + { + "epoch": 1.96, + "grad_norm": 11.981101036071777, + "learning_rate": 6.9439711336072965e-06, + "loss": 1.3268, + "step": 6514 + }, + { + "epoch": 1.96, + "grad_norm": 19.58521842956543, + "learning_rate": 6.9419665230029074e-06, + "loss": 1.8049, + "step": 6515 + }, + { + "epoch": 1.96, + "grad_norm": 24.513118743896484, + "learning_rate": 6.9399619123985175e-06, + "loss": 1.3997, + "step": 6516 + }, + { + "epoch": 1.96, + "grad_norm": 13.00726318359375, + "learning_rate": 6.937957301794127e-06, + "loss": 2.1062, + "step": 6517 + }, + { + "epoch": 1.96, + "grad_norm": 16.61869239807129, + "learning_rate": 6.935952691189737e-06, + "loss": 1.4637, + "step": 6518 + }, + { + "epoch": 1.96, + "grad_norm": 18.2260684967041, + "learning_rate": 6.933948080585347e-06, + "loss": 1.3907, + "step": 6519 + }, + { + "epoch": 1.96, + "grad_norm": 27.4097957611084, + "learning_rate": 6.931943469980956e-06, + "loss": 2.0212, + "step": 6520 + }, + { + "epoch": 1.96, + "grad_norm": 13.984885215759277, + "learning_rate": 6.929938859376567e-06, + "loss": 0.8537, + "step": 6521 + }, + { + "epoch": 1.96, + "grad_norm": 29.465072631835938, + "learning_rate": 6.927934248772177e-06, + "loss": 2.082, + "step": 6522 + }, + { + "epoch": 1.96, + "grad_norm": 13.194611549377441, + "learning_rate": 6.925929638167786e-06, + "loss": 1.1381, + "step": 6523 + }, + { + "epoch": 1.96, + "grad_norm": 7.926638603210449, + "learning_rate": 6.9239250275633965e-06, + "loss": 0.9932, + "step": 6524 + }, + { + "epoch": 1.96, + "grad_norm": 16.14949607849121, + "learning_rate": 6.921920416959006e-06, + "loss": 1.4698, + "step": 6525 + }, + { + "epoch": 1.96, + "grad_norm": 33.72904968261719, + "learning_rate": 6.919915806354616e-06, + "loss": 3.2966, + "step": 6526 + }, + { + "epoch": 1.96, + "grad_norm": 8.642800331115723, + "learning_rate": 6.917911195750227e-06, + "loss": 1.1102, + "step": 6527 + }, + { + "epoch": 1.96, + "grad_norm": 12.295087814331055, + "learning_rate": 6.915906585145836e-06, + "loss": 0.9972, + "step": 6528 + }, + { + "epoch": 1.96, + "grad_norm": 24.457801818847656, + "learning_rate": 6.913901974541446e-06, + "loss": 1.4332, + "step": 6529 + }, + { + "epoch": 1.96, + "grad_norm": 14.026710510253906, + "learning_rate": 6.911897363937055e-06, + "loss": 1.8641, + "step": 6530 + }, + { + "epoch": 1.96, + "grad_norm": 12.17983341217041, + "learning_rate": 6.909892753332665e-06, + "loss": 1.4611, + "step": 6531 + }, + { + "epoch": 1.96, + "grad_norm": 12.283072471618652, + "learning_rate": 6.9078881427282755e-06, + "loss": 1.3947, + "step": 6532 + }, + { + "epoch": 1.96, + "grad_norm": 17.96526527404785, + "learning_rate": 6.9058835321238856e-06, + "loss": 1.7153, + "step": 6533 + }, + { + "epoch": 1.96, + "grad_norm": 24.099628448486328, + "learning_rate": 6.903878921519496e-06, + "loss": 1.2495, + "step": 6534 + }, + { + "epoch": 1.96, + "grad_norm": 16.107059478759766, + "learning_rate": 6.901874310915106e-06, + "loss": 1.4728, + "step": 6535 + }, + { + "epoch": 1.97, + "grad_norm": 15.214371681213379, + "learning_rate": 6.899869700310715e-06, + "loss": 1.1217, + "step": 6536 + }, + { + "epoch": 1.97, + "grad_norm": 27.67184829711914, + "learning_rate": 6.897865089706325e-06, + "loss": 1.4113, + "step": 6537 + }, + { + "epoch": 1.97, + "grad_norm": 20.205726623535156, + "learning_rate": 6.895860479101934e-06, + "loss": 0.8228, + "step": 6538 + }, + { + "epoch": 1.97, + "grad_norm": 12.976736068725586, + "learning_rate": 6.893855868497545e-06, + "loss": 1.0787, + "step": 6539 + }, + { + "epoch": 1.97, + "grad_norm": 141.27496337890625, + "learning_rate": 6.891851257893155e-06, + "loss": 2.659, + "step": 6540 + }, + { + "epoch": 1.97, + "grad_norm": 25.662107467651367, + "learning_rate": 6.8898466472887645e-06, + "loss": 2.3812, + "step": 6541 + }, + { + "epoch": 1.97, + "grad_norm": 17.610265731811523, + "learning_rate": 6.887842036684375e-06, + "loss": 1.1345, + "step": 6542 + }, + { + "epoch": 1.97, + "grad_norm": 13.565105438232422, + "learning_rate": 6.885837426079985e-06, + "loss": 1.7477, + "step": 6543 + }, + { + "epoch": 1.97, + "grad_norm": 72.72836303710938, + "learning_rate": 6.883832815475594e-06, + "loss": 1.0895, + "step": 6544 + }, + { + "epoch": 1.97, + "grad_norm": 11.967425346374512, + "learning_rate": 6.881828204871205e-06, + "loss": 1.2826, + "step": 6545 + }, + { + "epoch": 1.97, + "grad_norm": 27.99849510192871, + "learning_rate": 6.879823594266815e-06, + "loss": 1.4163, + "step": 6546 + }, + { + "epoch": 1.97, + "grad_norm": 17.99080467224121, + "learning_rate": 6.877818983662424e-06, + "loss": 0.7908, + "step": 6547 + }, + { + "epoch": 1.97, + "grad_norm": 8.94594955444336, + "learning_rate": 6.875814373058034e-06, + "loss": 1.5791, + "step": 6548 + }, + { + "epoch": 1.97, + "grad_norm": 18.120492935180664, + "learning_rate": 6.8738097624536435e-06, + "loss": 1.5981, + "step": 6549 + }, + { + "epoch": 1.97, + "grad_norm": 11.887974739074707, + "learning_rate": 6.871805151849254e-06, + "loss": 2.1877, + "step": 6550 + }, + { + "epoch": 1.97, + "grad_norm": 14.82646369934082, + "learning_rate": 6.8698005412448645e-06, + "loss": 1.2886, + "step": 6551 + }, + { + "epoch": 1.97, + "grad_norm": 15.733491897583008, + "learning_rate": 6.867795930640474e-06, + "loss": 0.836, + "step": 6552 + }, + { + "epoch": 1.97, + "grad_norm": 20.574148178100586, + "learning_rate": 6.865791320036084e-06, + "loss": 0.9531, + "step": 6553 + }, + { + "epoch": 1.97, + "grad_norm": 28.415647506713867, + "learning_rate": 6.863786709431693e-06, + "loss": 2.1882, + "step": 6554 + }, + { + "epoch": 1.97, + "grad_norm": 17.647674560546875, + "learning_rate": 6.861782098827303e-06, + "loss": 2.3014, + "step": 6555 + }, + { + "epoch": 1.97, + "grad_norm": 49.27579879760742, + "learning_rate": 6.859777488222913e-06, + "loss": 1.5323, + "step": 6556 + }, + { + "epoch": 1.97, + "grad_norm": 19.76232147216797, + "learning_rate": 6.8577728776185225e-06, + "loss": 1.6397, + "step": 6557 + }, + { + "epoch": 1.97, + "grad_norm": 11.850191116333008, + "learning_rate": 6.8557682670141334e-06, + "loss": 1.027, + "step": 6558 + }, + { + "epoch": 1.97, + "grad_norm": 26.760108947753906, + "learning_rate": 6.8537636564097435e-06, + "loss": 1.7687, + "step": 6559 + }, + { + "epoch": 1.97, + "grad_norm": 11.1150484085083, + "learning_rate": 6.851759045805353e-06, + "loss": 1.418, + "step": 6560 + }, + { + "epoch": 1.97, + "grad_norm": 23.722135543823242, + "learning_rate": 6.849754435200963e-06, + "loss": 1.0756, + "step": 6561 + }, + { + "epoch": 1.97, + "grad_norm": 17.482393264770508, + "learning_rate": 6.847749824596572e-06, + "loss": 1.2066, + "step": 6562 + }, + { + "epoch": 1.97, + "grad_norm": 17.04688835144043, + "learning_rate": 6.845745213992182e-06, + "loss": 1.4078, + "step": 6563 + }, + { + "epoch": 1.97, + "grad_norm": 15.257699012756348, + "learning_rate": 6.843740603387793e-06, + "loss": 1.8723, + "step": 6564 + }, + { + "epoch": 1.97, + "grad_norm": 13.324009895324707, + "learning_rate": 6.841735992783402e-06, + "loss": 1.3293, + "step": 6565 + }, + { + "epoch": 1.97, + "grad_norm": 18.284709930419922, + "learning_rate": 6.839731382179012e-06, + "loss": 1.8599, + "step": 6566 + }, + { + "epoch": 1.97, + "grad_norm": 13.653274536132812, + "learning_rate": 6.837726771574622e-06, + "loss": 1.7576, + "step": 6567 + }, + { + "epoch": 1.97, + "grad_norm": 38.0052490234375, + "learning_rate": 6.835722160970232e-06, + "loss": 1.656, + "step": 6568 + }, + { + "epoch": 1.98, + "grad_norm": 15.570902824401855, + "learning_rate": 6.833717550365842e-06, + "loss": 1.3572, + "step": 6569 + }, + { + "epoch": 1.98, + "grad_norm": 10.287487030029297, + "learning_rate": 6.831712939761453e-06, + "loss": 1.0965, + "step": 6570 + }, + { + "epoch": 1.98, + "grad_norm": 21.403841018676758, + "learning_rate": 6.829708329157062e-06, + "loss": 1.657, + "step": 6571 + }, + { + "epoch": 1.98, + "grad_norm": 11.690693855285645, + "learning_rate": 6.827703718552672e-06, + "loss": 1.9188, + "step": 6572 + }, + { + "epoch": 1.98, + "grad_norm": 15.413796424865723, + "learning_rate": 6.825699107948281e-06, + "loss": 1.1102, + "step": 6573 + }, + { + "epoch": 1.98, + "grad_norm": 14.91450023651123, + "learning_rate": 6.823694497343891e-06, + "loss": 1.0896, + "step": 6574 + }, + { + "epoch": 1.98, + "grad_norm": 12.081074714660645, + "learning_rate": 6.821689886739501e-06, + "loss": 1.7378, + "step": 6575 + }, + { + "epoch": 1.98, + "grad_norm": 16.04603385925293, + "learning_rate": 6.8196852761351116e-06, + "loss": 1.4417, + "step": 6576 + }, + { + "epoch": 1.98, + "grad_norm": 22.271512985229492, + "learning_rate": 6.817680665530722e-06, + "loss": 1.7555, + "step": 6577 + }, + { + "epoch": 1.98, + "grad_norm": 13.81209945678711, + "learning_rate": 6.815676054926331e-06, + "loss": 1.1311, + "step": 6578 + }, + { + "epoch": 1.98, + "grad_norm": 16.152433395385742, + "learning_rate": 6.813671444321941e-06, + "loss": 1.1026, + "step": 6579 + }, + { + "epoch": 1.98, + "grad_norm": 12.994856834411621, + "learning_rate": 6.811666833717551e-06, + "loss": 1.1081, + "step": 6580 + }, + { + "epoch": 1.98, + "grad_norm": 12.3396577835083, + "learning_rate": 6.80966222311316e-06, + "loss": 1.3073, + "step": 6581 + }, + { + "epoch": 1.98, + "grad_norm": 8.903776168823242, + "learning_rate": 6.807657612508771e-06, + "loss": 0.9238, + "step": 6582 + }, + { + "epoch": 1.98, + "grad_norm": 15.200712203979492, + "learning_rate": 6.805653001904381e-06, + "loss": 1.3466, + "step": 6583 + }, + { + "epoch": 1.98, + "grad_norm": 19.97858428955078, + "learning_rate": 6.8036483912999905e-06, + "loss": 1.7572, + "step": 6584 + }, + { + "epoch": 1.98, + "grad_norm": 24.35478401184082, + "learning_rate": 6.801643780695601e-06, + "loss": 1.929, + "step": 6585 + }, + { + "epoch": 1.98, + "grad_norm": 30.171939849853516, + "learning_rate": 6.79963917009121e-06, + "loss": 1.5492, + "step": 6586 + }, + { + "epoch": 1.98, + "grad_norm": 12.768831253051758, + "learning_rate": 6.79763455948682e-06, + "loss": 1.0463, + "step": 6587 + }, + { + "epoch": 1.98, + "grad_norm": 21.945470809936523, + "learning_rate": 6.795629948882431e-06, + "loss": 2.2842, + "step": 6588 + }, + { + "epoch": 1.98, + "grad_norm": 22.516357421875, + "learning_rate": 6.79362533827804e-06, + "loss": 2.0211, + "step": 6589 + }, + { + "epoch": 1.98, + "grad_norm": 8.880302429199219, + "learning_rate": 6.79162072767365e-06, + "loss": 1.2581, + "step": 6590 + }, + { + "epoch": 1.98, + "grad_norm": 9.477561950683594, + "learning_rate": 6.7896161170692594e-06, + "loss": 0.7168, + "step": 6591 + }, + { + "epoch": 1.98, + "grad_norm": 21.51420783996582, + "learning_rate": 6.7876115064648695e-06, + "loss": 1.3912, + "step": 6592 + }, + { + "epoch": 1.98, + "grad_norm": 50.1191291809082, + "learning_rate": 6.78560689586048e-06, + "loss": 3.0844, + "step": 6593 + }, + { + "epoch": 1.98, + "grad_norm": 19.533275604248047, + "learning_rate": 6.78360228525609e-06, + "loss": 2.1365, + "step": 6594 + }, + { + "epoch": 1.98, + "grad_norm": 11.014450073242188, + "learning_rate": 6.7815976746517e-06, + "loss": 0.7238, + "step": 6595 + }, + { + "epoch": 1.98, + "grad_norm": 35.398197174072266, + "learning_rate": 6.77959306404731e-06, + "loss": 2.9231, + "step": 6596 + }, + { + "epoch": 1.98, + "grad_norm": 59.179141998291016, + "learning_rate": 6.777588453442919e-06, + "loss": 1.4903, + "step": 6597 + }, + { + "epoch": 1.98, + "grad_norm": 26.32767677307129, + "learning_rate": 6.775583842838529e-06, + "loss": 1.0562, + "step": 6598 + }, + { + "epoch": 1.98, + "grad_norm": 14.91978931427002, + "learning_rate": 6.773579232234138e-06, + "loss": 1.8945, + "step": 6599 + }, + { + "epoch": 1.98, + "grad_norm": 43.29436111450195, + "learning_rate": 6.7715746216297485e-06, + "loss": 2.4119, + "step": 6600 + }, + { + "epoch": 1.98, + "eval_loss": 0.17498844861984253, + "eval_runtime": 43.9202, + "eval_samples_per_second": 33.675, + "eval_steps_per_second": 33.675, + "step": 6600 + }, + { + "epoch": 1.98, + "grad_norm": 20.57763671875, + "learning_rate": 6.7695700110253594e-06, + "loss": 1.2593, + "step": 6601 + }, + { + "epoch": 1.98, + "grad_norm": 11.902490615844727, + "learning_rate": 6.767565400420969e-06, + "loss": 0.8341, + "step": 6602 + }, + { + "epoch": 1.99, + "grad_norm": 11.146263122558594, + "learning_rate": 6.765560789816579e-06, + "loss": 1.3651, + "step": 6603 + }, + { + "epoch": 1.99, + "grad_norm": 11.864297866821289, + "learning_rate": 6.763556179212189e-06, + "loss": 0.9559, + "step": 6604 + }, + { + "epoch": 1.99, + "grad_norm": 12.255757331848145, + "learning_rate": 6.761551568607798e-06, + "loss": 1.1177, + "step": 6605 + }, + { + "epoch": 1.99, + "grad_norm": 6.7448506355285645, + "learning_rate": 6.759546958003408e-06, + "loss": 0.6869, + "step": 6606 + }, + { + "epoch": 1.99, + "grad_norm": 15.666464805603027, + "learning_rate": 6.757542347399019e-06, + "loss": 1.7579, + "step": 6607 + }, + { + "epoch": 1.99, + "grad_norm": 16.473482131958008, + "learning_rate": 6.755537736794628e-06, + "loss": 0.9031, + "step": 6608 + }, + { + "epoch": 1.99, + "grad_norm": 12.140952110290527, + "learning_rate": 6.753533126190238e-06, + "loss": 0.7175, + "step": 6609 + }, + { + "epoch": 1.99, + "grad_norm": 9.83596134185791, + "learning_rate": 6.751528515585848e-06, + "loss": 0.8216, + "step": 6610 + }, + { + "epoch": 1.99, + "grad_norm": 13.17597770690918, + "learning_rate": 6.749523904981458e-06, + "loss": 1.694, + "step": 6611 + }, + { + "epoch": 1.99, + "grad_norm": 17.648849487304688, + "learning_rate": 6.747519294377067e-06, + "loss": 2.1423, + "step": 6612 + }, + { + "epoch": 1.99, + "grad_norm": 13.655058860778809, + "learning_rate": 6.745514683772678e-06, + "loss": 0.9178, + "step": 6613 + }, + { + "epoch": 1.99, + "grad_norm": 18.347841262817383, + "learning_rate": 6.743510073168288e-06, + "loss": 1.0027, + "step": 6614 + }, + { + "epoch": 1.99, + "grad_norm": 26.16404914855957, + "learning_rate": 6.741505462563897e-06, + "loss": 2.206, + "step": 6615 + }, + { + "epoch": 1.99, + "grad_norm": 7.279219150543213, + "learning_rate": 6.739500851959507e-06, + "loss": 0.7718, + "step": 6616 + }, + { + "epoch": 1.99, + "grad_norm": 12.55292797088623, + "learning_rate": 6.737496241355117e-06, + "loss": 1.8409, + "step": 6617 + }, + { + "epoch": 1.99, + "grad_norm": 42.25106430053711, + "learning_rate": 6.735491630750727e-06, + "loss": 2.3152, + "step": 6618 + }, + { + "epoch": 1.99, + "grad_norm": 14.544368743896484, + "learning_rate": 6.7334870201463376e-06, + "loss": 1.2898, + "step": 6619 + }, + { + "epoch": 1.99, + "grad_norm": 21.00879669189453, + "learning_rate": 6.731482409541948e-06, + "loss": 1.4371, + "step": 6620 + }, + { + "epoch": 1.99, + "grad_norm": 15.112432479858398, + "learning_rate": 6.729477798937557e-06, + "loss": 2.2847, + "step": 6621 + }, + { + "epoch": 1.99, + "grad_norm": 16.390920639038086, + "learning_rate": 6.727473188333167e-06, + "loss": 1.1366, + "step": 6622 + }, + { + "epoch": 1.99, + "grad_norm": 17.105491638183594, + "learning_rate": 6.725468577728776e-06, + "loss": 1.7725, + "step": 6623 + }, + { + "epoch": 1.99, + "grad_norm": 17.488632202148438, + "learning_rate": 6.723463967124386e-06, + "loss": 0.5385, + "step": 6624 + }, + { + "epoch": 1.99, + "grad_norm": 29.78978157043457, + "learning_rate": 6.721459356519997e-06, + "loss": 2.3833, + "step": 6625 + }, + { + "epoch": 1.99, + "grad_norm": 15.036394119262695, + "learning_rate": 6.7194547459156065e-06, + "loss": 1.0212, + "step": 6626 + }, + { + "epoch": 1.99, + "grad_norm": 17.2657527923584, + "learning_rate": 6.7174501353112165e-06, + "loss": 1.4752, + "step": 6627 + }, + { + "epoch": 1.99, + "grad_norm": 27.82944679260254, + "learning_rate": 6.715445524706826e-06, + "loss": 1.741, + "step": 6628 + }, + { + "epoch": 1.99, + "grad_norm": 14.460271835327148, + "learning_rate": 6.713440914102436e-06, + "loss": 1.6737, + "step": 6629 + }, + { + "epoch": 1.99, + "grad_norm": 30.571147918701172, + "learning_rate": 6.711436303498046e-06, + "loss": 2.2318, + "step": 6630 + }, + { + "epoch": 1.99, + "grad_norm": 18.02724838256836, + "learning_rate": 6.709431692893657e-06, + "loss": 1.0354, + "step": 6631 + }, + { + "epoch": 1.99, + "grad_norm": 12.847180366516113, + "learning_rate": 6.707427082289266e-06, + "loss": 1.5948, + "step": 6632 + }, + { + "epoch": 1.99, + "grad_norm": 13.379613876342773, + "learning_rate": 6.705422471684876e-06, + "loss": 1.2021, + "step": 6633 + }, + { + "epoch": 1.99, + "grad_norm": 20.53904914855957, + "learning_rate": 6.7034178610804854e-06, + "loss": 1.9106, + "step": 6634 + }, + { + "epoch": 1.99, + "grad_norm": 23.03459930419922, + "learning_rate": 6.7014132504760955e-06, + "loss": 1.785, + "step": 6635 + }, + { + "epoch": 2.0, + "grad_norm": 19.824228286743164, + "learning_rate": 6.699408639871705e-06, + "loss": 0.8569, + "step": 6636 + }, + { + "epoch": 2.0, + "grad_norm": 59.54680633544922, + "learning_rate": 6.697404029267316e-06, + "loss": 2.1475, + "step": 6637 + }, + { + "epoch": 2.0, + "grad_norm": 10.229207992553711, + "learning_rate": 6.695399418662926e-06, + "loss": 1.1441, + "step": 6638 + }, + { + "epoch": 2.0, + "grad_norm": 28.467979431152344, + "learning_rate": 6.693394808058535e-06, + "loss": 1.7708, + "step": 6639 + }, + { + "epoch": 2.0, + "grad_norm": 10.531692504882812, + "learning_rate": 6.691390197454145e-06, + "loss": 0.5444, + "step": 6640 + }, + { + "epoch": 2.0, + "grad_norm": 13.020025253295898, + "learning_rate": 6.689385586849755e-06, + "loss": 0.8387, + "step": 6641 + }, + { + "epoch": 2.0, + "grad_norm": 12.514997482299805, + "learning_rate": 6.687380976245364e-06, + "loss": 1.3939, + "step": 6642 + }, + { + "epoch": 2.0, + "grad_norm": 27.652118682861328, + "learning_rate": 6.6853763656409745e-06, + "loss": 3.0772, + "step": 6643 + }, + { + "epoch": 2.0, + "grad_norm": 24.15280532836914, + "learning_rate": 6.6833717550365854e-06, + "loss": 1.5129, + "step": 6644 + }, + { + "epoch": 2.0, + "grad_norm": 13.905855178833008, + "learning_rate": 6.681367144432195e-06, + "loss": 0.9144, + "step": 6645 + }, + { + "epoch": 2.0, + "grad_norm": 16.91253662109375, + "learning_rate": 6.679362533827805e-06, + "loss": 1.5355, + "step": 6646 + }, + { + "epoch": 2.0, + "grad_norm": 11.888856887817383, + "learning_rate": 6.677357923223414e-06, + "loss": 1.1719, + "step": 6647 + }, + { + "epoch": 2.0, + "grad_norm": 14.441067695617676, + "learning_rate": 6.675353312619024e-06, + "loss": 1.3663, + "step": 6648 + }, + { + "epoch": 2.0, + "grad_norm": 13.287384033203125, + "learning_rate": 6.673348702014633e-06, + "loss": 0.5206, + "step": 6649 + }, + { + "epoch": 2.0, + "grad_norm": 12.121244430541992, + "learning_rate": 6.671344091410244e-06, + "loss": 1.0182, + "step": 6650 + }, + { + "epoch": 2.0, + "grad_norm": 44.00876998901367, + "learning_rate": 6.669339480805854e-06, + "loss": 0.9455, + "step": 6651 + }, + { + "epoch": 2.0, + "grad_norm": 15.983967781066895, + "learning_rate": 6.6673348702014636e-06, + "loss": 0.8615, + "step": 6652 + }, + { + "epoch": 2.0, + "grad_norm": 13.022303581237793, + "learning_rate": 6.665330259597074e-06, + "loss": 1.3803, + "step": 6653 + }, + { + "epoch": 2.0, + "grad_norm": 57.43462371826172, + "learning_rate": 6.663325648992684e-06, + "loss": 1.9661, + "step": 6654 + }, + { + "epoch": 2.0, + "grad_norm": 13.801424026489258, + "learning_rate": 6.661321038388293e-06, + "loss": 1.6653, + "step": 6655 + }, + { + "epoch": 2.0, + "grad_norm": 14.12394905090332, + "learning_rate": 6.659316427783904e-06, + "loss": 1.0337, + "step": 6656 + }, + { + "epoch": 2.0, + "grad_norm": 16.065610885620117, + "learning_rate": 6.657311817179514e-06, + "loss": 1.6578, + "step": 6657 + }, + { + "epoch": 2.0, + "grad_norm": 23.195810317993164, + "learning_rate": 6.655307206575123e-06, + "loss": 2.0227, + "step": 6658 + }, + { + "epoch": 2.0, + "grad_norm": 39.60324478149414, + "learning_rate": 6.653302595970733e-06, + "loss": 2.9616, + "step": 6659 + }, + { + "epoch": 2.0, + "grad_norm": 12.803409576416016, + "learning_rate": 6.6512979853663425e-06, + "loss": 1.3529, + "step": 6660 + }, + { + "epoch": 2.0, + "grad_norm": 27.853031158447266, + "learning_rate": 6.649293374761953e-06, + "loss": 1.2641, + "step": 6661 + }, + { + "epoch": 2.0, + "grad_norm": 12.550612449645996, + "learning_rate": 6.6472887641575636e-06, + "loss": 1.032, + "step": 6662 + }, + { + "epoch": 2.0, + "grad_norm": 14.057964324951172, + "learning_rate": 6.645284153553173e-06, + "loss": 1.4831, + "step": 6663 + }, + { + "epoch": 2.0, + "grad_norm": 13.735244750976562, + "learning_rate": 6.643279542948783e-06, + "loss": 1.842, + "step": 6664 + }, + { + "epoch": 2.0, + "grad_norm": 10.447699546813965, + "learning_rate": 6.641274932344393e-06, + "loss": 1.1719, + "step": 6665 + }, + { + "epoch": 2.0, + "grad_norm": 24.760276794433594, + "learning_rate": 6.639270321740002e-06, + "loss": 1.0642, + "step": 6666 + }, + { + "epoch": 2.0, + "grad_norm": 17.945220947265625, + "learning_rate": 6.637265711135612e-06, + "loss": 2.1282, + "step": 6667 + }, + { + "epoch": 2.0, + "grad_norm": 40.95531463623047, + "learning_rate": 6.635261100531223e-06, + "loss": 1.8811, + "step": 6668 + }, + { + "epoch": 2.01, + "grad_norm": 13.15553092956543, + "learning_rate": 6.6332564899268325e-06, + "loss": 1.134, + "step": 6669 + }, + { + "epoch": 2.01, + "grad_norm": 20.673992156982422, + "learning_rate": 6.6312518793224425e-06, + "loss": 1.2125, + "step": 6670 + }, + { + "epoch": 2.01, + "grad_norm": 17.39179801940918, + "learning_rate": 6.629247268718052e-06, + "loss": 1.7538, + "step": 6671 + }, + { + "epoch": 2.01, + "grad_norm": 8.030019760131836, + "learning_rate": 6.627242658113662e-06, + "loss": 1.5265, + "step": 6672 + }, + { + "epoch": 2.01, + "grad_norm": 23.085996627807617, + "learning_rate": 6.625238047509271e-06, + "loss": 1.5642, + "step": 6673 + }, + { + "epoch": 2.01, + "grad_norm": 36.12226104736328, + "learning_rate": 6.623233436904882e-06, + "loss": 2.836, + "step": 6674 + }, + { + "epoch": 2.01, + "grad_norm": 16.673858642578125, + "learning_rate": 6.621228826300492e-06, + "loss": 1.0128, + "step": 6675 + }, + { + "epoch": 2.01, + "grad_norm": 18.04882049560547, + "learning_rate": 6.619224215696101e-06, + "loss": 1.2667, + "step": 6676 + }, + { + "epoch": 2.01, + "grad_norm": 14.236947059631348, + "learning_rate": 6.6172196050917114e-06, + "loss": 1.0438, + "step": 6677 + }, + { + "epoch": 2.01, + "grad_norm": 64.36849212646484, + "learning_rate": 6.6152149944873215e-06, + "loss": 2.7021, + "step": 6678 + }, + { + "epoch": 2.01, + "grad_norm": 23.676084518432617, + "learning_rate": 6.613210383882931e-06, + "loss": 1.0589, + "step": 6679 + }, + { + "epoch": 2.01, + "grad_norm": 19.20448875427246, + "learning_rate": 6.611205773278542e-06, + "loss": 1.8695, + "step": 6680 + }, + { + "epoch": 2.01, + "grad_norm": 10.511244773864746, + "learning_rate": 6.609201162674152e-06, + "loss": 1.1752, + "step": 6681 + }, + { + "epoch": 2.01, + "grad_norm": 40.56865692138672, + "learning_rate": 6.607196552069761e-06, + "loss": 1.6004, + "step": 6682 + }, + { + "epoch": 2.01, + "grad_norm": 17.472505569458008, + "learning_rate": 6.605191941465371e-06, + "loss": 1.5864, + "step": 6683 + }, + { + "epoch": 2.01, + "grad_norm": 10.834254264831543, + "learning_rate": 6.60318733086098e-06, + "loss": 1.2419, + "step": 6684 + }, + { + "epoch": 2.01, + "grad_norm": 16.969730377197266, + "learning_rate": 6.60118272025659e-06, + "loss": 1.2284, + "step": 6685 + }, + { + "epoch": 2.01, + "grad_norm": 7.831116199493408, + "learning_rate": 6.5991781096522e-06, + "loss": 1.4888, + "step": 6686 + }, + { + "epoch": 2.01, + "grad_norm": 24.744110107421875, + "learning_rate": 6.597173499047811e-06, + "loss": 1.6571, + "step": 6687 + }, + { + "epoch": 2.01, + "grad_norm": 21.574138641357422, + "learning_rate": 6.595168888443421e-06, + "loss": 2.0631, + "step": 6688 + }, + { + "epoch": 2.01, + "grad_norm": 11.040058135986328, + "learning_rate": 6.593164277839031e-06, + "loss": 1.1278, + "step": 6689 + }, + { + "epoch": 2.01, + "grad_norm": 46.7620735168457, + "learning_rate": 6.59115966723464e-06, + "loss": 1.4132, + "step": 6690 + }, + { + "epoch": 2.01, + "grad_norm": 10.559418678283691, + "learning_rate": 6.58915505663025e-06, + "loss": 0.9851, + "step": 6691 + }, + { + "epoch": 2.01, + "grad_norm": 8.289081573486328, + "learning_rate": 6.587150446025859e-06, + "loss": 1.1853, + "step": 6692 + }, + { + "epoch": 2.01, + "grad_norm": 8.112937927246094, + "learning_rate": 6.58514583542147e-06, + "loss": 1.0523, + "step": 6693 + }, + { + "epoch": 2.01, + "grad_norm": 19.774045944213867, + "learning_rate": 6.58314122481708e-06, + "loss": 1.6196, + "step": 6694 + }, + { + "epoch": 2.01, + "grad_norm": 52.585914611816406, + "learning_rate": 6.5811366142126896e-06, + "loss": 1.0791, + "step": 6695 + }, + { + "epoch": 2.01, + "grad_norm": 10.37691879272461, + "learning_rate": 6.5791320036083e-06, + "loss": 0.525, + "step": 6696 + }, + { + "epoch": 2.01, + "grad_norm": 13.222021102905273, + "learning_rate": 6.577127393003909e-06, + "loss": 1.0604, + "step": 6697 + }, + { + "epoch": 2.01, + "grad_norm": 12.805461883544922, + "learning_rate": 6.575122782399519e-06, + "loss": 1.4706, + "step": 6698 + }, + { + "epoch": 2.01, + "grad_norm": 43.80728530883789, + "learning_rate": 6.57311817179513e-06, + "loss": 2.084, + "step": 6699 + }, + { + "epoch": 2.01, + "grad_norm": 15.121885299682617, + "learning_rate": 6.571113561190739e-06, + "loss": 0.8853, + "step": 6700 + }, + { + "epoch": 2.01, + "grad_norm": 32.659427642822266, + "learning_rate": 6.569108950586349e-06, + "loss": 1.8597, + "step": 6701 + }, + { + "epoch": 2.02, + "grad_norm": 26.797901153564453, + "learning_rate": 6.567104339981959e-06, + "loss": 2.3284, + "step": 6702 + }, + { + "epoch": 2.02, + "grad_norm": 3.8142130374908447, + "learning_rate": 6.5650997293775685e-06, + "loss": 0.2054, + "step": 6703 + }, + { + "epoch": 2.02, + "grad_norm": 15.68226432800293, + "learning_rate": 6.563095118773179e-06, + "loss": 2.487, + "step": 6704 + }, + { + "epoch": 2.02, + "grad_norm": 42.038726806640625, + "learning_rate": 6.5610905081687896e-06, + "loss": 2.1623, + "step": 6705 + }, + { + "epoch": 2.02, + "grad_norm": 19.80816650390625, + "learning_rate": 6.559085897564399e-06, + "loss": 1.0708, + "step": 6706 + }, + { + "epoch": 2.02, + "grad_norm": 9.321516990661621, + "learning_rate": 6.557081286960009e-06, + "loss": 1.605, + "step": 6707 + }, + { + "epoch": 2.02, + "grad_norm": 159.25819396972656, + "learning_rate": 6.555076676355618e-06, + "loss": 2.8398, + "step": 6708 + }, + { + "epoch": 2.02, + "grad_norm": 17.203962326049805, + "learning_rate": 6.553072065751228e-06, + "loss": 1.4595, + "step": 6709 + }, + { + "epoch": 2.02, + "grad_norm": 17.149423599243164, + "learning_rate": 6.5510674551468374e-06, + "loss": 1.7006, + "step": 6710 + }, + { + "epoch": 2.02, + "grad_norm": 12.44357967376709, + "learning_rate": 6.549062844542448e-06, + "loss": 0.8718, + "step": 6711 + }, + { + "epoch": 2.02, + "grad_norm": 9.552597999572754, + "learning_rate": 6.5470582339380585e-06, + "loss": 1.4398, + "step": 6712 + }, + { + "epoch": 2.02, + "grad_norm": 12.471324920654297, + "learning_rate": 6.545053623333668e-06, + "loss": 2.0504, + "step": 6713 + }, + { + "epoch": 2.02, + "grad_norm": 13.623446464538574, + "learning_rate": 6.543049012729278e-06, + "loss": 1.7889, + "step": 6714 + }, + { + "epoch": 2.02, + "grad_norm": 8.370439529418945, + "learning_rate": 6.541044402124888e-06, + "loss": 1.1749, + "step": 6715 + }, + { + "epoch": 2.02, + "grad_norm": 16.073829650878906, + "learning_rate": 6.539039791520497e-06, + "loss": 1.3742, + "step": 6716 + }, + { + "epoch": 2.02, + "grad_norm": 9.64084529876709, + "learning_rate": 6.537035180916108e-06, + "loss": 1.142, + "step": 6717 + }, + { + "epoch": 2.02, + "grad_norm": 25.97657585144043, + "learning_rate": 6.535030570311718e-06, + "loss": 2.3333, + "step": 6718 + }, + { + "epoch": 2.02, + "grad_norm": 118.77561950683594, + "learning_rate": 6.533025959707327e-06, + "loss": 1.5912, + "step": 6719 + }, + { + "epoch": 2.02, + "grad_norm": 14.903034210205078, + "learning_rate": 6.5310213491029374e-06, + "loss": 1.897, + "step": 6720 + }, + { + "epoch": 2.02, + "eval_loss": 0.16902218759059906, + "eval_runtime": 43.4322, + "eval_samples_per_second": 34.053, + "eval_steps_per_second": 34.053, + "step": 6720 + }, + { + "epoch": 2.02, + "grad_norm": 39.25941848754883, + "learning_rate": 6.529016738498547e-06, + "loss": 1.8256, + "step": 6721 + }, + { + "epoch": 2.02, + "grad_norm": 9.439811706542969, + "learning_rate": 6.527012127894157e-06, + "loss": 0.9625, + "step": 6722 + }, + { + "epoch": 2.02, + "grad_norm": 12.62756061553955, + "learning_rate": 6.525007517289767e-06, + "loss": 0.8624, + "step": 6723 + }, + { + "epoch": 2.02, + "grad_norm": 13.67402458190918, + "learning_rate": 6.523002906685377e-06, + "loss": 0.785, + "step": 6724 + }, + { + "epoch": 2.02, + "grad_norm": 29.54031753540039, + "learning_rate": 6.520998296080987e-06, + "loss": 2.0936, + "step": 6725 + }, + { + "epoch": 2.02, + "grad_norm": 13.225194931030273, + "learning_rate": 6.518993685476597e-06, + "loss": 1.2107, + "step": 6726 + }, + { + "epoch": 2.02, + "grad_norm": 8.916555404663086, + "learning_rate": 6.516989074872206e-06, + "loss": 1.0452, + "step": 6727 + }, + { + "epoch": 2.02, + "grad_norm": 24.967422485351562, + "learning_rate": 6.514984464267816e-06, + "loss": 2.1105, + "step": 6728 + }, + { + "epoch": 2.02, + "grad_norm": 14.154083251953125, + "learning_rate": 6.512979853663426e-06, + "loss": 0.7428, + "step": 6729 + }, + { + "epoch": 2.02, + "grad_norm": 17.682830810546875, + "learning_rate": 6.510975243059037e-06, + "loss": 1.3675, + "step": 6730 + }, + { + "epoch": 2.02, + "grad_norm": 12.129364013671875, + "learning_rate": 6.508970632454647e-06, + "loss": 1.7799, + "step": 6731 + }, + { + "epoch": 2.02, + "grad_norm": 37.760650634765625, + "learning_rate": 6.506966021850256e-06, + "loss": 1.1513, + "step": 6732 + }, + { + "epoch": 2.02, + "grad_norm": 87.66400909423828, + "learning_rate": 6.504961411245866e-06, + "loss": 1.9592, + "step": 6733 + }, + { + "epoch": 2.02, + "grad_norm": 12.75828742980957, + "learning_rate": 6.502956800641475e-06, + "loss": 1.1079, + "step": 6734 + }, + { + "epoch": 2.02, + "grad_norm": 12.436376571655273, + "learning_rate": 6.500952190037085e-06, + "loss": 0.8629, + "step": 6735 + }, + { + "epoch": 2.03, + "grad_norm": 8.984572410583496, + "learning_rate": 6.498947579432696e-06, + "loss": 0.9553, + "step": 6736 + }, + { + "epoch": 2.03, + "grad_norm": 22.55635643005371, + "learning_rate": 6.4969429688283055e-06, + "loss": 1.6062, + "step": 6737 + }, + { + "epoch": 2.03, + "grad_norm": 13.196118354797363, + "learning_rate": 6.4949383582239156e-06, + "loss": 0.5885, + "step": 6738 + }, + { + "epoch": 2.03, + "grad_norm": 46.98003387451172, + "learning_rate": 6.492933747619526e-06, + "loss": 1.3093, + "step": 6739 + }, + { + "epoch": 2.03, + "grad_norm": 23.900150299072266, + "learning_rate": 6.490929137015135e-06, + "loss": 1.4142, + "step": 6740 + }, + { + "epoch": 2.03, + "grad_norm": 17.741270065307617, + "learning_rate": 6.488924526410745e-06, + "loss": 1.4268, + "step": 6741 + }, + { + "epoch": 2.03, + "grad_norm": 26.587739944458008, + "learning_rate": 6.486919915806356e-06, + "loss": 2.1686, + "step": 6742 + }, + { + "epoch": 2.03, + "grad_norm": 35.67372131347656, + "learning_rate": 6.484915305201965e-06, + "loss": 1.6213, + "step": 6743 + }, + { + "epoch": 2.03, + "grad_norm": 26.185956954956055, + "learning_rate": 6.482910694597575e-06, + "loss": 1.1309, + "step": 6744 + }, + { + "epoch": 2.03, + "grad_norm": 17.040918350219727, + "learning_rate": 6.4809060839931845e-06, + "loss": 1.0778, + "step": 6745 + }, + { + "epoch": 2.03, + "grad_norm": 23.627857208251953, + "learning_rate": 6.4789014733887946e-06, + "loss": 1.1968, + "step": 6746 + }, + { + "epoch": 2.03, + "grad_norm": 29.324899673461914, + "learning_rate": 6.476896862784405e-06, + "loss": 1.9221, + "step": 6747 + }, + { + "epoch": 2.03, + "grad_norm": 24.806861877441406, + "learning_rate": 6.474892252180015e-06, + "loss": 1.0605, + "step": 6748 + }, + { + "epoch": 2.03, + "grad_norm": 32.63029098510742, + "learning_rate": 6.472887641575625e-06, + "loss": 1.4815, + "step": 6749 + }, + { + "epoch": 2.03, + "grad_norm": 17.42801284790039, + "learning_rate": 6.470883030971235e-06, + "loss": 1.6286, + "step": 6750 + }, + { + "epoch": 2.03, + "grad_norm": 40.37263488769531, + "learning_rate": 6.468878420366844e-06, + "loss": 1.6338, + "step": 6751 + }, + { + "epoch": 2.03, + "grad_norm": 17.74640464782715, + "learning_rate": 6.466873809762454e-06, + "loss": 1.2096, + "step": 6752 + }, + { + "epoch": 2.03, + "grad_norm": 26.730159759521484, + "learning_rate": 6.4648691991580634e-06, + "loss": 2.1374, + "step": 6753 + }, + { + "epoch": 2.03, + "grad_norm": 13.699724197387695, + "learning_rate": 6.462864588553674e-06, + "loss": 1.7276, + "step": 6754 + }, + { + "epoch": 2.03, + "grad_norm": 26.75263023376465, + "learning_rate": 6.4608599779492845e-06, + "loss": 1.0525, + "step": 6755 + }, + { + "epoch": 2.03, + "grad_norm": 24.192272186279297, + "learning_rate": 6.458855367344894e-06, + "loss": 1.4798, + "step": 6756 + }, + { + "epoch": 2.03, + "grad_norm": 62.38142013549805, + "learning_rate": 6.456850756740504e-06, + "loss": 1.8068, + "step": 6757 + }, + { + "epoch": 2.03, + "grad_norm": 29.921567916870117, + "learning_rate": 6.454846146136113e-06, + "loss": 1.619, + "step": 6758 + }, + { + "epoch": 2.03, + "grad_norm": 6.582921981811523, + "learning_rate": 6.452841535531723e-06, + "loss": 0.648, + "step": 6759 + }, + { + "epoch": 2.03, + "grad_norm": 13.459547996520996, + "learning_rate": 6.450836924927334e-06, + "loss": 1.4701, + "step": 6760 + }, + { + "epoch": 2.03, + "grad_norm": 19.61090660095215, + "learning_rate": 6.448832314322943e-06, + "loss": 1.2505, + "step": 6761 + }, + { + "epoch": 2.03, + "grad_norm": 32.77788543701172, + "learning_rate": 6.446827703718553e-06, + "loss": 1.9251, + "step": 6762 + }, + { + "epoch": 2.03, + "grad_norm": 7.951191425323486, + "learning_rate": 6.4448230931141634e-06, + "loss": 0.7957, + "step": 6763 + }, + { + "epoch": 2.03, + "grad_norm": 13.6834135055542, + "learning_rate": 6.442818482509773e-06, + "loss": 1.488, + "step": 6764 + }, + { + "epoch": 2.03, + "grad_norm": 26.919858932495117, + "learning_rate": 6.440813871905383e-06, + "loss": 1.6017, + "step": 6765 + }, + { + "epoch": 2.03, + "grad_norm": 18.014230728149414, + "learning_rate": 6.438809261300992e-06, + "loss": 0.8233, + "step": 6766 + }, + { + "epoch": 2.03, + "grad_norm": 18.699932098388672, + "learning_rate": 6.436804650696603e-06, + "loss": 1.1873, + "step": 6767 + }, + { + "epoch": 2.03, + "grad_norm": 24.57878303527832, + "learning_rate": 6.434800040092213e-06, + "loss": 1.7365, + "step": 6768 + }, + { + "epoch": 2.04, + "grad_norm": 30.225631713867188, + "learning_rate": 6.432795429487822e-06, + "loss": 2.7789, + "step": 6769 + }, + { + "epoch": 2.04, + "grad_norm": 22.14498519897461, + "learning_rate": 6.430790818883432e-06, + "loss": 1.6425, + "step": 6770 + }, + { + "epoch": 2.04, + "grad_norm": 12.216958045959473, + "learning_rate": 6.428786208279042e-06, + "loss": 1.7486, + "step": 6771 + }, + { + "epoch": 2.04, + "grad_norm": 26.501882553100586, + "learning_rate": 6.426781597674652e-06, + "loss": 1.3613, + "step": 6772 + }, + { + "epoch": 2.04, + "grad_norm": 30.44036293029785, + "learning_rate": 6.424776987070263e-06, + "loss": 1.2315, + "step": 6773 + }, + { + "epoch": 2.04, + "grad_norm": 19.72119140625, + "learning_rate": 6.422772376465873e-06, + "loss": 1.8577, + "step": 6774 + }, + { + "epoch": 2.04, + "grad_norm": 11.062511444091797, + "learning_rate": 6.420767765861482e-06, + "loss": 1.4272, + "step": 6775 + }, + { + "epoch": 2.04, + "grad_norm": 16.308460235595703, + "learning_rate": 6.418763155257092e-06, + "loss": 1.3599, + "step": 6776 + }, + { + "epoch": 2.04, + "grad_norm": 35.35619354248047, + "learning_rate": 6.416758544652701e-06, + "loss": 1.477, + "step": 6777 + }, + { + "epoch": 2.04, + "grad_norm": 13.431120872497559, + "learning_rate": 6.414753934048311e-06, + "loss": 0.9098, + "step": 6778 + }, + { + "epoch": 2.04, + "grad_norm": 22.69354820251465, + "learning_rate": 6.412749323443922e-06, + "loss": 1.3123, + "step": 6779 + }, + { + "epoch": 2.04, + "grad_norm": 41.717098236083984, + "learning_rate": 6.4107447128395315e-06, + "loss": 2.688, + "step": 6780 + }, + { + "epoch": 2.04, + "grad_norm": 9.498014450073242, + "learning_rate": 6.4087401022351416e-06, + "loss": 0.8678, + "step": 6781 + }, + { + "epoch": 2.04, + "grad_norm": 10.49351692199707, + "learning_rate": 6.406735491630751e-06, + "loss": 1.8695, + "step": 6782 + }, + { + "epoch": 2.04, + "grad_norm": 31.190580368041992, + "learning_rate": 6.404730881026361e-06, + "loss": 1.78, + "step": 6783 + }, + { + "epoch": 2.04, + "grad_norm": 15.785634994506836, + "learning_rate": 6.402726270421971e-06, + "loss": 1.0891, + "step": 6784 + }, + { + "epoch": 2.04, + "grad_norm": 14.310815811157227, + "learning_rate": 6.400721659817581e-06, + "loss": 0.9196, + "step": 6785 + }, + { + "epoch": 2.04, + "grad_norm": 38.41286087036133, + "learning_rate": 6.398717049213191e-06, + "loss": 2.4826, + "step": 6786 + }, + { + "epoch": 2.04, + "grad_norm": 10.610367774963379, + "learning_rate": 6.396712438608801e-06, + "loss": 1.0001, + "step": 6787 + }, + { + "epoch": 2.04, + "grad_norm": 9.231528282165527, + "learning_rate": 6.3947078280044105e-06, + "loss": 1.0759, + "step": 6788 + }, + { + "epoch": 2.04, + "grad_norm": 7.4241862297058105, + "learning_rate": 6.3927032174000206e-06, + "loss": 1.9806, + "step": 6789 + }, + { + "epoch": 2.04, + "grad_norm": 14.655599594116211, + "learning_rate": 6.39069860679563e-06, + "loss": 1.3411, + "step": 6790 + }, + { + "epoch": 2.04, + "grad_norm": 10.586833000183105, + "learning_rate": 6.388693996191241e-06, + "loss": 0.9932, + "step": 6791 + }, + { + "epoch": 2.04, + "grad_norm": 19.10291290283203, + "learning_rate": 6.386689385586851e-06, + "loss": 1.5976, + "step": 6792 + }, + { + "epoch": 2.04, + "grad_norm": 15.874801635742188, + "learning_rate": 6.38468477498246e-06, + "loss": 0.897, + "step": 6793 + }, + { + "epoch": 2.04, + "grad_norm": 13.638111114501953, + "learning_rate": 6.38268016437807e-06, + "loss": 0.7186, + "step": 6794 + }, + { + "epoch": 2.04, + "grad_norm": 9.389366149902344, + "learning_rate": 6.380675553773679e-06, + "loss": 0.9544, + "step": 6795 + }, + { + "epoch": 2.04, + "grad_norm": 26.145200729370117, + "learning_rate": 6.3786709431692894e-06, + "loss": 1.3817, + "step": 6796 + }, + { + "epoch": 2.04, + "grad_norm": 12.559453010559082, + "learning_rate": 6.3766663325649e-06, + "loss": 1.5977, + "step": 6797 + }, + { + "epoch": 2.04, + "grad_norm": 9.843866348266602, + "learning_rate": 6.37466172196051e-06, + "loss": 0.9535, + "step": 6798 + }, + { + "epoch": 2.04, + "grad_norm": 15.275118827819824, + "learning_rate": 6.37265711135612e-06, + "loss": 1.3151, + "step": 6799 + }, + { + "epoch": 2.04, + "grad_norm": 18.97870635986328, + "learning_rate": 6.37065250075173e-06, + "loss": 1.6633, + "step": 6800 + }, + { + "epoch": 2.04, + "grad_norm": 11.981043815612793, + "learning_rate": 6.368647890147339e-06, + "loss": 1.0551, + "step": 6801 + }, + { + "epoch": 2.05, + "grad_norm": 25.228471755981445, + "learning_rate": 6.366643279542949e-06, + "loss": 1.4122, + "step": 6802 + }, + { + "epoch": 2.05, + "grad_norm": 46.05463790893555, + "learning_rate": 6.36463866893856e-06, + "loss": 2.192, + "step": 6803 + }, + { + "epoch": 2.05, + "grad_norm": 28.93854522705078, + "learning_rate": 6.362634058334169e-06, + "loss": 1.235, + "step": 6804 + }, + { + "epoch": 2.05, + "grad_norm": 22.576128005981445, + "learning_rate": 6.360629447729779e-06, + "loss": 1.61, + "step": 6805 + }, + { + "epoch": 2.05, + "grad_norm": 8.201756477355957, + "learning_rate": 6.358624837125389e-06, + "loss": 1.3716, + "step": 6806 + }, + { + "epoch": 2.05, + "grad_norm": 9.097603797912598, + "learning_rate": 6.356620226520999e-06, + "loss": 0.69, + "step": 6807 + }, + { + "epoch": 2.05, + "grad_norm": 26.398914337158203, + "learning_rate": 6.354615615916609e-06, + "loss": 2.6075, + "step": 6808 + }, + { + "epoch": 2.05, + "grad_norm": 15.047880172729492, + "learning_rate": 6.352611005312218e-06, + "loss": 1.2919, + "step": 6809 + }, + { + "epoch": 2.05, + "grad_norm": 42.7448616027832, + "learning_rate": 6.350606394707829e-06, + "loss": 1.1902, + "step": 6810 + }, + { + "epoch": 2.05, + "grad_norm": 29.7187557220459, + "learning_rate": 6.348601784103439e-06, + "loss": 1.9144, + "step": 6811 + }, + { + "epoch": 2.05, + "grad_norm": 13.130889892578125, + "learning_rate": 6.346597173499048e-06, + "loss": 0.9165, + "step": 6812 + }, + { + "epoch": 2.05, + "grad_norm": 14.863086700439453, + "learning_rate": 6.344592562894658e-06, + "loss": 2.3026, + "step": 6813 + }, + { + "epoch": 2.05, + "grad_norm": 31.419349670410156, + "learning_rate": 6.342587952290268e-06, + "loss": 1.929, + "step": 6814 + }, + { + "epoch": 2.05, + "grad_norm": 16.03536605834961, + "learning_rate": 6.340583341685878e-06, + "loss": 1.0579, + "step": 6815 + }, + { + "epoch": 2.05, + "grad_norm": 9.93125057220459, + "learning_rate": 6.338578731081489e-06, + "loss": 1.0248, + "step": 6816 + }, + { + "epoch": 2.05, + "grad_norm": 20.598365783691406, + "learning_rate": 6.336574120477098e-06, + "loss": 1.0575, + "step": 6817 + }, + { + "epoch": 2.05, + "grad_norm": 24.11957359313965, + "learning_rate": 6.334569509872708e-06, + "loss": 2.0264, + "step": 6818 + }, + { + "epoch": 2.05, + "grad_norm": 12.065218925476074, + "learning_rate": 6.332564899268317e-06, + "loss": 0.6793, + "step": 6819 + }, + { + "epoch": 2.05, + "grad_norm": 43.54728698730469, + "learning_rate": 6.330560288663927e-06, + "loss": 2.746, + "step": 6820 + }, + { + "epoch": 2.05, + "grad_norm": 12.573387145996094, + "learning_rate": 6.328555678059537e-06, + "loss": 1.0912, + "step": 6821 + }, + { + "epoch": 2.05, + "grad_norm": 53.12962341308594, + "learning_rate": 6.326551067455147e-06, + "loss": 2.5651, + "step": 6822 + }, + { + "epoch": 2.05, + "grad_norm": 27.892974853515625, + "learning_rate": 6.3245464568507575e-06, + "loss": 2.1102, + "step": 6823 + }, + { + "epoch": 2.05, + "grad_norm": 11.939574241638184, + "learning_rate": 6.3225418462463676e-06, + "loss": 1.0358, + "step": 6824 + }, + { + "epoch": 2.05, + "grad_norm": 15.528552055358887, + "learning_rate": 6.320537235641977e-06, + "loss": 1.43, + "step": 6825 + }, + { + "epoch": 2.05, + "grad_norm": 19.028608322143555, + "learning_rate": 6.318532625037587e-06, + "loss": 1.3059, + "step": 6826 + }, + { + "epoch": 2.05, + "grad_norm": 29.945940017700195, + "learning_rate": 6.316528014433196e-06, + "loss": 1.4528, + "step": 6827 + }, + { + "epoch": 2.05, + "grad_norm": 8.454259872436523, + "learning_rate": 6.314523403828807e-06, + "loss": 0.9366, + "step": 6828 + }, + { + "epoch": 2.05, + "grad_norm": 19.378828048706055, + "learning_rate": 6.312518793224417e-06, + "loss": 0.8679, + "step": 6829 + }, + { + "epoch": 2.05, + "grad_norm": 20.16804313659668, + "learning_rate": 6.310514182620026e-06, + "loss": 1.6893, + "step": 6830 + }, + { + "epoch": 2.05, + "grad_norm": 9.753872871398926, + "learning_rate": 6.3085095720156365e-06, + "loss": 1.1253, + "step": 6831 + }, + { + "epoch": 2.05, + "grad_norm": 14.549400329589844, + "learning_rate": 6.306504961411246e-06, + "loss": 1.3079, + "step": 6832 + }, + { + "epoch": 2.05, + "grad_norm": 15.794536590576172, + "learning_rate": 6.304500350806856e-06, + "loss": 1.394, + "step": 6833 + }, + { + "epoch": 2.05, + "grad_norm": 32.083927154541016, + "learning_rate": 6.302495740202467e-06, + "loss": 1.716, + "step": 6834 + }, + { + "epoch": 2.06, + "grad_norm": 12.402176856994629, + "learning_rate": 6.300491129598077e-06, + "loss": 1.0654, + "step": 6835 + }, + { + "epoch": 2.06, + "grad_norm": 16.35983657836914, + "learning_rate": 6.298486518993686e-06, + "loss": 1.1802, + "step": 6836 + }, + { + "epoch": 2.06, + "grad_norm": 27.28036117553711, + "learning_rate": 6.296481908389296e-06, + "loss": 1.8658, + "step": 6837 + }, + { + "epoch": 2.06, + "grad_norm": 10.746267318725586, + "learning_rate": 6.294477297784905e-06, + "loss": 1.2586, + "step": 6838 + }, + { + "epoch": 2.06, + "grad_norm": 27.72111701965332, + "learning_rate": 6.2924726871805155e-06, + "loss": 2.0948, + "step": 6839 + }, + { + "epoch": 2.06, + "grad_norm": 13.532320976257324, + "learning_rate": 6.290468076576126e-06, + "loss": 1.2246, + "step": 6840 + }, + { + "epoch": 2.06, + "eval_loss": 0.18061862885951996, + "eval_runtime": 43.8871, + "eval_samples_per_second": 33.7, + "eval_steps_per_second": 33.7, + "step": 6840 + }, + { + "epoch": 2.06, + "grad_norm": 23.459665298461914, + "learning_rate": 6.288463465971736e-06, + "loss": 1.4494, + "step": 6841 + }, + { + "epoch": 2.06, + "grad_norm": 13.611770629882812, + "learning_rate": 6.286458855367346e-06, + "loss": 1.1819, + "step": 6842 + }, + { + "epoch": 2.06, + "grad_norm": 25.924118041992188, + "learning_rate": 6.284454244762955e-06, + "loss": 1.0744, + "step": 6843 + }, + { + "epoch": 2.06, + "grad_norm": 19.138996124267578, + "learning_rate": 6.282449634158565e-06, + "loss": 1.2561, + "step": 6844 + }, + { + "epoch": 2.06, + "grad_norm": 46.55351638793945, + "learning_rate": 6.280445023554175e-06, + "loss": 1.4002, + "step": 6845 + }, + { + "epoch": 2.06, + "grad_norm": 16.169979095458984, + "learning_rate": 6.278440412949785e-06, + "loss": 1.2913, + "step": 6846 + }, + { + "epoch": 2.06, + "grad_norm": 37.923133850097656, + "learning_rate": 6.276435802345395e-06, + "loss": 1.5886, + "step": 6847 + }, + { + "epoch": 2.06, + "grad_norm": 57.48485565185547, + "learning_rate": 6.274431191741005e-06, + "loss": 2.0952, + "step": 6848 + }, + { + "epoch": 2.06, + "grad_norm": 13.789143562316895, + "learning_rate": 6.272426581136615e-06, + "loss": 1.712, + "step": 6849 + }, + { + "epoch": 2.06, + "grad_norm": 21.859037399291992, + "learning_rate": 6.270421970532225e-06, + "loss": 2.0063, + "step": 6850 + }, + { + "epoch": 2.06, + "grad_norm": 13.258837699890137, + "learning_rate": 6.268417359927834e-06, + "loss": 1.2955, + "step": 6851 + }, + { + "epoch": 2.06, + "grad_norm": 8.517374038696289, + "learning_rate": 6.266412749323444e-06, + "loss": 1.0369, + "step": 6852 + }, + { + "epoch": 2.06, + "grad_norm": 10.359166145324707, + "learning_rate": 6.264408138719055e-06, + "loss": 1.0215, + "step": 6853 + }, + { + "epoch": 2.06, + "grad_norm": 28.84781837463379, + "learning_rate": 6.262403528114664e-06, + "loss": 1.9747, + "step": 6854 + }, + { + "epoch": 2.06, + "grad_norm": 11.506865501403809, + "learning_rate": 6.260398917510274e-06, + "loss": 1.1076, + "step": 6855 + }, + { + "epoch": 2.06, + "grad_norm": 15.466108322143555, + "learning_rate": 6.2583943069058835e-06, + "loss": 1.2779, + "step": 6856 + }, + { + "epoch": 2.06, + "grad_norm": 31.154294967651367, + "learning_rate": 6.256389696301494e-06, + "loss": 1.9285, + "step": 6857 + }, + { + "epoch": 2.06, + "grad_norm": 15.335009574890137, + "learning_rate": 6.254385085697104e-06, + "loss": 1.6327, + "step": 6858 + }, + { + "epoch": 2.06, + "grad_norm": 14.675117492675781, + "learning_rate": 6.252380475092715e-06, + "loss": 1.644, + "step": 6859 + }, + { + "epoch": 2.06, + "grad_norm": 133.6639862060547, + "learning_rate": 6.250375864488324e-06, + "loss": 1.1432, + "step": 6860 + }, + { + "epoch": 2.06, + "grad_norm": 39.901222229003906, + "learning_rate": 6.248371253883934e-06, + "loss": 2.097, + "step": 6861 + }, + { + "epoch": 2.06, + "grad_norm": 13.691509246826172, + "learning_rate": 6.246366643279543e-06, + "loss": 1.6336, + "step": 6862 + }, + { + "epoch": 2.06, + "grad_norm": 14.485090255737305, + "learning_rate": 6.244362032675153e-06, + "loss": 1.3259, + "step": 6863 + }, + { + "epoch": 2.06, + "grad_norm": 72.5770492553711, + "learning_rate": 6.2423574220707625e-06, + "loss": 1.6907, + "step": 6864 + }, + { + "epoch": 2.06, + "grad_norm": 10.634366989135742, + "learning_rate": 6.240352811466373e-06, + "loss": 0.5027, + "step": 6865 + }, + { + "epoch": 2.06, + "grad_norm": 33.018367767333984, + "learning_rate": 6.2383482008619835e-06, + "loss": 1.7409, + "step": 6866 + }, + { + "epoch": 2.06, + "grad_norm": 9.623235702514648, + "learning_rate": 6.236343590257593e-06, + "loss": 1.715, + "step": 6867 + }, + { + "epoch": 2.06, + "grad_norm": 47.73927307128906, + "learning_rate": 6.234338979653203e-06, + "loss": 1.4383, + "step": 6868 + }, + { + "epoch": 2.07, + "grad_norm": 9.162986755371094, + "learning_rate": 6.232334369048813e-06, + "loss": 1.1415, + "step": 6869 + }, + { + "epoch": 2.07, + "grad_norm": 41.81623077392578, + "learning_rate": 6.230329758444422e-06, + "loss": 1.6832, + "step": 6870 + }, + { + "epoch": 2.07, + "grad_norm": 21.047195434570312, + "learning_rate": 6.228325147840033e-06, + "loss": 1.202, + "step": 6871 + }, + { + "epoch": 2.07, + "grad_norm": 19.954036712646484, + "learning_rate": 6.226320537235643e-06, + "loss": 1.4934, + "step": 6872 + }, + { + "epoch": 2.07, + "grad_norm": 24.60007095336914, + "learning_rate": 6.224315926631252e-06, + "loss": 0.9536, + "step": 6873 + }, + { + "epoch": 2.07, + "grad_norm": 23.833621978759766, + "learning_rate": 6.2223113160268625e-06, + "loss": 2.561, + "step": 6874 + }, + { + "epoch": 2.07, + "grad_norm": 51.6407585144043, + "learning_rate": 6.220306705422472e-06, + "loss": 1.8729, + "step": 6875 + }, + { + "epoch": 2.07, + "grad_norm": 9.72765827178955, + "learning_rate": 6.218302094818082e-06, + "loss": 1.6889, + "step": 6876 + }, + { + "epoch": 2.07, + "grad_norm": 86.259765625, + "learning_rate": 6.216297484213693e-06, + "loss": 1.6229, + "step": 6877 + }, + { + "epoch": 2.07, + "grad_norm": 12.195944786071777, + "learning_rate": 6.214292873609302e-06, + "loss": 1.7568, + "step": 6878 + }, + { + "epoch": 2.07, + "grad_norm": 41.19993591308594, + "learning_rate": 6.212288263004912e-06, + "loss": 1.2259, + "step": 6879 + }, + { + "epoch": 2.07, + "grad_norm": 10.510581970214844, + "learning_rate": 6.210283652400521e-06, + "loss": 1.1598, + "step": 6880 + }, + { + "epoch": 2.07, + "grad_norm": 14.523313522338867, + "learning_rate": 6.208279041796131e-06, + "loss": 1.3958, + "step": 6881 + }, + { + "epoch": 2.07, + "grad_norm": 11.1996431350708, + "learning_rate": 6.2062744311917415e-06, + "loss": 1.6634, + "step": 6882 + }, + { + "epoch": 2.07, + "grad_norm": 17.171239852905273, + "learning_rate": 6.2042698205873515e-06, + "loss": 1.8742, + "step": 6883 + }, + { + "epoch": 2.07, + "grad_norm": 83.33916473388672, + "learning_rate": 6.202265209982962e-06, + "loss": 2.0191, + "step": 6884 + }, + { + "epoch": 2.07, + "grad_norm": 13.033934593200684, + "learning_rate": 6.200260599378572e-06, + "loss": 0.9672, + "step": 6885 + }, + { + "epoch": 2.07, + "grad_norm": 25.13782501220703, + "learning_rate": 6.198255988774181e-06, + "loss": 1.5467, + "step": 6886 + }, + { + "epoch": 2.07, + "grad_norm": 129.21408081054688, + "learning_rate": 6.196251378169791e-06, + "loss": 1.8225, + "step": 6887 + }, + { + "epoch": 2.07, + "grad_norm": 15.809123992919922, + "learning_rate": 6.1942467675654e-06, + "loss": 1.1815, + "step": 6888 + }, + { + "epoch": 2.07, + "grad_norm": 10.88241195678711, + "learning_rate": 6.19224215696101e-06, + "loss": 0.5932, + "step": 6889 + }, + { + "epoch": 2.07, + "grad_norm": 17.281024932861328, + "learning_rate": 6.190237546356621e-06, + "loss": 0.6785, + "step": 6890 + }, + { + "epoch": 2.07, + "grad_norm": 78.56420135498047, + "learning_rate": 6.1882329357522305e-06, + "loss": 2.5001, + "step": 6891 + }, + { + "epoch": 2.07, + "grad_norm": 7.025608539581299, + "learning_rate": 6.186228325147841e-06, + "loss": 0.9645, + "step": 6892 + }, + { + "epoch": 2.07, + "grad_norm": 17.67293930053711, + "learning_rate": 6.184223714543451e-06, + "loss": 1.5476, + "step": 6893 + }, + { + "epoch": 2.07, + "grad_norm": 54.84214782714844, + "learning_rate": 6.18221910393906e-06, + "loss": 2.5763, + "step": 6894 + }, + { + "epoch": 2.07, + "grad_norm": 16.025243759155273, + "learning_rate": 6.18021449333467e-06, + "loss": 1.0716, + "step": 6895 + }, + { + "epoch": 2.07, + "grad_norm": 19.868698120117188, + "learning_rate": 6.178209882730281e-06, + "loss": 1.6816, + "step": 6896 + }, + { + "epoch": 2.07, + "grad_norm": 23.872350692749023, + "learning_rate": 6.17620527212589e-06, + "loss": 1.1166, + "step": 6897 + }, + { + "epoch": 2.07, + "grad_norm": 17.220922470092773, + "learning_rate": 6.1742006615215e-06, + "loss": 1.601, + "step": 6898 + }, + { + "epoch": 2.07, + "grad_norm": 37.89704895019531, + "learning_rate": 6.1721960509171095e-06, + "loss": 1.2833, + "step": 6899 + }, + { + "epoch": 2.07, + "grad_norm": 10.233612060546875, + "learning_rate": 6.17019144031272e-06, + "loss": 1.137, + "step": 6900 + }, + { + "epoch": 2.07, + "grad_norm": 14.788456916809082, + "learning_rate": 6.168186829708329e-06, + "loss": 0.7968, + "step": 6901 + }, + { + "epoch": 2.08, + "grad_norm": 47.135398864746094, + "learning_rate": 6.16618221910394e-06, + "loss": 1.8272, + "step": 6902 + }, + { + "epoch": 2.08, + "grad_norm": 48.95859146118164, + "learning_rate": 6.16417760849955e-06, + "loss": 1.8223, + "step": 6903 + }, + { + "epoch": 2.08, + "grad_norm": 9.25454044342041, + "learning_rate": 6.162172997895159e-06, + "loss": 0.9862, + "step": 6904 + }, + { + "epoch": 2.08, + "grad_norm": 34.636192321777344, + "learning_rate": 6.160168387290769e-06, + "loss": 1.5629, + "step": 6905 + }, + { + "epoch": 2.08, + "grad_norm": 51.81803512573242, + "learning_rate": 6.158163776686379e-06, + "loss": 1.5237, + "step": 6906 + }, + { + "epoch": 2.08, + "grad_norm": 39.07495880126953, + "learning_rate": 6.1561591660819885e-06, + "loss": 1.1899, + "step": 6907 + }, + { + "epoch": 2.08, + "grad_norm": 22.247758865356445, + "learning_rate": 6.154154555477599e-06, + "loss": 0.9568, + "step": 6908 + }, + { + "epoch": 2.08, + "grad_norm": 15.606700897216797, + "learning_rate": 6.1521499448732095e-06, + "loss": 1.5251, + "step": 6909 + }, + { + "epoch": 2.08, + "grad_norm": 17.546934127807617, + "learning_rate": 6.150145334268819e-06, + "loss": 1.1845, + "step": 6910 + }, + { + "epoch": 2.08, + "grad_norm": 7.191950798034668, + "learning_rate": 6.148140723664429e-06, + "loss": 0.7725, + "step": 6911 + }, + { + "epoch": 2.08, + "grad_norm": 10.01009464263916, + "learning_rate": 6.146136113060038e-06, + "loss": 1.0729, + "step": 6912 + }, + { + "epoch": 2.08, + "grad_norm": 9.743487358093262, + "learning_rate": 6.144131502455648e-06, + "loss": 1.2371, + "step": 6913 + }, + { + "epoch": 2.08, + "grad_norm": 21.309419631958008, + "learning_rate": 6.142126891851259e-06, + "loss": 2.0328, + "step": 6914 + }, + { + "epoch": 2.08, + "grad_norm": 14.136860847473145, + "learning_rate": 6.140122281246868e-06, + "loss": 1.4086, + "step": 6915 + }, + { + "epoch": 2.08, + "grad_norm": 12.133606910705566, + "learning_rate": 6.138117670642478e-06, + "loss": 1.3393, + "step": 6916 + }, + { + "epoch": 2.08, + "grad_norm": 8.155012130737305, + "learning_rate": 6.136113060038088e-06, + "loss": 0.8114, + "step": 6917 + }, + { + "epoch": 2.08, + "grad_norm": 14.2897367477417, + "learning_rate": 6.134108449433698e-06, + "loss": 1.0815, + "step": 6918 + }, + { + "epoch": 2.08, + "grad_norm": 37.01668930053711, + "learning_rate": 6.132103838829308e-06, + "loss": 2.4772, + "step": 6919 + }, + { + "epoch": 2.08, + "grad_norm": 9.246944427490234, + "learning_rate": 6.130099228224919e-06, + "loss": 1.0533, + "step": 6920 + }, + { + "epoch": 2.08, + "grad_norm": 14.934267044067383, + "learning_rate": 6.128094617620528e-06, + "loss": 0.8619, + "step": 6921 + }, + { + "epoch": 2.08, + "grad_norm": 25.984840393066406, + "learning_rate": 6.126090007016138e-06, + "loss": 1.3991, + "step": 6922 + }, + { + "epoch": 2.08, + "grad_norm": 23.823701858520508, + "learning_rate": 6.124085396411747e-06, + "loss": 2.2563, + "step": 6923 + }, + { + "epoch": 2.08, + "grad_norm": 25.616357803344727, + "learning_rate": 6.122080785807357e-06, + "loss": 1.559, + "step": 6924 + }, + { + "epoch": 2.08, + "grad_norm": 25.483346939086914, + "learning_rate": 6.120076175202967e-06, + "loss": 1.8079, + "step": 6925 + }, + { + "epoch": 2.08, + "grad_norm": 26.09974479675293, + "learning_rate": 6.1180715645985775e-06, + "loss": 1.8173, + "step": 6926 + }, + { + "epoch": 2.08, + "grad_norm": 33.826454162597656, + "learning_rate": 6.116066953994188e-06, + "loss": 1.2576, + "step": 6927 + }, + { + "epoch": 2.08, + "grad_norm": 22.255189895629883, + "learning_rate": 6.114062343389797e-06, + "loss": 1.5849, + "step": 6928 + }, + { + "epoch": 2.08, + "grad_norm": 23.203243255615234, + "learning_rate": 6.112057732785407e-06, + "loss": 2.0366, + "step": 6929 + }, + { + "epoch": 2.08, + "grad_norm": 48.67564010620117, + "learning_rate": 6.110053122181017e-06, + "loss": 1.4752, + "step": 6930 + }, + { + "epoch": 2.08, + "grad_norm": 13.649922370910645, + "learning_rate": 6.108048511576626e-06, + "loss": 1.0061, + "step": 6931 + }, + { + "epoch": 2.08, + "grad_norm": 15.346248626708984, + "learning_rate": 6.106043900972236e-06, + "loss": 0.975, + "step": 6932 + }, + { + "epoch": 2.08, + "grad_norm": 43.33583450317383, + "learning_rate": 6.104039290367847e-06, + "loss": 1.8835, + "step": 6933 + }, + { + "epoch": 2.08, + "grad_norm": 11.801863670349121, + "learning_rate": 6.1020346797634565e-06, + "loss": 0.9421, + "step": 6934 + }, + { + "epoch": 2.09, + "grad_norm": 13.316075325012207, + "learning_rate": 6.100030069159067e-06, + "loss": 0.9621, + "step": 6935 + }, + { + "epoch": 2.09, + "grad_norm": 7.643853187561035, + "learning_rate": 6.098025458554676e-06, + "loss": 0.7109, + "step": 6936 + }, + { + "epoch": 2.09, + "grad_norm": 6.656645774841309, + "learning_rate": 6.096020847950286e-06, + "loss": 0.9677, + "step": 6937 + }, + { + "epoch": 2.09, + "grad_norm": 8.41319751739502, + "learning_rate": 6.094016237345895e-06, + "loss": 1.4541, + "step": 6938 + }, + { + "epoch": 2.09, + "grad_norm": 17.172225952148438, + "learning_rate": 6.092011626741506e-06, + "loss": 1.2853, + "step": 6939 + }, + { + "epoch": 2.09, + "grad_norm": 13.03164005279541, + "learning_rate": 6.090007016137116e-06, + "loss": 1.2991, + "step": 6940 + }, + { + "epoch": 2.09, + "grad_norm": 26.83146095275879, + "learning_rate": 6.088002405532725e-06, + "loss": 1.2144, + "step": 6941 + }, + { + "epoch": 2.09, + "grad_norm": 15.566588401794434, + "learning_rate": 6.0859977949283355e-06, + "loss": 1.1742, + "step": 6942 + }, + { + "epoch": 2.09, + "grad_norm": 11.731130599975586, + "learning_rate": 6.083993184323946e-06, + "loss": 1.5685, + "step": 6943 + }, + { + "epoch": 2.09, + "grad_norm": 23.45868492126465, + "learning_rate": 6.081988573719555e-06, + "loss": 1.1652, + "step": 6944 + }, + { + "epoch": 2.09, + "grad_norm": 30.693378448486328, + "learning_rate": 6.079983963115166e-06, + "loss": 1.1192, + "step": 6945 + }, + { + "epoch": 2.09, + "grad_norm": 76.05176544189453, + "learning_rate": 6.077979352510776e-06, + "loss": 1.03, + "step": 6946 + }, + { + "epoch": 2.09, + "grad_norm": 42.937931060791016, + "learning_rate": 6.075974741906385e-06, + "loss": 2.9801, + "step": 6947 + }, + { + "epoch": 2.09, + "grad_norm": 21.51824188232422, + "learning_rate": 6.073970131301995e-06, + "loss": 1.3744, + "step": 6948 + }, + { + "epoch": 2.09, + "grad_norm": 13.035652160644531, + "learning_rate": 6.071965520697604e-06, + "loss": 1.2832, + "step": 6949 + }, + { + "epoch": 2.09, + "grad_norm": 7.819190502166748, + "learning_rate": 6.0699609100932145e-06, + "loss": 1.3225, + "step": 6950 + }, + { + "epoch": 2.09, + "grad_norm": 10.307256698608398, + "learning_rate": 6.067956299488825e-06, + "loss": 1.1998, + "step": 6951 + }, + { + "epoch": 2.09, + "grad_norm": 43.879268646240234, + "learning_rate": 6.065951688884435e-06, + "loss": 1.2899, + "step": 6952 + }, + { + "epoch": 2.09, + "grad_norm": 14.263348579406738, + "learning_rate": 6.063947078280045e-06, + "loss": 0.8165, + "step": 6953 + }, + { + "epoch": 2.09, + "grad_norm": 19.4408016204834, + "learning_rate": 6.061942467675655e-06, + "loss": 1.2222, + "step": 6954 + }, + { + "epoch": 2.09, + "grad_norm": 21.61421012878418, + "learning_rate": 6.059937857071264e-06, + "loss": 1.6328, + "step": 6955 + }, + { + "epoch": 2.09, + "grad_norm": 59.171634674072266, + "learning_rate": 6.057933246466874e-06, + "loss": 2.2851, + "step": 6956 + }, + { + "epoch": 2.09, + "grad_norm": 17.417770385742188, + "learning_rate": 6.055928635862485e-06, + "loss": 1.5558, + "step": 6957 + }, + { + "epoch": 2.09, + "grad_norm": 21.351945877075195, + "learning_rate": 6.053924025258094e-06, + "loss": 1.9198, + "step": 6958 + }, + { + "epoch": 2.09, + "grad_norm": 21.162813186645508, + "learning_rate": 6.051919414653704e-06, + "loss": 0.8733, + "step": 6959 + }, + { + "epoch": 2.09, + "grad_norm": 28.082653045654297, + "learning_rate": 6.049914804049314e-06, + "loss": 1.3162, + "step": 6960 + }, + { + "epoch": 2.09, + "eval_loss": 0.1762227714061737, + "eval_runtime": 43.6018, + "eval_samples_per_second": 33.921, + "eval_steps_per_second": 33.921, + "step": 6960 + }, + { + "epoch": 2.09, + "grad_norm": 17.02800941467285, + "learning_rate": 6.047910193444924e-06, + "loss": 1.4449, + "step": 6961 + }, + { + "epoch": 2.09, + "grad_norm": 48.51543045043945, + "learning_rate": 6.045905582840533e-06, + "loss": 1.0376, + "step": 6962 + }, + { + "epoch": 2.09, + "grad_norm": 30.203258514404297, + "learning_rate": 6.043900972236144e-06, + "loss": 1.8892, + "step": 6963 + }, + { + "epoch": 2.09, + "grad_norm": 22.062679290771484, + "learning_rate": 6.041896361631754e-06, + "loss": 2.034, + "step": 6964 + }, + { + "epoch": 2.09, + "grad_norm": 29.935195922851562, + "learning_rate": 6.039891751027363e-06, + "loss": 2.1552, + "step": 6965 + }, + { + "epoch": 2.09, + "grad_norm": 21.121536254882812, + "learning_rate": 6.037887140422973e-06, + "loss": 1.848, + "step": 6966 + }, + { + "epoch": 2.09, + "grad_norm": 13.469535827636719, + "learning_rate": 6.035882529818583e-06, + "loss": 0.8718, + "step": 6967 + }, + { + "epoch": 2.1, + "grad_norm": 9.78769302368164, + "learning_rate": 6.033877919214193e-06, + "loss": 0.5627, + "step": 6968 + }, + { + "epoch": 2.1, + "grad_norm": 11.262333869934082, + "learning_rate": 6.0318733086098035e-06, + "loss": 1.4755, + "step": 6969 + }, + { + "epoch": 2.1, + "grad_norm": 21.476552963256836, + "learning_rate": 6.029868698005414e-06, + "loss": 0.9482, + "step": 6970 + }, + { + "epoch": 2.1, + "grad_norm": 8.83089542388916, + "learning_rate": 6.027864087401023e-06, + "loss": 0.9499, + "step": 6971 + }, + { + "epoch": 2.1, + "grad_norm": 16.572052001953125, + "learning_rate": 6.025859476796633e-06, + "loss": 1.4945, + "step": 6972 + }, + { + "epoch": 2.1, + "grad_norm": 94.96258544921875, + "learning_rate": 6.023854866192242e-06, + "loss": 1.655, + "step": 6973 + }, + { + "epoch": 2.1, + "grad_norm": 11.988658905029297, + "learning_rate": 6.021850255587852e-06, + "loss": 1.4834, + "step": 6974 + }, + { + "epoch": 2.1, + "grad_norm": 15.83250904083252, + "learning_rate": 6.0198456449834615e-06, + "loss": 1.7121, + "step": 6975 + }, + { + "epoch": 2.1, + "grad_norm": 14.637907981872559, + "learning_rate": 6.0178410343790724e-06, + "loss": 1.849, + "step": 6976 + }, + { + "epoch": 2.1, + "grad_norm": 42.469970703125, + "learning_rate": 6.0158364237746825e-06, + "loss": 1.3548, + "step": 6977 + }, + { + "epoch": 2.1, + "grad_norm": 14.263885498046875, + "learning_rate": 6.013831813170293e-06, + "loss": 1.5441, + "step": 6978 + }, + { + "epoch": 2.1, + "grad_norm": 91.35665130615234, + "learning_rate": 6.011827202565902e-06, + "loss": 2.4573, + "step": 6979 + }, + { + "epoch": 2.1, + "grad_norm": 26.59601402282715, + "learning_rate": 6.009822591961512e-06, + "loss": 1.0937, + "step": 6980 + }, + { + "epoch": 2.1, + "grad_norm": 21.7692928314209, + "learning_rate": 6.007817981357121e-06, + "loss": 1.4581, + "step": 6981 + }, + { + "epoch": 2.1, + "grad_norm": 55.85102081298828, + "learning_rate": 6.005813370752732e-06, + "loss": 1.5007, + "step": 6982 + }, + { + "epoch": 2.1, + "grad_norm": 17.059696197509766, + "learning_rate": 6.003808760148342e-06, + "loss": 1.2327, + "step": 6983 + }, + { + "epoch": 2.1, + "grad_norm": 20.334264755249023, + "learning_rate": 6.001804149543951e-06, + "loss": 1.1726, + "step": 6984 + }, + { + "epoch": 2.1, + "grad_norm": 14.4188871383667, + "learning_rate": 5.9997995389395615e-06, + "loss": 1.4264, + "step": 6985 + }, + { + "epoch": 2.1, + "grad_norm": 10.726988792419434, + "learning_rate": 5.997794928335171e-06, + "loss": 0.912, + "step": 6986 + }, + { + "epoch": 2.1, + "grad_norm": 22.09978675842285, + "learning_rate": 5.995790317730781e-06, + "loss": 1.9006, + "step": 6987 + }, + { + "epoch": 2.1, + "grad_norm": 25.827951431274414, + "learning_rate": 5.993785707126392e-06, + "loss": 2.7776, + "step": 6988 + }, + { + "epoch": 2.1, + "grad_norm": 43.5711784362793, + "learning_rate": 5.991781096522001e-06, + "loss": 1.5516, + "step": 6989 + }, + { + "epoch": 2.1, + "grad_norm": 20.165645599365234, + "learning_rate": 5.989776485917611e-06, + "loss": 1.1189, + "step": 6990 + }, + { + "epoch": 2.1, + "grad_norm": 23.63228988647461, + "learning_rate": 5.987771875313221e-06, + "loss": 1.4044, + "step": 6991 + }, + { + "epoch": 2.1, + "grad_norm": 18.94355583190918, + "learning_rate": 5.98576726470883e-06, + "loss": 1.3975, + "step": 6992 + }, + { + "epoch": 2.1, + "grad_norm": 43.25507736206055, + "learning_rate": 5.9837626541044405e-06, + "loss": 2.5693, + "step": 6993 + }, + { + "epoch": 2.1, + "grad_norm": 24.190704345703125, + "learning_rate": 5.981758043500051e-06, + "loss": 1.0735, + "step": 6994 + }, + { + "epoch": 2.1, + "grad_norm": 16.01865577697754, + "learning_rate": 5.979753432895661e-06, + "loss": 1.2435, + "step": 6995 + }, + { + "epoch": 2.1, + "grad_norm": 16.649044036865234, + "learning_rate": 5.977748822291271e-06, + "loss": 1.3125, + "step": 6996 + }, + { + "epoch": 2.1, + "grad_norm": 17.755905151367188, + "learning_rate": 5.97574421168688e-06, + "loss": 1.6912, + "step": 6997 + }, + { + "epoch": 2.1, + "grad_norm": 18.950183868408203, + "learning_rate": 5.97373960108249e-06, + "loss": 1.9294, + "step": 6998 + }, + { + "epoch": 2.1, + "grad_norm": 9.899605751037598, + "learning_rate": 5.971734990478099e-06, + "loss": 1.6834, + "step": 6999 + }, + { + "epoch": 2.1, + "grad_norm": 16.555978775024414, + "learning_rate": 5.96973037987371e-06, + "loss": 1.773, + "step": 7000 + }, + { + "epoch": 2.1, + "grad_norm": 9.918785095214844, + "learning_rate": 5.96772576926932e-06, + "loss": 0.8201, + "step": 7001 + }, + { + "epoch": 2.11, + "grad_norm": 7.417164325714111, + "learning_rate": 5.9657211586649295e-06, + "loss": 0.8608, + "step": 7002 + }, + { + "epoch": 2.11, + "grad_norm": 31.227968215942383, + "learning_rate": 5.96371654806054e-06, + "loss": 1.3737, + "step": 7003 + }, + { + "epoch": 2.11, + "grad_norm": 16.146133422851562, + "learning_rate": 5.96171193745615e-06, + "loss": 1.6191, + "step": 7004 + }, + { + "epoch": 2.11, + "grad_norm": 11.509698867797852, + "learning_rate": 5.959707326851759e-06, + "loss": 1.118, + "step": 7005 + }, + { + "epoch": 2.11, + "grad_norm": 11.928789138793945, + "learning_rate": 5.95770271624737e-06, + "loss": 1.3745, + "step": 7006 + }, + { + "epoch": 2.11, + "grad_norm": 12.056342124938965, + "learning_rate": 5.95569810564298e-06, + "loss": 2.4071, + "step": 7007 + }, + { + "epoch": 2.11, + "grad_norm": 15.146832466125488, + "learning_rate": 5.953693495038589e-06, + "loss": 0.6288, + "step": 7008 + }, + { + "epoch": 2.11, + "grad_norm": 15.896178245544434, + "learning_rate": 5.951688884434199e-06, + "loss": 1.3848, + "step": 7009 + }, + { + "epoch": 2.11, + "grad_norm": 20.91379737854004, + "learning_rate": 5.9496842738298085e-06, + "loss": 1.9552, + "step": 7010 + }, + { + "epoch": 2.11, + "grad_norm": 15.576189041137695, + "learning_rate": 5.947679663225419e-06, + "loss": 1.7155, + "step": 7011 + }, + { + "epoch": 2.11, + "grad_norm": 12.269247055053711, + "learning_rate": 5.9456750526210295e-06, + "loss": 1.1991, + "step": 7012 + }, + { + "epoch": 2.11, + "grad_norm": 11.45450210571289, + "learning_rate": 5.943670442016639e-06, + "loss": 0.6404, + "step": 7013 + }, + { + "epoch": 2.11, + "grad_norm": 14.839483261108398, + "learning_rate": 5.941665831412249e-06, + "loss": 1.1984, + "step": 7014 + }, + { + "epoch": 2.11, + "grad_norm": 11.097734451293945, + "learning_rate": 5.939661220807859e-06, + "loss": 1.0033, + "step": 7015 + }, + { + "epoch": 2.11, + "grad_norm": 23.61093521118164, + "learning_rate": 5.937656610203468e-06, + "loss": 2.0833, + "step": 7016 + }, + { + "epoch": 2.11, + "grad_norm": 16.99240493774414, + "learning_rate": 5.935651999599078e-06, + "loss": 1.4891, + "step": 7017 + }, + { + "epoch": 2.11, + "grad_norm": 11.620327949523926, + "learning_rate": 5.9336473889946875e-06, + "loss": 0.9176, + "step": 7018 + }, + { + "epoch": 2.11, + "grad_norm": 17.769229888916016, + "learning_rate": 5.9316427783902984e-06, + "loss": 2.0986, + "step": 7019 + }, + { + "epoch": 2.11, + "grad_norm": 9.187108039855957, + "learning_rate": 5.9296381677859085e-06, + "loss": 1.0068, + "step": 7020 + }, + { + "epoch": 2.11, + "grad_norm": 47.74100112915039, + "learning_rate": 5.927633557181518e-06, + "loss": 1.3823, + "step": 7021 + }, + { + "epoch": 2.11, + "grad_norm": 20.690021514892578, + "learning_rate": 5.925628946577128e-06, + "loss": 1.5737, + "step": 7022 + }, + { + "epoch": 2.11, + "grad_norm": 19.90296745300293, + "learning_rate": 5.923624335972737e-06, + "loss": 1.6142, + "step": 7023 + }, + { + "epoch": 2.11, + "grad_norm": 19.526081085205078, + "learning_rate": 5.921619725368347e-06, + "loss": 0.9043, + "step": 7024 + }, + { + "epoch": 2.11, + "grad_norm": 47.775001525878906, + "learning_rate": 5.919615114763958e-06, + "loss": 3.4549, + "step": 7025 + }, + { + "epoch": 2.11, + "grad_norm": 12.722251892089844, + "learning_rate": 5.917610504159567e-06, + "loss": 1.7378, + "step": 7026 + }, + { + "epoch": 2.11, + "grad_norm": 10.309690475463867, + "learning_rate": 5.915605893555177e-06, + "loss": 0.9647, + "step": 7027 + }, + { + "epoch": 2.11, + "grad_norm": 10.273581504821777, + "learning_rate": 5.9136012829507875e-06, + "loss": 1.6892, + "step": 7028 + }, + { + "epoch": 2.11, + "grad_norm": 16.655803680419922, + "learning_rate": 5.911596672346397e-06, + "loss": 1.6853, + "step": 7029 + }, + { + "epoch": 2.11, + "grad_norm": 23.953218460083008, + "learning_rate": 5.909592061742007e-06, + "loss": 0.5064, + "step": 7030 + }, + { + "epoch": 2.11, + "grad_norm": 21.488235473632812, + "learning_rate": 5.907587451137618e-06, + "loss": 1.7174, + "step": 7031 + }, + { + "epoch": 2.11, + "grad_norm": 54.67894744873047, + "learning_rate": 5.905582840533227e-06, + "loss": 1.7804, + "step": 7032 + }, + { + "epoch": 2.11, + "grad_norm": 10.905550956726074, + "learning_rate": 5.903578229928837e-06, + "loss": 0.7826, + "step": 7033 + }, + { + "epoch": 2.11, + "grad_norm": 6.627830982208252, + "learning_rate": 5.901573619324446e-06, + "loss": 0.9134, + "step": 7034 + }, + { + "epoch": 2.12, + "grad_norm": 12.987330436706543, + "learning_rate": 5.899569008720056e-06, + "loss": 1.4485, + "step": 7035 + }, + { + "epoch": 2.12, + "grad_norm": 14.574166297912598, + "learning_rate": 5.897564398115666e-06, + "loss": 1.1571, + "step": 7036 + }, + { + "epoch": 2.12, + "grad_norm": 12.813142776489258, + "learning_rate": 5.8955597875112766e-06, + "loss": 2.2213, + "step": 7037 + }, + { + "epoch": 2.12, + "grad_norm": 96.96940612792969, + "learning_rate": 5.893555176906887e-06, + "loss": 2.6402, + "step": 7038 + }, + { + "epoch": 2.12, + "grad_norm": 22.448095321655273, + "learning_rate": 5.891550566302497e-06, + "loss": 1.2935, + "step": 7039 + }, + { + "epoch": 2.12, + "grad_norm": 8.147926330566406, + "learning_rate": 5.889545955698106e-06, + "loss": 0.8031, + "step": 7040 + }, + { + "epoch": 2.12, + "grad_norm": 16.05350685119629, + "learning_rate": 5.887541345093716e-06, + "loss": 1.6077, + "step": 7041 + }, + { + "epoch": 2.12, + "grad_norm": 6.752038955688477, + "learning_rate": 5.885536734489325e-06, + "loss": 0.6485, + "step": 7042 + }, + { + "epoch": 2.12, + "grad_norm": 13.922531127929688, + "learning_rate": 5.883532123884936e-06, + "loss": 1.3103, + "step": 7043 + }, + { + "epoch": 2.12, + "grad_norm": 26.871532440185547, + "learning_rate": 5.881527513280546e-06, + "loss": 1.4686, + "step": 7044 + }, + { + "epoch": 2.12, + "grad_norm": 10.8401460647583, + "learning_rate": 5.8795229026761555e-06, + "loss": 1.5021, + "step": 7045 + }, + { + "epoch": 2.12, + "grad_norm": 22.18703842163086, + "learning_rate": 5.877518292071766e-06, + "loss": 1.2032, + "step": 7046 + }, + { + "epoch": 2.12, + "grad_norm": 14.264338493347168, + "learning_rate": 5.875513681467375e-06, + "loss": 1.3754, + "step": 7047 + }, + { + "epoch": 2.12, + "grad_norm": 14.007697105407715, + "learning_rate": 5.873509070862985e-06, + "loss": 1.549, + "step": 7048 + }, + { + "epoch": 2.12, + "grad_norm": 45.7745361328125, + "learning_rate": 5.871504460258596e-06, + "loss": 1.5497, + "step": 7049 + }, + { + "epoch": 2.12, + "grad_norm": 13.82027530670166, + "learning_rate": 5.869499849654205e-06, + "loss": 0.9325, + "step": 7050 + }, + { + "epoch": 2.12, + "grad_norm": 28.13387107849121, + "learning_rate": 5.867495239049815e-06, + "loss": 1.5999, + "step": 7051 + }, + { + "epoch": 2.12, + "grad_norm": 6.187619209289551, + "learning_rate": 5.865490628445425e-06, + "loss": 0.843, + "step": 7052 + }, + { + "epoch": 2.12, + "grad_norm": 16.584333419799805, + "learning_rate": 5.8634860178410345e-06, + "loss": 1.3149, + "step": 7053 + }, + { + "epoch": 2.12, + "grad_norm": 36.140869140625, + "learning_rate": 5.861481407236645e-06, + "loss": 1.3137, + "step": 7054 + }, + { + "epoch": 2.12, + "grad_norm": 25.52848243713379, + "learning_rate": 5.8594767966322555e-06, + "loss": 2.0762, + "step": 7055 + }, + { + "epoch": 2.12, + "grad_norm": 36.224937438964844, + "learning_rate": 5.857472186027865e-06, + "loss": 1.4368, + "step": 7056 + }, + { + "epoch": 2.12, + "grad_norm": 37.73203659057617, + "learning_rate": 5.855467575423475e-06, + "loss": 2.0182, + "step": 7057 + }, + { + "epoch": 2.12, + "grad_norm": 10.022035598754883, + "learning_rate": 5.853462964819084e-06, + "loss": 0.9173, + "step": 7058 + }, + { + "epoch": 2.12, + "grad_norm": 31.495861053466797, + "learning_rate": 5.851458354214694e-06, + "loss": 1.8068, + "step": 7059 + }, + { + "epoch": 2.12, + "grad_norm": 13.925558090209961, + "learning_rate": 5.849453743610303e-06, + "loss": 0.8992, + "step": 7060 + }, + { + "epoch": 2.12, + "grad_norm": 45.48153305053711, + "learning_rate": 5.8474491330059135e-06, + "loss": 2.8715, + "step": 7061 + }, + { + "epoch": 2.12, + "grad_norm": 19.373300552368164, + "learning_rate": 5.8454445224015244e-06, + "loss": 1.1757, + "step": 7062 + }, + { + "epoch": 2.12, + "grad_norm": 20.65883445739746, + "learning_rate": 5.8434399117971345e-06, + "loss": 2.2168, + "step": 7063 + }, + { + "epoch": 2.12, + "grad_norm": 18.880720138549805, + "learning_rate": 5.841435301192744e-06, + "loss": 1.2185, + "step": 7064 + }, + { + "epoch": 2.12, + "grad_norm": 14.757368087768555, + "learning_rate": 5.839430690588354e-06, + "loss": 1.3524, + "step": 7065 + }, + { + "epoch": 2.12, + "grad_norm": 14.642279624938965, + "learning_rate": 5.837426079983963e-06, + "loss": 2.0154, + "step": 7066 + }, + { + "epoch": 2.12, + "grad_norm": 7.192394733428955, + "learning_rate": 5.835421469379573e-06, + "loss": 0.3944, + "step": 7067 + }, + { + "epoch": 2.13, + "grad_norm": 33.72096252441406, + "learning_rate": 5.833416858775184e-06, + "loss": 1.6128, + "step": 7068 + }, + { + "epoch": 2.13, + "grad_norm": 18.03380584716797, + "learning_rate": 5.831412248170793e-06, + "loss": 1.4521, + "step": 7069 + }, + { + "epoch": 2.13, + "grad_norm": 10.494623184204102, + "learning_rate": 5.829407637566403e-06, + "loss": 1.3512, + "step": 7070 + }, + { + "epoch": 2.13, + "grad_norm": 12.263495445251465, + "learning_rate": 5.827403026962013e-06, + "loss": 1.0091, + "step": 7071 + }, + { + "epoch": 2.13, + "grad_norm": 10.31989860534668, + "learning_rate": 5.825398416357623e-06, + "loss": 1.4169, + "step": 7072 + }, + { + "epoch": 2.13, + "grad_norm": 24.252532958984375, + "learning_rate": 5.823393805753233e-06, + "loss": 1.5791, + "step": 7073 + }, + { + "epoch": 2.13, + "grad_norm": 9.647172927856445, + "learning_rate": 5.821389195148843e-06, + "loss": 0.7097, + "step": 7074 + }, + { + "epoch": 2.13, + "grad_norm": 39.26737976074219, + "learning_rate": 5.819384584544453e-06, + "loss": 1.2926, + "step": 7075 + }, + { + "epoch": 2.13, + "grad_norm": 18.434120178222656, + "learning_rate": 5.817379973940063e-06, + "loss": 1.684, + "step": 7076 + }, + { + "epoch": 2.13, + "grad_norm": 23.794525146484375, + "learning_rate": 5.815375363335672e-06, + "loss": 1.4986, + "step": 7077 + }, + { + "epoch": 2.13, + "grad_norm": 23.152494430541992, + "learning_rate": 5.813370752731282e-06, + "loss": 1.7962, + "step": 7078 + }, + { + "epoch": 2.13, + "grad_norm": 10.018393516540527, + "learning_rate": 5.811366142126892e-06, + "loss": 0.6564, + "step": 7079 + }, + { + "epoch": 2.13, + "grad_norm": 14.856311798095703, + "learning_rate": 5.8093615315225026e-06, + "loss": 1.3591, + "step": 7080 + }, + { + "epoch": 2.13, + "eval_loss": 0.1809202879667282, + "eval_runtime": 43.5269, + "eval_samples_per_second": 33.979, + "eval_steps_per_second": 33.979, + "step": 7080 + }, + { + "epoch": 2.13, + "grad_norm": 14.052577018737793, + "learning_rate": 5.807356920918113e-06, + "loss": 0.7851, + "step": 7081 + }, + { + "epoch": 2.13, + "grad_norm": 49.35251998901367, + "learning_rate": 5.805352310313722e-06, + "loss": 2.7815, + "step": 7082 + }, + { + "epoch": 2.13, + "grad_norm": 48.668617248535156, + "learning_rate": 5.803347699709332e-06, + "loss": 1.4805, + "step": 7083 + }, + { + "epoch": 2.13, + "grad_norm": 9.275986671447754, + "learning_rate": 5.801343089104941e-06, + "loss": 0.7976, + "step": 7084 + }, + { + "epoch": 2.13, + "grad_norm": 50.608848571777344, + "learning_rate": 5.799338478500551e-06, + "loss": 1.9386, + "step": 7085 + }, + { + "epoch": 2.13, + "grad_norm": 18.53827476501465, + "learning_rate": 5.797333867896162e-06, + "loss": 1.2232, + "step": 7086 + }, + { + "epoch": 2.13, + "grad_norm": 20.711294174194336, + "learning_rate": 5.7953292572917715e-06, + "loss": 2.057, + "step": 7087 + }, + { + "epoch": 2.13, + "grad_norm": 12.552864074707031, + "learning_rate": 5.7933246466873815e-06, + "loss": 0.9687, + "step": 7088 + }, + { + "epoch": 2.13, + "grad_norm": 20.43316078186035, + "learning_rate": 5.791320036082992e-06, + "loss": 1.6212, + "step": 7089 + }, + { + "epoch": 2.13, + "grad_norm": 8.86861801147461, + "learning_rate": 5.789315425478601e-06, + "loss": 1.2358, + "step": 7090 + }, + { + "epoch": 2.13, + "grad_norm": 17.865428924560547, + "learning_rate": 5.787310814874211e-06, + "loss": 1.8776, + "step": 7091 + }, + { + "epoch": 2.13, + "grad_norm": 10.841514587402344, + "learning_rate": 5.785306204269822e-06, + "loss": 1.0038, + "step": 7092 + }, + { + "epoch": 2.13, + "grad_norm": 10.26215934753418, + "learning_rate": 5.783301593665431e-06, + "loss": 0.7371, + "step": 7093 + }, + { + "epoch": 2.13, + "grad_norm": 19.391332626342773, + "learning_rate": 5.781296983061041e-06, + "loss": 1.335, + "step": 7094 + }, + { + "epoch": 2.13, + "grad_norm": 84.89678955078125, + "learning_rate": 5.7792923724566504e-06, + "loss": 3.5264, + "step": 7095 + }, + { + "epoch": 2.13, + "grad_norm": 18.809005737304688, + "learning_rate": 5.7772877618522605e-06, + "loss": 1.2989, + "step": 7096 + }, + { + "epoch": 2.13, + "grad_norm": 17.723434448242188, + "learning_rate": 5.775283151247871e-06, + "loss": 0.9432, + "step": 7097 + }, + { + "epoch": 2.13, + "grad_norm": 11.459822654724121, + "learning_rate": 5.77327854064348e-06, + "loss": 0.796, + "step": 7098 + }, + { + "epoch": 2.13, + "grad_norm": 10.62411880493164, + "learning_rate": 5.771273930039091e-06, + "loss": 2.0095, + "step": 7099 + }, + { + "epoch": 2.13, + "grad_norm": 17.644634246826172, + "learning_rate": 5.769269319434701e-06, + "loss": 1.3346, + "step": 7100 + }, + { + "epoch": 2.13, + "grad_norm": 16.67945671081543, + "learning_rate": 5.76726470883031e-06, + "loss": 1.6617, + "step": 7101 + }, + { + "epoch": 2.14, + "grad_norm": 12.336973190307617, + "learning_rate": 5.76526009822592e-06, + "loss": 1.2088, + "step": 7102 + }, + { + "epoch": 2.14, + "grad_norm": 43.285831451416016, + "learning_rate": 5.763255487621529e-06, + "loss": 1.8144, + "step": 7103 + }, + { + "epoch": 2.14, + "grad_norm": 95.72914123535156, + "learning_rate": 5.7612508770171395e-06, + "loss": 1.4038, + "step": 7104 + }, + { + "epoch": 2.14, + "grad_norm": 37.060298919677734, + "learning_rate": 5.7592462664127504e-06, + "loss": 1.6265, + "step": 7105 + }, + { + "epoch": 2.14, + "grad_norm": 45.99290466308594, + "learning_rate": 5.75724165580836e-06, + "loss": 1.7691, + "step": 7106 + }, + { + "epoch": 2.14, + "grad_norm": 15.260518074035645, + "learning_rate": 5.75523704520397e-06, + "loss": 1.7976, + "step": 7107 + }, + { + "epoch": 2.14, + "grad_norm": 18.973817825317383, + "learning_rate": 5.753232434599579e-06, + "loss": 1.355, + "step": 7108 + }, + { + "epoch": 2.14, + "grad_norm": 16.729829788208008, + "learning_rate": 5.751227823995189e-06, + "loss": 1.2276, + "step": 7109 + }, + { + "epoch": 2.14, + "grad_norm": 15.583281517028809, + "learning_rate": 5.749223213390799e-06, + "loss": 1.3231, + "step": 7110 + }, + { + "epoch": 2.14, + "grad_norm": 9.61965274810791, + "learning_rate": 5.747218602786409e-06, + "loss": 0.6578, + "step": 7111 + }, + { + "epoch": 2.14, + "grad_norm": 14.255253791809082, + "learning_rate": 5.745213992182019e-06, + "loss": 1.0521, + "step": 7112 + }, + { + "epoch": 2.14, + "grad_norm": 28.25699234008789, + "learning_rate": 5.743209381577629e-06, + "loss": 1.7292, + "step": 7113 + }, + { + "epoch": 2.14, + "grad_norm": 24.005878448486328, + "learning_rate": 5.741204770973239e-06, + "loss": 1.3648, + "step": 7114 + }, + { + "epoch": 2.14, + "grad_norm": 55.37303924560547, + "learning_rate": 5.739200160368849e-06, + "loss": 1.104, + "step": 7115 + }, + { + "epoch": 2.14, + "grad_norm": 18.499359130859375, + "learning_rate": 5.737195549764458e-06, + "loss": 1.4972, + "step": 7116 + }, + { + "epoch": 2.14, + "grad_norm": 14.160520553588867, + "learning_rate": 5.735190939160069e-06, + "loss": 1.2106, + "step": 7117 + }, + { + "epoch": 2.14, + "grad_norm": 12.180475234985352, + "learning_rate": 5.733186328555679e-06, + "loss": 1.4281, + "step": 7118 + }, + { + "epoch": 2.14, + "grad_norm": 10.075895309448242, + "learning_rate": 5.731181717951288e-06, + "loss": 0.811, + "step": 7119 + }, + { + "epoch": 2.14, + "grad_norm": 62.43848419189453, + "learning_rate": 5.729177107346898e-06, + "loss": 2.8725, + "step": 7120 + }, + { + "epoch": 2.14, + "grad_norm": 23.31486701965332, + "learning_rate": 5.7271724967425076e-06, + "loss": 2.8676, + "step": 7121 + }, + { + "epoch": 2.14, + "grad_norm": 13.410110473632812, + "learning_rate": 5.725167886138118e-06, + "loss": 1.0656, + "step": 7122 + }, + { + "epoch": 2.14, + "grad_norm": 25.776451110839844, + "learning_rate": 5.7231632755337286e-06, + "loss": 1.9623, + "step": 7123 + }, + { + "epoch": 2.14, + "grad_norm": 19.853778839111328, + "learning_rate": 5.721158664929339e-06, + "loss": 1.2182, + "step": 7124 + }, + { + "epoch": 2.14, + "grad_norm": 30.225332260131836, + "learning_rate": 5.719154054324948e-06, + "loss": 1.4418, + "step": 7125 + }, + { + "epoch": 2.14, + "grad_norm": 9.551996231079102, + "learning_rate": 5.717149443720558e-06, + "loss": 1.1343, + "step": 7126 + }, + { + "epoch": 2.14, + "grad_norm": 33.873294830322266, + "learning_rate": 5.715144833116167e-06, + "loss": 1.9743, + "step": 7127 + }, + { + "epoch": 2.14, + "grad_norm": 11.439074516296387, + "learning_rate": 5.713140222511777e-06, + "loss": 1.1761, + "step": 7128 + }, + { + "epoch": 2.14, + "grad_norm": 11.673768043518066, + "learning_rate": 5.711135611907388e-06, + "loss": 1.4744, + "step": 7129 + }, + { + "epoch": 2.14, + "grad_norm": 22.201738357543945, + "learning_rate": 5.7091310013029975e-06, + "loss": 2.8592, + "step": 7130 + }, + { + "epoch": 2.14, + "grad_norm": 11.81099796295166, + "learning_rate": 5.7071263906986075e-06, + "loss": 1.2218, + "step": 7131 + }, + { + "epoch": 2.14, + "grad_norm": 26.6683406829834, + "learning_rate": 5.705121780094217e-06, + "loss": 1.2941, + "step": 7132 + }, + { + "epoch": 2.14, + "grad_norm": 8.380386352539062, + "learning_rate": 5.703117169489827e-06, + "loss": 0.6369, + "step": 7133 + }, + { + "epoch": 2.14, + "grad_norm": 11.767404556274414, + "learning_rate": 5.701112558885437e-06, + "loss": 1.0829, + "step": 7134 + }, + { + "epoch": 2.15, + "grad_norm": 25.861289978027344, + "learning_rate": 5.699107948281047e-06, + "loss": 1.9822, + "step": 7135 + }, + { + "epoch": 2.15, + "grad_norm": 27.93609619140625, + "learning_rate": 5.697103337676657e-06, + "loss": 1.3616, + "step": 7136 + }, + { + "epoch": 2.15, + "grad_norm": 87.64994812011719, + "learning_rate": 5.695098727072267e-06, + "loss": 2.3206, + "step": 7137 + }, + { + "epoch": 2.15, + "grad_norm": 11.817543029785156, + "learning_rate": 5.6930941164678764e-06, + "loss": 1.2847, + "step": 7138 + }, + { + "epoch": 2.15, + "grad_norm": 21.869726181030273, + "learning_rate": 5.6910895058634865e-06, + "loss": 1.2728, + "step": 7139 + }, + { + "epoch": 2.15, + "grad_norm": 12.761932373046875, + "learning_rate": 5.689084895259096e-06, + "loss": 1.5109, + "step": 7140 + }, + { + "epoch": 2.15, + "grad_norm": 10.641894340515137, + "learning_rate": 5.687080284654706e-06, + "loss": 0.7452, + "step": 7141 + }, + { + "epoch": 2.15, + "grad_norm": 17.009889602661133, + "learning_rate": 5.685075674050317e-06, + "loss": 1.8117, + "step": 7142 + }, + { + "epoch": 2.15, + "grad_norm": 24.577238082885742, + "learning_rate": 5.683071063445926e-06, + "loss": 1.1146, + "step": 7143 + }, + { + "epoch": 2.15, + "grad_norm": 35.608741760253906, + "learning_rate": 5.681066452841536e-06, + "loss": 2.1795, + "step": 7144 + }, + { + "epoch": 2.15, + "grad_norm": 11.501810073852539, + "learning_rate": 5.679061842237145e-06, + "loss": 0.9263, + "step": 7145 + }, + { + "epoch": 2.15, + "grad_norm": 17.368040084838867, + "learning_rate": 5.6770572316327554e-06, + "loss": 1.4102, + "step": 7146 + }, + { + "epoch": 2.15, + "grad_norm": 16.95510482788086, + "learning_rate": 5.6750526210283655e-06, + "loss": 1.601, + "step": 7147 + }, + { + "epoch": 2.15, + "grad_norm": 8.293194770812988, + "learning_rate": 5.673048010423976e-06, + "loss": 0.8742, + "step": 7148 + }, + { + "epoch": 2.15, + "grad_norm": 6.959547519683838, + "learning_rate": 5.671043399819586e-06, + "loss": 0.4787, + "step": 7149 + }, + { + "epoch": 2.15, + "grad_norm": 15.539986610412598, + "learning_rate": 5.669038789215196e-06, + "loss": 0.8455, + "step": 7150 + }, + { + "epoch": 2.15, + "grad_norm": 11.356565475463867, + "learning_rate": 5.667034178610805e-06, + "loss": 1.1054, + "step": 7151 + }, + { + "epoch": 2.15, + "grad_norm": 25.823801040649414, + "learning_rate": 5.665029568006415e-06, + "loss": 1.7274, + "step": 7152 + }, + { + "epoch": 2.15, + "grad_norm": 43.120635986328125, + "learning_rate": 5.663024957402024e-06, + "loss": 1.6869, + "step": 7153 + }, + { + "epoch": 2.15, + "grad_norm": 8.684465408325195, + "learning_rate": 5.661020346797635e-06, + "loss": 1.2637, + "step": 7154 + }, + { + "epoch": 2.15, + "grad_norm": 23.894262313842773, + "learning_rate": 5.659015736193245e-06, + "loss": 1.4771, + "step": 7155 + }, + { + "epoch": 2.15, + "grad_norm": 26.122549057006836, + "learning_rate": 5.6570111255888546e-06, + "loss": 1.3112, + "step": 7156 + }, + { + "epoch": 2.15, + "grad_norm": 20.096174240112305, + "learning_rate": 5.655006514984465e-06, + "loss": 1.0569, + "step": 7157 + }, + { + "epoch": 2.15, + "grad_norm": 36.696006774902344, + "learning_rate": 5.653001904380075e-06, + "loss": 1.7493, + "step": 7158 + }, + { + "epoch": 2.15, + "grad_norm": 28.245203018188477, + "learning_rate": 5.650997293775684e-06, + "loss": 1.5218, + "step": 7159 + }, + { + "epoch": 2.15, + "grad_norm": 27.659273147583008, + "learning_rate": 5.648992683171295e-06, + "loss": 2.6867, + "step": 7160 + }, + { + "epoch": 2.15, + "grad_norm": 47.04728317260742, + "learning_rate": 5.646988072566905e-06, + "loss": 1.1954, + "step": 7161 + }, + { + "epoch": 2.15, + "grad_norm": 17.212331771850586, + "learning_rate": 5.644983461962514e-06, + "loss": 2.1451, + "step": 7162 + }, + { + "epoch": 2.15, + "grad_norm": 12.564343452453613, + "learning_rate": 5.642978851358124e-06, + "loss": 1.1914, + "step": 7163 + }, + { + "epoch": 2.15, + "grad_norm": 38.6114501953125, + "learning_rate": 5.6409742407537336e-06, + "loss": 0.9675, + "step": 7164 + }, + { + "epoch": 2.15, + "grad_norm": 49.109031677246094, + "learning_rate": 5.638969630149344e-06, + "loss": 1.4241, + "step": 7165 + }, + { + "epoch": 2.15, + "grad_norm": 17.366100311279297, + "learning_rate": 5.6369650195449546e-06, + "loss": 1.9756, + "step": 7166 + }, + { + "epoch": 2.15, + "grad_norm": 79.63975524902344, + "learning_rate": 5.634960408940564e-06, + "loss": 1.9999, + "step": 7167 + }, + { + "epoch": 2.16, + "grad_norm": 28.03964614868164, + "learning_rate": 5.632955798336174e-06, + "loss": 1.4169, + "step": 7168 + }, + { + "epoch": 2.16, + "grad_norm": 23.203266143798828, + "learning_rate": 5.630951187731783e-06, + "loss": 1.3344, + "step": 7169 + }, + { + "epoch": 2.16, + "grad_norm": 12.224784851074219, + "learning_rate": 5.628946577127393e-06, + "loss": 1.4286, + "step": 7170 + }, + { + "epoch": 2.16, + "grad_norm": 9.047876358032227, + "learning_rate": 5.626941966523003e-06, + "loss": 1.1898, + "step": 7171 + }, + { + "epoch": 2.16, + "grad_norm": 22.1667423248291, + "learning_rate": 5.624937355918613e-06, + "loss": 1.945, + "step": 7172 + }, + { + "epoch": 2.16, + "grad_norm": 13.911121368408203, + "learning_rate": 5.6229327453142235e-06, + "loss": 1.3742, + "step": 7173 + }, + { + "epoch": 2.16, + "grad_norm": 9.241662979125977, + "learning_rate": 5.6209281347098335e-06, + "loss": 1.0261, + "step": 7174 + }, + { + "epoch": 2.16, + "grad_norm": 13.640268325805664, + "learning_rate": 5.618923524105443e-06, + "loss": 1.1987, + "step": 7175 + }, + { + "epoch": 2.16, + "grad_norm": 21.423112869262695, + "learning_rate": 5.616918913501053e-06, + "loss": 1.722, + "step": 7176 + }, + { + "epoch": 2.16, + "grad_norm": 33.87396240234375, + "learning_rate": 5.614914302896662e-06, + "loss": 0.9787, + "step": 7177 + }, + { + "epoch": 2.16, + "grad_norm": 29.940101623535156, + "learning_rate": 5.612909692292273e-06, + "loss": 1.792, + "step": 7178 + }, + { + "epoch": 2.16, + "grad_norm": 22.763090133666992, + "learning_rate": 5.610905081687883e-06, + "loss": 1.3948, + "step": 7179 + }, + { + "epoch": 2.16, + "grad_norm": 33.94498062133789, + "learning_rate": 5.608900471083492e-06, + "loss": 2.2237, + "step": 7180 + }, + { + "epoch": 2.16, + "grad_norm": 15.50265884399414, + "learning_rate": 5.6068958604791024e-06, + "loss": 1.5329, + "step": 7181 + }, + { + "epoch": 2.16, + "grad_norm": 20.567745208740234, + "learning_rate": 5.6048912498747125e-06, + "loss": 1.7991, + "step": 7182 + }, + { + "epoch": 2.16, + "grad_norm": 38.924346923828125, + "learning_rate": 5.602886639270322e-06, + "loss": 1.8058, + "step": 7183 + }, + { + "epoch": 2.16, + "grad_norm": 23.95490264892578, + "learning_rate": 5.600882028665932e-06, + "loss": 0.9036, + "step": 7184 + }, + { + "epoch": 2.16, + "grad_norm": 8.675360679626465, + "learning_rate": 5.598877418061543e-06, + "loss": 1.1445, + "step": 7185 + }, + { + "epoch": 2.16, + "grad_norm": 52.8515510559082, + "learning_rate": 5.596872807457152e-06, + "loss": 0.9069, + "step": 7186 + }, + { + "epoch": 2.16, + "grad_norm": 14.209603309631348, + "learning_rate": 5.594868196852762e-06, + "loss": 1.1668, + "step": 7187 + }, + { + "epoch": 2.16, + "grad_norm": 19.132238388061523, + "learning_rate": 5.592863586248371e-06, + "loss": 1.6934, + "step": 7188 + }, + { + "epoch": 2.16, + "grad_norm": 16.72909927368164, + "learning_rate": 5.5908589756439814e-06, + "loss": 1.127, + "step": 7189 + }, + { + "epoch": 2.16, + "grad_norm": 11.741925239562988, + "learning_rate": 5.588854365039591e-06, + "loss": 1.9914, + "step": 7190 + }, + { + "epoch": 2.16, + "grad_norm": 34.14512634277344, + "learning_rate": 5.586849754435202e-06, + "loss": 2.0267, + "step": 7191 + }, + { + "epoch": 2.16, + "grad_norm": 15.906356811523438, + "learning_rate": 5.584845143830812e-06, + "loss": 1.2969, + "step": 7192 + }, + { + "epoch": 2.16, + "grad_norm": 14.884824752807617, + "learning_rate": 5.582840533226421e-06, + "loss": 1.6604, + "step": 7193 + }, + { + "epoch": 2.16, + "grad_norm": 14.613624572753906, + "learning_rate": 5.580835922622031e-06, + "loss": 1.3751, + "step": 7194 + }, + { + "epoch": 2.16, + "grad_norm": 14.830942153930664, + "learning_rate": 5.578831312017641e-06, + "loss": 1.5889, + "step": 7195 + }, + { + "epoch": 2.16, + "grad_norm": 17.539030075073242, + "learning_rate": 5.57682670141325e-06, + "loss": 1.0911, + "step": 7196 + }, + { + "epoch": 2.16, + "grad_norm": 28.30144500732422, + "learning_rate": 5.574822090808861e-06, + "loss": 1.2038, + "step": 7197 + }, + { + "epoch": 2.16, + "grad_norm": 11.865046501159668, + "learning_rate": 5.572817480204471e-06, + "loss": 1.2387, + "step": 7198 + }, + { + "epoch": 2.16, + "grad_norm": 29.797569274902344, + "learning_rate": 5.5708128696000806e-06, + "loss": 2.5909, + "step": 7199 + }, + { + "epoch": 2.16, + "grad_norm": 14.200777053833008, + "learning_rate": 5.568808258995691e-06, + "loss": 1.5443, + "step": 7200 + }, + { + "epoch": 2.16, + "eval_loss": 0.1803688108921051, + "eval_runtime": 43.5403, + "eval_samples_per_second": 33.969, + "eval_steps_per_second": 33.969, + "step": 7200 + }, + { + "epoch": 2.17, + "grad_norm": 8.243300437927246, + "learning_rate": 5.5668036483913e-06, + "loss": 1.2393, + "step": 7201 + }, + { + "epoch": 2.17, + "grad_norm": 29.610637664794922, + "learning_rate": 5.56479903778691e-06, + "loss": 0.9743, + "step": 7202 + }, + { + "epoch": 2.17, + "grad_norm": 30.562345504760742, + "learning_rate": 5.562794427182521e-06, + "loss": 1.5775, + "step": 7203 + }, + { + "epoch": 2.17, + "grad_norm": 13.141053199768066, + "learning_rate": 5.56078981657813e-06, + "loss": 1.026, + "step": 7204 + }, + { + "epoch": 2.17, + "grad_norm": 15.844094276428223, + "learning_rate": 5.55878520597374e-06, + "loss": 1.8327, + "step": 7205 + }, + { + "epoch": 2.17, + "grad_norm": 7.878782272338867, + "learning_rate": 5.5567805953693495e-06, + "loss": 1.0848, + "step": 7206 + }, + { + "epoch": 2.17, + "grad_norm": 12.858576774597168, + "learning_rate": 5.5547759847649596e-06, + "loss": 1.5242, + "step": 7207 + }, + { + "epoch": 2.17, + "grad_norm": 11.919968605041504, + "learning_rate": 5.55277137416057e-06, + "loss": 1.6723, + "step": 7208 + }, + { + "epoch": 2.17, + "grad_norm": 11.811442375183105, + "learning_rate": 5.5507667635561806e-06, + "loss": 1.1354, + "step": 7209 + }, + { + "epoch": 2.17, + "grad_norm": 32.790809631347656, + "learning_rate": 5.54876215295179e-06, + "loss": 1.136, + "step": 7210 + }, + { + "epoch": 2.17, + "grad_norm": 29.143482208251953, + "learning_rate": 5.5467575423474e-06, + "loss": 1.3966, + "step": 7211 + }, + { + "epoch": 2.17, + "grad_norm": 24.455890655517578, + "learning_rate": 5.544752931743009e-06, + "loss": 2.0205, + "step": 7212 + }, + { + "epoch": 2.17, + "grad_norm": 23.328998565673828, + "learning_rate": 5.542748321138619e-06, + "loss": 1.6471, + "step": 7213 + }, + { + "epoch": 2.17, + "grad_norm": 26.35028076171875, + "learning_rate": 5.5407437105342285e-06, + "loss": 1.5245, + "step": 7214 + }, + { + "epoch": 2.17, + "grad_norm": 23.13437843322754, + "learning_rate": 5.538739099929839e-06, + "loss": 1.7795, + "step": 7215 + }, + { + "epoch": 2.17, + "grad_norm": 16.176660537719727, + "learning_rate": 5.5367344893254495e-06, + "loss": 1.5189, + "step": 7216 + }, + { + "epoch": 2.17, + "grad_norm": 16.43119239807129, + "learning_rate": 5.534729878721059e-06, + "loss": 1.0649, + "step": 7217 + }, + { + "epoch": 2.17, + "grad_norm": 16.297130584716797, + "learning_rate": 5.532725268116669e-06, + "loss": 1.0049, + "step": 7218 + }, + { + "epoch": 2.17, + "grad_norm": 11.09915542602539, + "learning_rate": 5.530720657512279e-06, + "loss": 1.3301, + "step": 7219 + }, + { + "epoch": 2.17, + "grad_norm": 10.667206764221191, + "learning_rate": 5.528716046907888e-06, + "loss": 1.0879, + "step": 7220 + }, + { + "epoch": 2.17, + "grad_norm": 22.059289932250977, + "learning_rate": 5.526711436303499e-06, + "loss": 1.7633, + "step": 7221 + }, + { + "epoch": 2.17, + "grad_norm": 45.51456069946289, + "learning_rate": 5.524706825699109e-06, + "loss": 1.451, + "step": 7222 + }, + { + "epoch": 2.17, + "grad_norm": 13.462637901306152, + "learning_rate": 5.522702215094718e-06, + "loss": 1.4225, + "step": 7223 + }, + { + "epoch": 2.17, + "grad_norm": 11.555397033691406, + "learning_rate": 5.5206976044903284e-06, + "loss": 1.2796, + "step": 7224 + }, + { + "epoch": 2.17, + "grad_norm": 25.498672485351562, + "learning_rate": 5.518692993885938e-06, + "loss": 1.1889, + "step": 7225 + }, + { + "epoch": 2.17, + "grad_norm": 18.293188095092773, + "learning_rate": 5.516688383281548e-06, + "loss": 2.2397, + "step": 7226 + }, + { + "epoch": 2.17, + "grad_norm": 17.86231231689453, + "learning_rate": 5.514683772677157e-06, + "loss": 1.5173, + "step": 7227 + }, + { + "epoch": 2.17, + "grad_norm": 18.436603546142578, + "learning_rate": 5.512679162072768e-06, + "loss": 1.2884, + "step": 7228 + }, + { + "epoch": 2.17, + "grad_norm": 10.068883895874023, + "learning_rate": 5.510674551468378e-06, + "loss": 1.314, + "step": 7229 + }, + { + "epoch": 2.17, + "grad_norm": 9.923480987548828, + "learning_rate": 5.508669940863987e-06, + "loss": 0.8318, + "step": 7230 + }, + { + "epoch": 2.17, + "grad_norm": 30.2728214263916, + "learning_rate": 5.506665330259597e-06, + "loss": 1.7006, + "step": 7231 + }, + { + "epoch": 2.17, + "grad_norm": 24.514514923095703, + "learning_rate": 5.5046607196552074e-06, + "loss": 1.2372, + "step": 7232 + }, + { + "epoch": 2.17, + "grad_norm": 8.947182655334473, + "learning_rate": 5.502656109050817e-06, + "loss": 0.6531, + "step": 7233 + }, + { + "epoch": 2.17, + "grad_norm": 32.466487884521484, + "learning_rate": 5.500651498446428e-06, + "loss": 0.9527, + "step": 7234 + }, + { + "epoch": 2.18, + "grad_norm": 19.44408416748047, + "learning_rate": 5.498646887842038e-06, + "loss": 1.8595, + "step": 7235 + }, + { + "epoch": 2.18, + "grad_norm": 15.075602531433105, + "learning_rate": 5.496642277237647e-06, + "loss": 1.2715, + "step": 7236 + }, + { + "epoch": 2.18, + "grad_norm": 40.67919921875, + "learning_rate": 5.494637666633257e-06, + "loss": 1.1642, + "step": 7237 + }, + { + "epoch": 2.18, + "grad_norm": 11.373068809509277, + "learning_rate": 5.492633056028866e-06, + "loss": 0.8346, + "step": 7238 + }, + { + "epoch": 2.18, + "grad_norm": 19.530393600463867, + "learning_rate": 5.490628445424476e-06, + "loss": 1.1305, + "step": 7239 + }, + { + "epoch": 2.18, + "grad_norm": 93.64016723632812, + "learning_rate": 5.488623834820087e-06, + "loss": 1.6901, + "step": 7240 + }, + { + "epoch": 2.18, + "grad_norm": 26.258949279785156, + "learning_rate": 5.4866192242156965e-06, + "loss": 2.7283, + "step": 7241 + }, + { + "epoch": 2.18, + "grad_norm": 23.18985366821289, + "learning_rate": 5.4846146136113066e-06, + "loss": 1.3425, + "step": 7242 + }, + { + "epoch": 2.18, + "grad_norm": 18.908994674682617, + "learning_rate": 5.482610003006917e-06, + "loss": 1.075, + "step": 7243 + }, + { + "epoch": 2.18, + "grad_norm": 16.2775936126709, + "learning_rate": 5.480605392402526e-06, + "loss": 1.6765, + "step": 7244 + }, + { + "epoch": 2.18, + "grad_norm": 22.71320915222168, + "learning_rate": 5.478600781798136e-06, + "loss": 0.7902, + "step": 7245 + }, + { + "epoch": 2.18, + "grad_norm": 27.827980041503906, + "learning_rate": 5.476596171193747e-06, + "loss": 1.4328, + "step": 7246 + }, + { + "epoch": 2.18, + "grad_norm": 11.23440170288086, + "learning_rate": 5.474591560589356e-06, + "loss": 1.3235, + "step": 7247 + }, + { + "epoch": 2.18, + "grad_norm": 32.618648529052734, + "learning_rate": 5.472586949984966e-06, + "loss": 1.3066, + "step": 7248 + }, + { + "epoch": 2.18, + "grad_norm": 41.39394760131836, + "learning_rate": 5.4705823393805755e-06, + "loss": 1.2173, + "step": 7249 + }, + { + "epoch": 2.18, + "grad_norm": 17.75341796875, + "learning_rate": 5.4685777287761856e-06, + "loss": 0.9759, + "step": 7250 + }, + { + "epoch": 2.18, + "grad_norm": 24.715322494506836, + "learning_rate": 5.466573118171795e-06, + "loss": 2.2409, + "step": 7251 + }, + { + "epoch": 2.18, + "grad_norm": 16.27878761291504, + "learning_rate": 5.464568507567406e-06, + "loss": 1.0532, + "step": 7252 + }, + { + "epoch": 2.18, + "grad_norm": 31.52488899230957, + "learning_rate": 5.462563896963016e-06, + "loss": 2.1667, + "step": 7253 + }, + { + "epoch": 2.18, + "grad_norm": 24.314443588256836, + "learning_rate": 5.460559286358625e-06, + "loss": 1.6379, + "step": 7254 + }, + { + "epoch": 2.18, + "grad_norm": 9.153544425964355, + "learning_rate": 5.458554675754235e-06, + "loss": 0.9792, + "step": 7255 + }, + { + "epoch": 2.18, + "grad_norm": 15.48164176940918, + "learning_rate": 5.456550065149845e-06, + "loss": 1.3478, + "step": 7256 + }, + { + "epoch": 2.18, + "grad_norm": 13.745574951171875, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.0092, + "step": 7257 + }, + { + "epoch": 2.18, + "grad_norm": 42.99429702758789, + "learning_rate": 5.452540843941065e-06, + "loss": 2.0053, + "step": 7258 + }, + { + "epoch": 2.18, + "grad_norm": 15.831052780151367, + "learning_rate": 5.4505362333366755e-06, + "loss": 1.2616, + "step": 7259 + }, + { + "epoch": 2.18, + "grad_norm": 18.406320571899414, + "learning_rate": 5.448531622732285e-06, + "loss": 1.7931, + "step": 7260 + }, + { + "epoch": 2.18, + "grad_norm": 23.25520133972168, + "learning_rate": 5.446527012127895e-06, + "loss": 1.7849, + "step": 7261 + }, + { + "epoch": 2.18, + "grad_norm": 11.698332786560059, + "learning_rate": 5.444522401523504e-06, + "loss": 1.0173, + "step": 7262 + }, + { + "epoch": 2.18, + "grad_norm": 18.50562286376953, + "learning_rate": 5.442517790919114e-06, + "loss": 1.0716, + "step": 7263 + }, + { + "epoch": 2.18, + "grad_norm": 18.679840087890625, + "learning_rate": 5.440513180314725e-06, + "loss": 1.331, + "step": 7264 + }, + { + "epoch": 2.18, + "grad_norm": 11.41555118560791, + "learning_rate": 5.438508569710334e-06, + "loss": 1.2174, + "step": 7265 + }, + { + "epoch": 2.18, + "grad_norm": 12.062613487243652, + "learning_rate": 5.436503959105944e-06, + "loss": 1.7347, + "step": 7266 + }, + { + "epoch": 2.18, + "grad_norm": 12.057971000671387, + "learning_rate": 5.4344993485015544e-06, + "loss": 1.2392, + "step": 7267 + }, + { + "epoch": 2.19, + "grad_norm": 14.793939590454102, + "learning_rate": 5.432494737897164e-06, + "loss": 1.7844, + "step": 7268 + }, + { + "epoch": 2.19, + "grad_norm": 20.210912704467773, + "learning_rate": 5.430490127292774e-06, + "loss": 2.0136, + "step": 7269 + }, + { + "epoch": 2.19, + "grad_norm": 6.897406578063965, + "learning_rate": 5.428485516688383e-06, + "loss": 1.0915, + "step": 7270 + }, + { + "epoch": 2.19, + "grad_norm": 12.593406677246094, + "learning_rate": 5.426480906083994e-06, + "loss": 0.7125, + "step": 7271 + }, + { + "epoch": 2.19, + "grad_norm": 12.755428314208984, + "learning_rate": 5.424476295479604e-06, + "loss": 1.9477, + "step": 7272 + }, + { + "epoch": 2.19, + "grad_norm": 7.096426486968994, + "learning_rate": 5.422471684875213e-06, + "loss": 0.718, + "step": 7273 + }, + { + "epoch": 2.19, + "grad_norm": 8.153369903564453, + "learning_rate": 5.420467074270823e-06, + "loss": 0.7129, + "step": 7274 + }, + { + "epoch": 2.19, + "grad_norm": 14.67286491394043, + "learning_rate": 5.418462463666433e-06, + "loss": 1.6797, + "step": 7275 + }, + { + "epoch": 2.19, + "grad_norm": 18.533056259155273, + "learning_rate": 5.416457853062043e-06, + "loss": 1.1951, + "step": 7276 + }, + { + "epoch": 2.19, + "grad_norm": 8.047659873962402, + "learning_rate": 5.414453242457654e-06, + "loss": 0.6167, + "step": 7277 + }, + { + "epoch": 2.19, + "grad_norm": 11.328049659729004, + "learning_rate": 5.412448631853263e-06, + "loss": 0.9758, + "step": 7278 + }, + { + "epoch": 2.19, + "grad_norm": 23.584749221801758, + "learning_rate": 5.410444021248873e-06, + "loss": 1.7252, + "step": 7279 + }, + { + "epoch": 2.19, + "grad_norm": 22.34720230102539, + "learning_rate": 5.408439410644483e-06, + "loss": 2.0207, + "step": 7280 + }, + { + "epoch": 2.19, + "grad_norm": 19.402429580688477, + "learning_rate": 5.406434800040092e-06, + "loss": 1.2057, + "step": 7281 + }, + { + "epoch": 2.19, + "grad_norm": 16.989200592041016, + "learning_rate": 5.404430189435702e-06, + "loss": 1.9234, + "step": 7282 + }, + { + "epoch": 2.19, + "grad_norm": 26.748077392578125, + "learning_rate": 5.402425578831313e-06, + "loss": 1.8995, + "step": 7283 + }, + { + "epoch": 2.19, + "grad_norm": 97.96756744384766, + "learning_rate": 5.4004209682269225e-06, + "loss": 2.0797, + "step": 7284 + }, + { + "epoch": 2.19, + "grad_norm": 20.710670471191406, + "learning_rate": 5.398416357622533e-06, + "loss": 1.0238, + "step": 7285 + }, + { + "epoch": 2.19, + "grad_norm": 13.581433296203613, + "learning_rate": 5.396411747018142e-06, + "loss": 1.6866, + "step": 7286 + }, + { + "epoch": 2.19, + "grad_norm": 12.058156967163086, + "learning_rate": 5.394407136413752e-06, + "loss": 1.0288, + "step": 7287 + }, + { + "epoch": 2.19, + "grad_norm": 21.06386375427246, + "learning_rate": 5.392402525809361e-06, + "loss": 1.271, + "step": 7288 + }, + { + "epoch": 2.19, + "grad_norm": 51.95856857299805, + "learning_rate": 5.390397915204972e-06, + "loss": 1.5959, + "step": 7289 + }, + { + "epoch": 2.19, + "grad_norm": 39.71308898925781, + "learning_rate": 5.388393304600582e-06, + "loss": 1.7519, + "step": 7290 + }, + { + "epoch": 2.19, + "grad_norm": 22.637428283691406, + "learning_rate": 5.386388693996191e-06, + "loss": 2.0241, + "step": 7291 + }, + { + "epoch": 2.19, + "grad_norm": 13.722464561462402, + "learning_rate": 5.3843840833918015e-06, + "loss": 1.0318, + "step": 7292 + }, + { + "epoch": 2.19, + "grad_norm": 35.6817512512207, + "learning_rate": 5.3823794727874116e-06, + "loss": 1.7191, + "step": 7293 + }, + { + "epoch": 2.19, + "grad_norm": 13.845057487487793, + "learning_rate": 5.380374862183021e-06, + "loss": 1.3774, + "step": 7294 + }, + { + "epoch": 2.19, + "grad_norm": 12.214203834533691, + "learning_rate": 5.378370251578632e-06, + "loss": 1.5962, + "step": 7295 + }, + { + "epoch": 2.19, + "grad_norm": 66.19721984863281, + "learning_rate": 5.376365640974242e-06, + "loss": 1.2639, + "step": 7296 + }, + { + "epoch": 2.19, + "grad_norm": 14.872382164001465, + "learning_rate": 5.374361030369851e-06, + "loss": 1.6655, + "step": 7297 + }, + { + "epoch": 2.19, + "grad_norm": 19.328413009643555, + "learning_rate": 5.372356419765461e-06, + "loss": 1.3506, + "step": 7298 + }, + { + "epoch": 2.19, + "grad_norm": 12.651726722717285, + "learning_rate": 5.37035180916107e-06, + "loss": 1.2916, + "step": 7299 + }, + { + "epoch": 2.19, + "grad_norm": 13.568867683410645, + "learning_rate": 5.3683471985566805e-06, + "loss": 1.116, + "step": 7300 + }, + { + "epoch": 2.2, + "grad_norm": 17.879594802856445, + "learning_rate": 5.366342587952291e-06, + "loss": 1.9263, + "step": 7301 + }, + { + "epoch": 2.2, + "grad_norm": 16.369518280029297, + "learning_rate": 5.364337977347901e-06, + "loss": 0.7172, + "step": 7302 + }, + { + "epoch": 2.2, + "grad_norm": 37.3539924621582, + "learning_rate": 5.362333366743511e-06, + "loss": 2.3236, + "step": 7303 + }, + { + "epoch": 2.2, + "grad_norm": 28.66567611694336, + "learning_rate": 5.360328756139121e-06, + "loss": 1.4089, + "step": 7304 + }, + { + "epoch": 2.2, + "grad_norm": 15.510285377502441, + "learning_rate": 5.35832414553473e-06, + "loss": 1.4704, + "step": 7305 + }, + { + "epoch": 2.2, + "grad_norm": 18.096101760864258, + "learning_rate": 5.35631953493034e-06, + "loss": 1.029, + "step": 7306 + }, + { + "epoch": 2.2, + "grad_norm": 34.01200866699219, + "learning_rate": 5.354314924325949e-06, + "loss": 1.732, + "step": 7307 + }, + { + "epoch": 2.2, + "grad_norm": 12.730477333068848, + "learning_rate": 5.35231031372156e-06, + "loss": 2.1386, + "step": 7308 + }, + { + "epoch": 2.2, + "grad_norm": 17.955995559692383, + "learning_rate": 5.35030570311717e-06, + "loss": 0.6812, + "step": 7309 + }, + { + "epoch": 2.2, + "grad_norm": 9.278726577758789, + "learning_rate": 5.34830109251278e-06, + "loss": 0.5906, + "step": 7310 + }, + { + "epoch": 2.2, + "grad_norm": 7.9487199783325195, + "learning_rate": 5.34629648190839e-06, + "loss": 0.6598, + "step": 7311 + }, + { + "epoch": 2.2, + "grad_norm": 14.72791576385498, + "learning_rate": 5.344291871303999e-06, + "loss": 0.7821, + "step": 7312 + }, + { + "epoch": 2.2, + "grad_norm": 8.070356369018555, + "learning_rate": 5.342287260699609e-06, + "loss": 0.7893, + "step": 7313 + }, + { + "epoch": 2.2, + "grad_norm": 14.87585163116455, + "learning_rate": 5.34028265009522e-06, + "loss": 1.5793, + "step": 7314 + }, + { + "epoch": 2.2, + "grad_norm": 119.86707305908203, + "learning_rate": 5.338278039490829e-06, + "loss": 2.2575, + "step": 7315 + }, + { + "epoch": 2.2, + "grad_norm": 22.031450271606445, + "learning_rate": 5.336273428886439e-06, + "loss": 0.9648, + "step": 7316 + }, + { + "epoch": 2.2, + "grad_norm": 20.54271697998047, + "learning_rate": 5.334268818282049e-06, + "loss": 1.7682, + "step": 7317 + }, + { + "epoch": 2.2, + "grad_norm": 18.402219772338867, + "learning_rate": 5.332264207677659e-06, + "loss": 1.3206, + "step": 7318 + }, + { + "epoch": 2.2, + "grad_norm": 21.85274887084961, + "learning_rate": 5.330259597073269e-06, + "loss": 2.4046, + "step": 7319 + }, + { + "epoch": 2.2, + "grad_norm": 12.441575050354004, + "learning_rate": 5.32825498646888e-06, + "loss": 1.8915, + "step": 7320 + }, + { + "epoch": 2.2, + "eval_loss": 0.18911099433898926, + "eval_runtime": 43.726, + "eval_samples_per_second": 33.824, + "eval_steps_per_second": 33.824, + "step": 7320 + }, + { + "epoch": 2.2, + "grad_norm": 18.565292358398438, + "learning_rate": 5.326250375864489e-06, + "loss": 1.4864, + "step": 7321 + }, + { + "epoch": 2.2, + "grad_norm": 21.362342834472656, + "learning_rate": 5.324245765260099e-06, + "loss": 1.0767, + "step": 7322 + }, + { + "epoch": 2.2, + "grad_norm": 15.389693260192871, + "learning_rate": 5.322241154655708e-06, + "loss": 0.9276, + "step": 7323 + }, + { + "epoch": 2.2, + "grad_norm": 83.4190673828125, + "learning_rate": 5.320236544051318e-06, + "loss": 2.4627, + "step": 7324 + }, + { + "epoch": 2.2, + "grad_norm": 17.144039154052734, + "learning_rate": 5.3182319334469275e-06, + "loss": 1.6204, + "step": 7325 + }, + { + "epoch": 2.2, + "grad_norm": 12.209066390991211, + "learning_rate": 5.316227322842538e-06, + "loss": 0.9967, + "step": 7326 + }, + { + "epoch": 2.2, + "grad_norm": 20.075319290161133, + "learning_rate": 5.3142227122381485e-06, + "loss": 2.2126, + "step": 7327 + }, + { + "epoch": 2.2, + "grad_norm": 22.937063217163086, + "learning_rate": 5.312218101633759e-06, + "loss": 1.583, + "step": 7328 + }, + { + "epoch": 2.2, + "grad_norm": 9.0242338180542, + "learning_rate": 5.310213491029368e-06, + "loss": 0.6662, + "step": 7329 + }, + { + "epoch": 2.2, + "grad_norm": 8.918848037719727, + "learning_rate": 5.308208880424978e-06, + "loss": 0.8654, + "step": 7330 + }, + { + "epoch": 2.2, + "grad_norm": 32.961360931396484, + "learning_rate": 5.306204269820587e-06, + "loss": 1.8518, + "step": 7331 + }, + { + "epoch": 2.2, + "grad_norm": 16.80774688720703, + "learning_rate": 5.304199659216198e-06, + "loss": 1.4526, + "step": 7332 + }, + { + "epoch": 2.2, + "grad_norm": 86.86822509765625, + "learning_rate": 5.302195048611808e-06, + "loss": 1.354, + "step": 7333 + }, + { + "epoch": 2.21, + "grad_norm": 18.777795791625977, + "learning_rate": 5.300190438007417e-06, + "loss": 1.2628, + "step": 7334 + }, + { + "epoch": 2.21, + "grad_norm": 47.588802337646484, + "learning_rate": 5.2981858274030275e-06, + "loss": 3.0136, + "step": 7335 + }, + { + "epoch": 2.21, + "grad_norm": 41.61444091796875, + "learning_rate": 5.296181216798637e-06, + "loss": 2.3059, + "step": 7336 + }, + { + "epoch": 2.21, + "grad_norm": 16.625478744506836, + "learning_rate": 5.294176606194247e-06, + "loss": 1.0123, + "step": 7337 + }, + { + "epoch": 2.21, + "grad_norm": 50.9505729675293, + "learning_rate": 5.292171995589858e-06, + "loss": 1.1573, + "step": 7338 + }, + { + "epoch": 2.21, + "grad_norm": 3.5535824298858643, + "learning_rate": 5.290167384985467e-06, + "loss": 0.5366, + "step": 7339 + }, + { + "epoch": 2.21, + "grad_norm": 13.707948684692383, + "learning_rate": 5.288162774381077e-06, + "loss": 1.2388, + "step": 7340 + }, + { + "epoch": 2.21, + "grad_norm": 117.4101333618164, + "learning_rate": 5.286158163776687e-06, + "loss": 2.3683, + "step": 7341 + }, + { + "epoch": 2.21, + "grad_norm": 13.597126960754395, + "learning_rate": 5.284153553172296e-06, + "loss": 1.0199, + "step": 7342 + }, + { + "epoch": 2.21, + "grad_norm": 8.304295539855957, + "learning_rate": 5.2821489425679065e-06, + "loss": 0.7169, + "step": 7343 + }, + { + "epoch": 2.21, + "grad_norm": 14.256132125854492, + "learning_rate": 5.280144331963517e-06, + "loss": 1.2964, + "step": 7344 + }, + { + "epoch": 2.21, + "grad_norm": 86.91548919677734, + "learning_rate": 5.278139721359127e-06, + "loss": 3.997, + "step": 7345 + }, + { + "epoch": 2.21, + "grad_norm": 16.045759201049805, + "learning_rate": 5.276135110754737e-06, + "loss": 1.1771, + "step": 7346 + }, + { + "epoch": 2.21, + "grad_norm": 39.05559158325195, + "learning_rate": 5.274130500150346e-06, + "loss": 2.0468, + "step": 7347 + }, + { + "epoch": 2.21, + "grad_norm": 13.659476280212402, + "learning_rate": 5.272125889545956e-06, + "loss": 0.9715, + "step": 7348 + }, + { + "epoch": 2.21, + "grad_norm": 12.945648193359375, + "learning_rate": 5.270121278941565e-06, + "loss": 1.0217, + "step": 7349 + }, + { + "epoch": 2.21, + "grad_norm": 11.518617630004883, + "learning_rate": 5.268116668337175e-06, + "loss": 1.2031, + "step": 7350 + }, + { + "epoch": 2.21, + "grad_norm": 43.542816162109375, + "learning_rate": 5.266112057732786e-06, + "loss": 1.8437, + "step": 7351 + }, + { + "epoch": 2.21, + "grad_norm": 9.149428367614746, + "learning_rate": 5.2641074471283955e-06, + "loss": 1.1645, + "step": 7352 + }, + { + "epoch": 2.21, + "grad_norm": 17.6467227935791, + "learning_rate": 5.262102836524006e-06, + "loss": 1.3694, + "step": 7353 + }, + { + "epoch": 2.21, + "grad_norm": 30.845457077026367, + "learning_rate": 5.260098225919616e-06, + "loss": 1.3288, + "step": 7354 + }, + { + "epoch": 2.21, + "grad_norm": 17.407968521118164, + "learning_rate": 5.258093615315225e-06, + "loss": 1.3238, + "step": 7355 + }, + { + "epoch": 2.21, + "grad_norm": 22.961395263671875, + "learning_rate": 5.256089004710835e-06, + "loss": 1.1421, + "step": 7356 + }, + { + "epoch": 2.21, + "grad_norm": 21.641342163085938, + "learning_rate": 5.254084394106446e-06, + "loss": 1.2616, + "step": 7357 + }, + { + "epoch": 2.21, + "grad_norm": 16.428573608398438, + "learning_rate": 5.252079783502055e-06, + "loss": 0.9203, + "step": 7358 + }, + { + "epoch": 2.21, + "grad_norm": 17.370075225830078, + "learning_rate": 5.250075172897665e-06, + "loss": 1.0207, + "step": 7359 + }, + { + "epoch": 2.21, + "grad_norm": 17.009870529174805, + "learning_rate": 5.2480705622932745e-06, + "loss": 1.3327, + "step": 7360 + }, + { + "epoch": 2.21, + "grad_norm": 16.25709342956543, + "learning_rate": 5.246065951688885e-06, + "loss": 1.274, + "step": 7361 + }, + { + "epoch": 2.21, + "grad_norm": 11.702857971191406, + "learning_rate": 5.244061341084495e-06, + "loss": 1.3384, + "step": 7362 + }, + { + "epoch": 2.21, + "grad_norm": 39.857627868652344, + "learning_rate": 5.242056730480105e-06, + "loss": 1.6856, + "step": 7363 + }, + { + "epoch": 2.21, + "grad_norm": 34.70151138305664, + "learning_rate": 5.240052119875715e-06, + "loss": 1.7488, + "step": 7364 + }, + { + "epoch": 2.21, + "grad_norm": 8.398064613342285, + "learning_rate": 5.238047509271325e-06, + "loss": 1.1744, + "step": 7365 + }, + { + "epoch": 2.21, + "grad_norm": 16.08351707458496, + "learning_rate": 5.236042898666934e-06, + "loss": 1.2976, + "step": 7366 + }, + { + "epoch": 2.21, + "grad_norm": 41.76036834716797, + "learning_rate": 5.234038288062544e-06, + "loss": 2.1369, + "step": 7367 + }, + { + "epoch": 2.22, + "grad_norm": 14.112876892089844, + "learning_rate": 5.2320336774581535e-06, + "loss": 0.5372, + "step": 7368 + }, + { + "epoch": 2.22, + "grad_norm": 25.881088256835938, + "learning_rate": 5.230029066853764e-06, + "loss": 1.3301, + "step": 7369 + }, + { + "epoch": 2.22, + "grad_norm": 28.427711486816406, + "learning_rate": 5.2280244562493745e-06, + "loss": 1.6783, + "step": 7370 + }, + { + "epoch": 2.22, + "grad_norm": 10.222917556762695, + "learning_rate": 5.226019845644984e-06, + "loss": 0.9323, + "step": 7371 + }, + { + "epoch": 2.22, + "grad_norm": 8.820756912231445, + "learning_rate": 5.224015235040594e-06, + "loss": 0.9893, + "step": 7372 + }, + { + "epoch": 2.22, + "grad_norm": 17.932085037231445, + "learning_rate": 5.222010624436203e-06, + "loss": 1.7065, + "step": 7373 + }, + { + "epoch": 2.22, + "grad_norm": 11.802325248718262, + "learning_rate": 5.220006013831813e-06, + "loss": 1.3393, + "step": 7374 + }, + { + "epoch": 2.22, + "grad_norm": 23.23512077331543, + "learning_rate": 5.218001403227424e-06, + "loss": 1.1122, + "step": 7375 + }, + { + "epoch": 2.22, + "grad_norm": 15.658203125, + "learning_rate": 5.215996792623033e-06, + "loss": 1.3305, + "step": 7376 + }, + { + "epoch": 2.22, + "grad_norm": 39.932865142822266, + "learning_rate": 5.213992182018643e-06, + "loss": 1.7015, + "step": 7377 + }, + { + "epoch": 2.22, + "grad_norm": 33.233642578125, + "learning_rate": 5.2119875714142535e-06, + "loss": 1.5578, + "step": 7378 + }, + { + "epoch": 2.22, + "grad_norm": 12.168890953063965, + "learning_rate": 5.209982960809863e-06, + "loss": 0.9835, + "step": 7379 + }, + { + "epoch": 2.22, + "grad_norm": 9.322324752807617, + "learning_rate": 5.207978350205473e-06, + "loss": 0.8729, + "step": 7380 + }, + { + "epoch": 2.22, + "grad_norm": 32.88364791870117, + "learning_rate": 5.205973739601084e-06, + "loss": 1.5136, + "step": 7381 + }, + { + "epoch": 2.22, + "grad_norm": 24.65951156616211, + "learning_rate": 5.203969128996693e-06, + "loss": 1.2119, + "step": 7382 + }, + { + "epoch": 2.22, + "grad_norm": 9.384584426879883, + "learning_rate": 5.201964518392303e-06, + "loss": 0.8526, + "step": 7383 + }, + { + "epoch": 2.22, + "grad_norm": 16.68515396118164, + "learning_rate": 5.199959907787912e-06, + "loss": 1.6815, + "step": 7384 + }, + { + "epoch": 2.22, + "grad_norm": 12.431203842163086, + "learning_rate": 5.197955297183522e-06, + "loss": 0.8398, + "step": 7385 + }, + { + "epoch": 2.22, + "grad_norm": 10.749804496765137, + "learning_rate": 5.1959506865791325e-06, + "loss": 1.6542, + "step": 7386 + }, + { + "epoch": 2.22, + "grad_norm": 19.671411514282227, + "learning_rate": 5.1939460759747425e-06, + "loss": 1.1745, + "step": 7387 + }, + { + "epoch": 2.22, + "grad_norm": 15.82059383392334, + "learning_rate": 5.191941465370353e-06, + "loss": 1.5952, + "step": 7388 + }, + { + "epoch": 2.22, + "grad_norm": 11.592869758605957, + "learning_rate": 5.189936854765963e-06, + "loss": 1.375, + "step": 7389 + }, + { + "epoch": 2.22, + "grad_norm": 12.64307975769043, + "learning_rate": 5.187932244161572e-06, + "loss": 1.5403, + "step": 7390 + }, + { + "epoch": 2.22, + "grad_norm": 21.732891082763672, + "learning_rate": 5.185927633557182e-06, + "loss": 0.9123, + "step": 7391 + }, + { + "epoch": 2.22, + "grad_norm": 11.803874969482422, + "learning_rate": 5.183923022952791e-06, + "loss": 1.1493, + "step": 7392 + }, + { + "epoch": 2.22, + "grad_norm": 13.172371864318848, + "learning_rate": 5.181918412348401e-06, + "loss": 1.5287, + "step": 7393 + }, + { + "epoch": 2.22, + "grad_norm": 11.935776710510254, + "learning_rate": 5.179913801744012e-06, + "loss": 1.0215, + "step": 7394 + }, + { + "epoch": 2.22, + "grad_norm": 41.79684829711914, + "learning_rate": 5.1779091911396215e-06, + "loss": 2.0082, + "step": 7395 + }, + { + "epoch": 2.22, + "grad_norm": 20.29804229736328, + "learning_rate": 5.175904580535232e-06, + "loss": 1.9343, + "step": 7396 + }, + { + "epoch": 2.22, + "grad_norm": 13.398682594299316, + "learning_rate": 5.173899969930841e-06, + "loss": 1.1503, + "step": 7397 + }, + { + "epoch": 2.22, + "grad_norm": 24.813034057617188, + "learning_rate": 5.171895359326451e-06, + "loss": 1.0149, + "step": 7398 + }, + { + "epoch": 2.22, + "grad_norm": 23.43954849243164, + "learning_rate": 5.169890748722061e-06, + "loss": 1.2993, + "step": 7399 + }, + { + "epoch": 2.22, + "grad_norm": 14.955592155456543, + "learning_rate": 5.167886138117671e-06, + "loss": 1.3491, + "step": 7400 + }, + { + "epoch": 2.23, + "grad_norm": 15.617820739746094, + "learning_rate": 5.165881527513281e-06, + "loss": 1.1608, + "step": 7401 + }, + { + "epoch": 2.23, + "grad_norm": 18.086448669433594, + "learning_rate": 5.163876916908891e-06, + "loss": 1.4179, + "step": 7402 + }, + { + "epoch": 2.23, + "grad_norm": 7.104720115661621, + "learning_rate": 5.1618723063045005e-06, + "loss": 0.8867, + "step": 7403 + }, + { + "epoch": 2.23, + "grad_norm": 15.77996826171875, + "learning_rate": 5.159867695700111e-06, + "loss": 1.276, + "step": 7404 + }, + { + "epoch": 2.23, + "grad_norm": 28.867000579833984, + "learning_rate": 5.15786308509572e-06, + "loss": 1.6279, + "step": 7405 + }, + { + "epoch": 2.23, + "grad_norm": 26.46651268005371, + "learning_rate": 5.155858474491331e-06, + "loss": 1.2349, + "step": 7406 + }, + { + "epoch": 2.23, + "grad_norm": 15.890294075012207, + "learning_rate": 5.153853863886941e-06, + "loss": 2.3934, + "step": 7407 + }, + { + "epoch": 2.23, + "grad_norm": 37.84567642211914, + "learning_rate": 5.15184925328255e-06, + "loss": 1.5725, + "step": 7408 + }, + { + "epoch": 2.23, + "grad_norm": 13.57693862915039, + "learning_rate": 5.14984464267816e-06, + "loss": 1.4944, + "step": 7409 + }, + { + "epoch": 2.23, + "grad_norm": 16.91181755065918, + "learning_rate": 5.147840032073769e-06, + "loss": 1.7839, + "step": 7410 + }, + { + "epoch": 2.23, + "grad_norm": 36.64265441894531, + "learning_rate": 5.1458354214693795e-06, + "loss": 2.3358, + "step": 7411 + }, + { + "epoch": 2.23, + "grad_norm": 5.110405921936035, + "learning_rate": 5.14383081086499e-06, + "loss": 0.6339, + "step": 7412 + }, + { + "epoch": 2.23, + "grad_norm": 13.546368598937988, + "learning_rate": 5.1418262002606005e-06, + "loss": 2.2864, + "step": 7413 + }, + { + "epoch": 2.23, + "grad_norm": 13.818781852722168, + "learning_rate": 5.13982158965621e-06, + "loss": 1.8962, + "step": 7414 + }, + { + "epoch": 2.23, + "grad_norm": 8.545878410339355, + "learning_rate": 5.13781697905182e-06, + "loss": 1.9361, + "step": 7415 + }, + { + "epoch": 2.23, + "grad_norm": 22.833742141723633, + "learning_rate": 5.135812368447429e-06, + "loss": 1.869, + "step": 7416 + }, + { + "epoch": 2.23, + "grad_norm": 48.628639221191406, + "learning_rate": 5.133807757843039e-06, + "loss": 1.6338, + "step": 7417 + }, + { + "epoch": 2.23, + "grad_norm": 18.07792091369629, + "learning_rate": 5.13180314723865e-06, + "loss": 1.0774, + "step": 7418 + }, + { + "epoch": 2.23, + "grad_norm": 12.404952049255371, + "learning_rate": 5.129798536634259e-06, + "loss": 1.8888, + "step": 7419 + }, + { + "epoch": 2.23, + "grad_norm": 16.634307861328125, + "learning_rate": 5.127793926029869e-06, + "loss": 0.9383, + "step": 7420 + }, + { + "epoch": 2.23, + "grad_norm": 13.299745559692383, + "learning_rate": 5.125789315425479e-06, + "loss": 1.4448, + "step": 7421 + }, + { + "epoch": 2.23, + "grad_norm": 20.849573135375977, + "learning_rate": 5.123784704821089e-06, + "loss": 1.8726, + "step": 7422 + }, + { + "epoch": 2.23, + "grad_norm": 69.56287384033203, + "learning_rate": 5.121780094216699e-06, + "loss": 2.2076, + "step": 7423 + }, + { + "epoch": 2.23, + "grad_norm": 13.498839378356934, + "learning_rate": 5.119775483612309e-06, + "loss": 0.9413, + "step": 7424 + }, + { + "epoch": 2.23, + "grad_norm": 18.645475387573242, + "learning_rate": 5.117770873007919e-06, + "loss": 1.2546, + "step": 7425 + }, + { + "epoch": 2.23, + "grad_norm": 24.639263153076172, + "learning_rate": 5.115766262403529e-06, + "loss": 1.5101, + "step": 7426 + }, + { + "epoch": 2.23, + "grad_norm": 9.19277572631836, + "learning_rate": 5.113761651799138e-06, + "loss": 0.789, + "step": 7427 + }, + { + "epoch": 2.23, + "grad_norm": 13.205931663513184, + "learning_rate": 5.111757041194748e-06, + "loss": 1.2832, + "step": 7428 + }, + { + "epoch": 2.23, + "grad_norm": 26.456480026245117, + "learning_rate": 5.109752430590358e-06, + "loss": 2.1971, + "step": 7429 + }, + { + "epoch": 2.23, + "grad_norm": 16.407329559326172, + "learning_rate": 5.1077478199859685e-06, + "loss": 1.7726, + "step": 7430 + }, + { + "epoch": 2.23, + "grad_norm": 24.08245277404785, + "learning_rate": 5.105743209381579e-06, + "loss": 1.5051, + "step": 7431 + }, + { + "epoch": 2.23, + "grad_norm": 42.648616790771484, + "learning_rate": 5.103738598777188e-06, + "loss": 1.9396, + "step": 7432 + }, + { + "epoch": 2.23, + "grad_norm": 21.788301467895508, + "learning_rate": 5.101733988172798e-06, + "loss": 1.0098, + "step": 7433 + }, + { + "epoch": 2.24, + "grad_norm": 19.175148010253906, + "learning_rate": 5.099729377568407e-06, + "loss": 1.4408, + "step": 7434 + }, + { + "epoch": 2.24, + "grad_norm": 33.719520568847656, + "learning_rate": 5.097724766964017e-06, + "loss": 1.6809, + "step": 7435 + }, + { + "epoch": 2.24, + "grad_norm": 22.389249801635742, + "learning_rate": 5.095720156359627e-06, + "loss": 1.5647, + "step": 7436 + }, + { + "epoch": 2.24, + "grad_norm": 21.30531120300293, + "learning_rate": 5.0937155457552374e-06, + "loss": 1.5027, + "step": 7437 + }, + { + "epoch": 2.24, + "grad_norm": 26.76338768005371, + "learning_rate": 5.0917109351508475e-06, + "loss": 1.3216, + "step": 7438 + }, + { + "epoch": 2.24, + "grad_norm": 13.98495101928711, + "learning_rate": 5.089706324546458e-06, + "loss": 1.1699, + "step": 7439 + }, + { + "epoch": 2.24, + "grad_norm": 6.971924304962158, + "learning_rate": 5.087701713942067e-06, + "loss": 0.8873, + "step": 7440 + }, + { + "epoch": 2.24, + "eval_loss": 0.1698659360408783, + "eval_runtime": 43.7492, + "eval_samples_per_second": 33.806, + "eval_steps_per_second": 33.806, + "step": 7440 + }, + { + "epoch": 2.24, + "grad_norm": 15.718365669250488, + "learning_rate": 5.085697103337677e-06, + "loss": 1.535, + "step": 7441 + }, + { + "epoch": 2.24, + "grad_norm": 23.17116928100586, + "learning_rate": 5.083692492733286e-06, + "loss": 1.6606, + "step": 7442 + }, + { + "epoch": 2.24, + "grad_norm": 17.73175621032715, + "learning_rate": 5.081687882128897e-06, + "loss": 1.6878, + "step": 7443 + }, + { + "epoch": 2.24, + "grad_norm": 19.423385620117188, + "learning_rate": 5.079683271524507e-06, + "loss": 1.3193, + "step": 7444 + }, + { + "epoch": 2.24, + "grad_norm": 11.797534942626953, + "learning_rate": 5.077678660920116e-06, + "loss": 0.881, + "step": 7445 + }, + { + "epoch": 2.24, + "grad_norm": 15.763280868530273, + "learning_rate": 5.0756740503157265e-06, + "loss": 1.4587, + "step": 7446 + }, + { + "epoch": 2.24, + "grad_norm": 8.128504753112793, + "learning_rate": 5.073669439711337e-06, + "loss": 1.0937, + "step": 7447 + }, + { + "epoch": 2.24, + "grad_norm": 31.446063995361328, + "learning_rate": 5.071664829106946e-06, + "loss": 1.4584, + "step": 7448 + }, + { + "epoch": 2.24, + "grad_norm": 13.80343246459961, + "learning_rate": 5.069660218502557e-06, + "loss": 0.7605, + "step": 7449 + }, + { + "epoch": 2.24, + "grad_norm": 10.658504486083984, + "learning_rate": 5.067655607898167e-06, + "loss": 0.8793, + "step": 7450 + }, + { + "epoch": 2.24, + "grad_norm": 12.806272506713867, + "learning_rate": 5.065650997293776e-06, + "loss": 0.9486, + "step": 7451 + }, + { + "epoch": 2.24, + "grad_norm": 9.60067081451416, + "learning_rate": 5.063646386689386e-06, + "loss": 1.0777, + "step": 7452 + }, + { + "epoch": 2.24, + "grad_norm": 10.402573585510254, + "learning_rate": 5.061641776084995e-06, + "loss": 1.0899, + "step": 7453 + }, + { + "epoch": 2.24, + "grad_norm": 24.5078182220459, + "learning_rate": 5.0596371654806055e-06, + "loss": 1.6828, + "step": 7454 + }, + { + "epoch": 2.24, + "grad_norm": 20.464935302734375, + "learning_rate": 5.057632554876216e-06, + "loss": 1.5368, + "step": 7455 + }, + { + "epoch": 2.24, + "grad_norm": 13.39626693725586, + "learning_rate": 5.055627944271826e-06, + "loss": 0.9746, + "step": 7456 + }, + { + "epoch": 2.24, + "grad_norm": 18.0714168548584, + "learning_rate": 5.053623333667436e-06, + "loss": 1.0704, + "step": 7457 + }, + { + "epoch": 2.24, + "grad_norm": 33.03631591796875, + "learning_rate": 5.051618723063045e-06, + "loss": 1.4787, + "step": 7458 + }, + { + "epoch": 2.24, + "grad_norm": 101.09317779541016, + "learning_rate": 5.049614112458655e-06, + "loss": 1.9596, + "step": 7459 + }, + { + "epoch": 2.24, + "grad_norm": 27.06547737121582, + "learning_rate": 5.047609501854265e-06, + "loss": 0.824, + "step": 7460 + }, + { + "epoch": 2.24, + "grad_norm": 26.54560089111328, + "learning_rate": 5.045604891249875e-06, + "loss": 1.0234, + "step": 7461 + }, + { + "epoch": 2.24, + "grad_norm": 29.58421516418457, + "learning_rate": 5.043600280645485e-06, + "loss": 1.8553, + "step": 7462 + }, + { + "epoch": 2.24, + "grad_norm": 11.534109115600586, + "learning_rate": 5.041595670041095e-06, + "loss": 0.9154, + "step": 7463 + }, + { + "epoch": 2.24, + "grad_norm": 23.17076301574707, + "learning_rate": 5.039591059436705e-06, + "loss": 2.0129, + "step": 7464 + }, + { + "epoch": 2.24, + "grad_norm": 19.68193244934082, + "learning_rate": 5.037586448832315e-06, + "loss": 1.0759, + "step": 7465 + }, + { + "epoch": 2.24, + "grad_norm": 21.87065887451172, + "learning_rate": 5.035581838227924e-06, + "loss": 0.9128, + "step": 7466 + }, + { + "epoch": 2.25, + "grad_norm": 21.481231689453125, + "learning_rate": 5.033577227623535e-06, + "loss": 1.7024, + "step": 7467 + }, + { + "epoch": 2.25, + "grad_norm": 11.491585731506348, + "learning_rate": 5.031572617019145e-06, + "loss": 1.0131, + "step": 7468 + }, + { + "epoch": 2.25, + "grad_norm": 14.809111595153809, + "learning_rate": 5.029568006414754e-06, + "loss": 1.3544, + "step": 7469 + }, + { + "epoch": 2.25, + "grad_norm": 31.751813888549805, + "learning_rate": 5.027563395810364e-06, + "loss": 1.477, + "step": 7470 + }, + { + "epoch": 2.25, + "grad_norm": 24.474321365356445, + "learning_rate": 5.025558785205974e-06, + "loss": 1.5531, + "step": 7471 + }, + { + "epoch": 2.25, + "grad_norm": 182.55990600585938, + "learning_rate": 5.023554174601584e-06, + "loss": 2.9409, + "step": 7472 + }, + { + "epoch": 2.25, + "grad_norm": 17.08757972717285, + "learning_rate": 5.0215495639971945e-06, + "loss": 1.093, + "step": 7473 + }, + { + "epoch": 2.25, + "grad_norm": 18.484947204589844, + "learning_rate": 5.019544953392805e-06, + "loss": 1.7707, + "step": 7474 + }, + { + "epoch": 2.25, + "grad_norm": 14.49228572845459, + "learning_rate": 5.017540342788414e-06, + "loss": 1.4408, + "step": 7475 + }, + { + "epoch": 2.25, + "grad_norm": 10.136235237121582, + "learning_rate": 5.015535732184024e-06, + "loss": 1.0075, + "step": 7476 + }, + { + "epoch": 2.25, + "grad_norm": 63.96528244018555, + "learning_rate": 5.013531121579633e-06, + "loss": 2.0736, + "step": 7477 + }, + { + "epoch": 2.25, + "grad_norm": 10.56208610534668, + "learning_rate": 5.011526510975243e-06, + "loss": 1.0639, + "step": 7478 + }, + { + "epoch": 2.25, + "grad_norm": 11.87157154083252, + "learning_rate": 5.0095219003708525e-06, + "loss": 1.0202, + "step": 7479 + }, + { + "epoch": 2.25, + "grad_norm": 11.656047821044922, + "learning_rate": 5.0075172897664634e-06, + "loss": 1.3296, + "step": 7480 + }, + { + "epoch": 2.25, + "grad_norm": 20.969064712524414, + "learning_rate": 5.0055126791620735e-06, + "loss": 2.0833, + "step": 7481 + }, + { + "epoch": 2.25, + "grad_norm": 12.001461029052734, + "learning_rate": 5.003508068557683e-06, + "loss": 1.13, + "step": 7482 + }, + { + "epoch": 2.25, + "grad_norm": 17.097755432128906, + "learning_rate": 5.001503457953293e-06, + "loss": 1.3623, + "step": 7483 + }, + { + "epoch": 2.25, + "grad_norm": 10.47364330291748, + "learning_rate": 4.999498847348903e-06, + "loss": 0.7631, + "step": 7484 + }, + { + "epoch": 2.25, + "grad_norm": 11.359280586242676, + "learning_rate": 4.997494236744513e-06, + "loss": 0.6724, + "step": 7485 + }, + { + "epoch": 2.25, + "grad_norm": 10.474048614501953, + "learning_rate": 4.995489626140122e-06, + "loss": 0.5766, + "step": 7486 + }, + { + "epoch": 2.25, + "grad_norm": 14.837994575500488, + "learning_rate": 4.993485015535733e-06, + "loss": 1.1248, + "step": 7487 + }, + { + "epoch": 2.25, + "grad_norm": 18.054821014404297, + "learning_rate": 4.991480404931342e-06, + "loss": 1.0936, + "step": 7488 + }, + { + "epoch": 2.25, + "grad_norm": 9.805904388427734, + "learning_rate": 4.9894757943269525e-06, + "loss": 0.8442, + "step": 7489 + }, + { + "epoch": 2.25, + "grad_norm": 44.7296257019043, + "learning_rate": 4.987471183722563e-06, + "loss": 2.3775, + "step": 7490 + }, + { + "epoch": 2.25, + "grad_norm": 13.354528427124023, + "learning_rate": 4.985466573118173e-06, + "loss": 1.3387, + "step": 7491 + }, + { + "epoch": 2.25, + "grad_norm": 24.598257064819336, + "learning_rate": 4.983461962513782e-06, + "loss": 2.0694, + "step": 7492 + }, + { + "epoch": 2.25, + "grad_norm": 28.982959747314453, + "learning_rate": 4.981457351909392e-06, + "loss": 1.3952, + "step": 7493 + }, + { + "epoch": 2.25, + "grad_norm": 16.613788604736328, + "learning_rate": 4.979452741305002e-06, + "loss": 1.1931, + "step": 7494 + }, + { + "epoch": 2.25, + "grad_norm": 11.446743965148926, + "learning_rate": 4.977448130700611e-06, + "loss": 0.8703, + "step": 7495 + }, + { + "epoch": 2.25, + "grad_norm": 14.696474075317383, + "learning_rate": 4.975443520096221e-06, + "loss": 1.3185, + "step": 7496 + }, + { + "epoch": 2.25, + "grad_norm": 33.95481491088867, + "learning_rate": 4.9734389094918315e-06, + "loss": 1.3152, + "step": 7497 + }, + { + "epoch": 2.25, + "grad_norm": 9.814773559570312, + "learning_rate": 4.9714342988874416e-06, + "loss": 1.4784, + "step": 7498 + }, + { + "epoch": 2.25, + "grad_norm": 31.25359344482422, + "learning_rate": 4.969429688283051e-06, + "loss": 1.9966, + "step": 7499 + }, + { + "epoch": 2.25, + "grad_norm": 28.215229034423828, + "learning_rate": 4.967425077678662e-06, + "loss": 2.6453, + "step": 7500 + }, + { + "epoch": 2.26, + "grad_norm": 16.87133026123047, + "learning_rate": 4.965420467074271e-06, + "loss": 1.0434, + "step": 7501 + }, + { + "epoch": 2.26, + "grad_norm": 16.18372344970703, + "learning_rate": 4.963415856469881e-06, + "loss": 1.9418, + "step": 7502 + }, + { + "epoch": 2.26, + "grad_norm": 10.651266098022461, + "learning_rate": 4.961411245865491e-06, + "loss": 1.5416, + "step": 7503 + }, + { + "epoch": 2.26, + "grad_norm": 37.711185455322266, + "learning_rate": 4.959406635261101e-06, + "loss": 1.3423, + "step": 7504 + }, + { + "epoch": 2.26, + "grad_norm": 21.620492935180664, + "learning_rate": 4.9574020246567105e-06, + "loss": 1.435, + "step": 7505 + }, + { + "epoch": 2.26, + "grad_norm": 52.260536193847656, + "learning_rate": 4.9553974140523205e-06, + "loss": 1.8419, + "step": 7506 + }, + { + "epoch": 2.26, + "grad_norm": 21.462879180908203, + "learning_rate": 4.953392803447931e-06, + "loss": 0.8966, + "step": 7507 + }, + { + "epoch": 2.26, + "grad_norm": 9.330449104309082, + "learning_rate": 4.951388192843541e-06, + "loss": 0.8809, + "step": 7508 + }, + { + "epoch": 2.26, + "grad_norm": 31.02956771850586, + "learning_rate": 4.949383582239151e-06, + "loss": 1.8236, + "step": 7509 + }, + { + "epoch": 2.26, + "grad_norm": 19.69917869567871, + "learning_rate": 4.94737897163476e-06, + "loss": 1.4695, + "step": 7510 + }, + { + "epoch": 2.26, + "grad_norm": 13.709826469421387, + "learning_rate": 4.94537436103037e-06, + "loss": 0.9547, + "step": 7511 + }, + { + "epoch": 2.26, + "grad_norm": 14.561116218566895, + "learning_rate": 4.94336975042598e-06, + "loss": 2.0196, + "step": 7512 + }, + { + "epoch": 2.26, + "grad_norm": 12.211623191833496, + "learning_rate": 4.94136513982159e-06, + "loss": 1.0027, + "step": 7513 + }, + { + "epoch": 2.26, + "grad_norm": 94.37039947509766, + "learning_rate": 4.9393605292171995e-06, + "loss": 1.9356, + "step": 7514 + }, + { + "epoch": 2.26, + "grad_norm": 14.454056739807129, + "learning_rate": 4.9373559186128105e-06, + "loss": 1.0738, + "step": 7515 + }, + { + "epoch": 2.26, + "grad_norm": 88.53865051269531, + "learning_rate": 4.93535130800842e-06, + "loss": 1.7283, + "step": 7516 + }, + { + "epoch": 2.26, + "grad_norm": 225.10995483398438, + "learning_rate": 4.93334669740403e-06, + "loss": 1.8414, + "step": 7517 + }, + { + "epoch": 2.26, + "grad_norm": 16.295265197753906, + "learning_rate": 4.93134208679964e-06, + "loss": 1.7194, + "step": 7518 + }, + { + "epoch": 2.26, + "grad_norm": 102.90641021728516, + "learning_rate": 4.929337476195249e-06, + "loss": 1.1804, + "step": 7519 + }, + { + "epoch": 2.26, + "grad_norm": 22.84174156188965, + "learning_rate": 4.927332865590859e-06, + "loss": 1.0631, + "step": 7520 + }, + { + "epoch": 2.26, + "grad_norm": 29.34662628173828, + "learning_rate": 4.925328254986469e-06, + "loss": 1.2535, + "step": 7521 + }, + { + "epoch": 2.26, + "grad_norm": 11.276557922363281, + "learning_rate": 4.923323644382079e-06, + "loss": 1.1659, + "step": 7522 + }, + { + "epoch": 2.26, + "grad_norm": 14.418801307678223, + "learning_rate": 4.921319033777689e-06, + "loss": 1.9511, + "step": 7523 + }, + { + "epoch": 2.26, + "grad_norm": 20.165433883666992, + "learning_rate": 4.9193144231732995e-06, + "loss": 1.258, + "step": 7524 + }, + { + "epoch": 2.26, + "grad_norm": 15.483535766601562, + "learning_rate": 4.917309812568909e-06, + "loss": 1.0741, + "step": 7525 + }, + { + "epoch": 2.26, + "grad_norm": 23.793718338012695, + "learning_rate": 4.915305201964519e-06, + "loss": 1.4441, + "step": 7526 + }, + { + "epoch": 2.26, + "grad_norm": 23.1822452545166, + "learning_rate": 4.913300591360129e-06, + "loss": 1.3305, + "step": 7527 + }, + { + "epoch": 2.26, + "grad_norm": 18.903297424316406, + "learning_rate": 4.911295980755739e-06, + "loss": 1.1903, + "step": 7528 + }, + { + "epoch": 2.26, + "grad_norm": 18.78779411315918, + "learning_rate": 4.909291370151348e-06, + "loss": 1.7469, + "step": 7529 + }, + { + "epoch": 2.26, + "grad_norm": 15.670270919799805, + "learning_rate": 4.907286759546958e-06, + "loss": 1.1513, + "step": 7530 + }, + { + "epoch": 2.26, + "grad_norm": 22.992828369140625, + "learning_rate": 4.905282148942568e-06, + "loss": 0.9966, + "step": 7531 + }, + { + "epoch": 2.26, + "grad_norm": 16.692344665527344, + "learning_rate": 4.9032775383381785e-06, + "loss": 1.3511, + "step": 7532 + }, + { + "epoch": 2.26, + "grad_norm": 17.386524200439453, + "learning_rate": 4.901272927733789e-06, + "loss": 1.4587, + "step": 7533 + }, + { + "epoch": 2.27, + "grad_norm": 18.931062698364258, + "learning_rate": 4.899268317129398e-06, + "loss": 2.273, + "step": 7534 + }, + { + "epoch": 2.27, + "grad_norm": 86.52537536621094, + "learning_rate": 4.897263706525008e-06, + "loss": 3.4529, + "step": 7535 + }, + { + "epoch": 2.27, + "grad_norm": 9.628124237060547, + "learning_rate": 4.895259095920617e-06, + "loss": 0.8626, + "step": 7536 + }, + { + "epoch": 2.27, + "grad_norm": 10.610788345336914, + "learning_rate": 4.893254485316228e-06, + "loss": 1.2612, + "step": 7537 + }, + { + "epoch": 2.27, + "grad_norm": 15.068879127502441, + "learning_rate": 4.891249874711837e-06, + "loss": 1.2702, + "step": 7538 + }, + { + "epoch": 2.27, + "grad_norm": 25.841157913208008, + "learning_rate": 4.889245264107447e-06, + "loss": 1.5596, + "step": 7539 + }, + { + "epoch": 2.27, + "grad_norm": 13.19726276397705, + "learning_rate": 4.8872406535030575e-06, + "loss": 1.1938, + "step": 7540 + }, + { + "epoch": 2.27, + "grad_norm": 30.624975204467773, + "learning_rate": 4.8852360428986676e-06, + "loss": 2.6475, + "step": 7541 + }, + { + "epoch": 2.27, + "grad_norm": 17.774322509765625, + "learning_rate": 4.883231432294277e-06, + "loss": 0.8152, + "step": 7542 + }, + { + "epoch": 2.27, + "grad_norm": 13.231738090515137, + "learning_rate": 4.881226821689887e-06, + "loss": 1.0322, + "step": 7543 + }, + { + "epoch": 2.27, + "grad_norm": 18.530324935913086, + "learning_rate": 4.879222211085497e-06, + "loss": 1.6454, + "step": 7544 + }, + { + "epoch": 2.27, + "grad_norm": 32.340023040771484, + "learning_rate": 4.877217600481107e-06, + "loss": 1.1846, + "step": 7545 + }, + { + "epoch": 2.27, + "grad_norm": 13.608917236328125, + "learning_rate": 4.875212989876717e-06, + "loss": 1.307, + "step": 7546 + }, + { + "epoch": 2.27, + "grad_norm": 33.83692932128906, + "learning_rate": 4.873208379272326e-06, + "loss": 1.4187, + "step": 7547 + }, + { + "epoch": 2.27, + "grad_norm": 41.42520523071289, + "learning_rate": 4.8712037686679365e-06, + "loss": 1.5752, + "step": 7548 + }, + { + "epoch": 2.27, + "grad_norm": 7.894377708435059, + "learning_rate": 4.8691991580635466e-06, + "loss": 1.1328, + "step": 7549 + }, + { + "epoch": 2.27, + "grad_norm": 22.25946807861328, + "learning_rate": 4.867194547459157e-06, + "loss": 2.0451, + "step": 7550 + }, + { + "epoch": 2.27, + "grad_norm": 49.385562896728516, + "learning_rate": 4.865189936854766e-06, + "loss": 2.2722, + "step": 7551 + }, + { + "epoch": 2.27, + "grad_norm": 13.534929275512695, + "learning_rate": 4.863185326250377e-06, + "loss": 0.7596, + "step": 7552 + }, + { + "epoch": 2.27, + "grad_norm": 20.882999420166016, + "learning_rate": 4.861180715645986e-06, + "loss": 1.9043, + "step": 7553 + }, + { + "epoch": 2.27, + "grad_norm": 9.814292907714844, + "learning_rate": 4.859176105041596e-06, + "loss": 1.0012, + "step": 7554 + }, + { + "epoch": 2.27, + "grad_norm": 23.16642951965332, + "learning_rate": 4.857171494437206e-06, + "loss": 1.4161, + "step": 7555 + }, + { + "epoch": 2.27, + "grad_norm": 41.616703033447266, + "learning_rate": 4.8551668838328154e-06, + "loss": 2.3166, + "step": 7556 + }, + { + "epoch": 2.27, + "grad_norm": 14.239105224609375, + "learning_rate": 4.8531622732284255e-06, + "loss": 1.1065, + "step": 7557 + }, + { + "epoch": 2.27, + "grad_norm": 8.06943130493164, + "learning_rate": 4.851157662624036e-06, + "loss": 0.6586, + "step": 7558 + }, + { + "epoch": 2.27, + "grad_norm": 16.735288619995117, + "learning_rate": 4.849153052019646e-06, + "loss": 0.4483, + "step": 7559 + }, + { + "epoch": 2.27, + "grad_norm": 13.807044982910156, + "learning_rate": 4.847148441415255e-06, + "loss": 1.3318, + "step": 7560 + }, + { + "epoch": 2.27, + "eval_loss": 0.1706262081861496, + "eval_runtime": 43.7986, + "eval_samples_per_second": 33.768, + "eval_steps_per_second": 33.768, + "step": 7560 + }, + { + "epoch": 2.27, + "grad_norm": 18.019256591796875, + "learning_rate": 4.845143830810866e-06, + "loss": 1.3919, + "step": 7561 + }, + { + "epoch": 2.27, + "grad_norm": 16.097368240356445, + "learning_rate": 4.843139220206475e-06, + "loss": 1.5764, + "step": 7562 + }, + { + "epoch": 2.27, + "grad_norm": 13.278281211853027, + "learning_rate": 4.841134609602085e-06, + "loss": 1.0933, + "step": 7563 + }, + { + "epoch": 2.27, + "grad_norm": 10.068906784057617, + "learning_rate": 4.839129998997695e-06, + "loss": 0.8741, + "step": 7564 + }, + { + "epoch": 2.27, + "grad_norm": 16.215852737426758, + "learning_rate": 4.837125388393305e-06, + "loss": 2.1497, + "step": 7565 + }, + { + "epoch": 2.27, + "grad_norm": 11.731410026550293, + "learning_rate": 4.835120777788915e-06, + "loss": 1.7863, + "step": 7566 + }, + { + "epoch": 2.28, + "grad_norm": 10.969109535217285, + "learning_rate": 4.833116167184525e-06, + "loss": 0.8401, + "step": 7567 + }, + { + "epoch": 2.28, + "grad_norm": 10.927252769470215, + "learning_rate": 4.831111556580135e-06, + "loss": 1.4045, + "step": 7568 + }, + { + "epoch": 2.28, + "grad_norm": 41.4652214050293, + "learning_rate": 4.829106945975745e-06, + "loss": 3.292, + "step": 7569 + }, + { + "epoch": 2.28, + "grad_norm": 8.78602123260498, + "learning_rate": 4.827102335371355e-06, + "loss": 1.6186, + "step": 7570 + }, + { + "epoch": 2.28, + "grad_norm": 30.659353256225586, + "learning_rate": 4.825097724766964e-06, + "loss": 1.3395, + "step": 7571 + }, + { + "epoch": 2.28, + "grad_norm": 9.962449073791504, + "learning_rate": 4.823093114162574e-06, + "loss": 1.2939, + "step": 7572 + }, + { + "epoch": 2.28, + "grad_norm": 10.435197830200195, + "learning_rate": 4.821088503558184e-06, + "loss": 1.0703, + "step": 7573 + }, + { + "epoch": 2.28, + "grad_norm": 20.504138946533203, + "learning_rate": 4.819083892953794e-06, + "loss": 1.5566, + "step": 7574 + }, + { + "epoch": 2.28, + "grad_norm": 15.632285118103027, + "learning_rate": 4.817079282349404e-06, + "loss": 1.2294, + "step": 7575 + }, + { + "epoch": 2.28, + "grad_norm": 15.574999809265137, + "learning_rate": 4.815074671745015e-06, + "loss": 1.4429, + "step": 7576 + }, + { + "epoch": 2.28, + "grad_norm": 10.924356460571289, + "learning_rate": 4.813070061140624e-06, + "loss": 1.0845, + "step": 7577 + }, + { + "epoch": 2.28, + "grad_norm": 24.5418758392334, + "learning_rate": 4.811065450536234e-06, + "loss": 1.175, + "step": 7578 + }, + { + "epoch": 2.28, + "grad_norm": 29.236740112304688, + "learning_rate": 4.809060839931843e-06, + "loss": 0.9208, + "step": 7579 + }, + { + "epoch": 2.28, + "grad_norm": 11.928509712219238, + "learning_rate": 4.807056229327453e-06, + "loss": 0.7309, + "step": 7580 + }, + { + "epoch": 2.28, + "grad_norm": 31.78061294555664, + "learning_rate": 4.805051618723063e-06, + "loss": 1.4187, + "step": 7581 + }, + { + "epoch": 2.28, + "grad_norm": 18.418073654174805, + "learning_rate": 4.803047008118673e-06, + "loss": 2.4135, + "step": 7582 + }, + { + "epoch": 2.28, + "grad_norm": 27.633852005004883, + "learning_rate": 4.8010423975142835e-06, + "loss": 1.7401, + "step": 7583 + }, + { + "epoch": 2.28, + "grad_norm": 48.656333923339844, + "learning_rate": 4.799037786909893e-06, + "loss": 1.6712, + "step": 7584 + }, + { + "epoch": 2.28, + "grad_norm": 15.722077369689941, + "learning_rate": 4.797033176305503e-06, + "loss": 1.2471, + "step": 7585 + }, + { + "epoch": 2.28, + "grad_norm": 8.848787307739258, + "learning_rate": 4.795028565701113e-06, + "loss": 0.993, + "step": 7586 + }, + { + "epoch": 2.28, + "grad_norm": 15.616233825683594, + "learning_rate": 4.793023955096723e-06, + "loss": 1.3871, + "step": 7587 + }, + { + "epoch": 2.28, + "grad_norm": 24.031694412231445, + "learning_rate": 4.791019344492332e-06, + "loss": 1.2137, + "step": 7588 + }, + { + "epoch": 2.28, + "grad_norm": 11.082610130310059, + "learning_rate": 4.789014733887943e-06, + "loss": 1.1946, + "step": 7589 + }, + { + "epoch": 2.28, + "grad_norm": 18.813764572143555, + "learning_rate": 4.787010123283552e-06, + "loss": 1.1478, + "step": 7590 + }, + { + "epoch": 2.28, + "grad_norm": 20.931020736694336, + "learning_rate": 4.7850055126791625e-06, + "loss": 1.3695, + "step": 7591 + }, + { + "epoch": 2.28, + "grad_norm": 15.224300384521484, + "learning_rate": 4.7830009020747726e-06, + "loss": 1.0471, + "step": 7592 + }, + { + "epoch": 2.28, + "grad_norm": 13.662450790405273, + "learning_rate": 4.780996291470383e-06, + "loss": 1.0492, + "step": 7593 + }, + { + "epoch": 2.28, + "grad_norm": 11.05825424194336, + "learning_rate": 4.778991680865992e-06, + "loss": 1.4997, + "step": 7594 + }, + { + "epoch": 2.28, + "grad_norm": 14.814659118652344, + "learning_rate": 4.776987070261602e-06, + "loss": 1.5903, + "step": 7595 + }, + { + "epoch": 2.28, + "grad_norm": 20.72893524169922, + "learning_rate": 4.774982459657212e-06, + "loss": 0.9354, + "step": 7596 + }, + { + "epoch": 2.28, + "grad_norm": 16.968223571777344, + "learning_rate": 4.772977849052821e-06, + "loss": 1.7869, + "step": 7597 + }, + { + "epoch": 2.28, + "grad_norm": 12.797143936157227, + "learning_rate": 4.770973238448432e-06, + "loss": 1.4657, + "step": 7598 + }, + { + "epoch": 2.28, + "grad_norm": 278.7440185546875, + "learning_rate": 4.7689686278440414e-06, + "loss": 1.1522, + "step": 7599 + }, + { + "epoch": 2.29, + "grad_norm": 11.309410095214844, + "learning_rate": 4.7669640172396515e-06, + "loss": 1.6979, + "step": 7600 + }, + { + "epoch": 2.29, + "grad_norm": 17.630992889404297, + "learning_rate": 4.764959406635262e-06, + "loss": 1.2774, + "step": 7601 + }, + { + "epoch": 2.29, + "grad_norm": 64.68533325195312, + "learning_rate": 4.762954796030872e-06, + "loss": 1.6083, + "step": 7602 + }, + { + "epoch": 2.29, + "grad_norm": 12.072052001953125, + "learning_rate": 4.760950185426481e-06, + "loss": 0.8717, + "step": 7603 + }, + { + "epoch": 2.29, + "grad_norm": 13.834127426147461, + "learning_rate": 4.758945574822091e-06, + "loss": 1.385, + "step": 7604 + }, + { + "epoch": 2.29, + "grad_norm": 18.952781677246094, + "learning_rate": 4.756940964217701e-06, + "loss": 1.2853, + "step": 7605 + }, + { + "epoch": 2.29, + "grad_norm": 18.030406951904297, + "learning_rate": 4.754936353613311e-06, + "loss": 1.6859, + "step": 7606 + }, + { + "epoch": 2.29, + "grad_norm": 35.28142166137695, + "learning_rate": 4.752931743008921e-06, + "loss": 2.1156, + "step": 7607 + }, + { + "epoch": 2.29, + "grad_norm": 28.663076400756836, + "learning_rate": 4.7509271324045305e-06, + "loss": 2.1559, + "step": 7608 + }, + { + "epoch": 2.29, + "grad_norm": 11.263956069946289, + "learning_rate": 4.748922521800141e-06, + "loss": 0.9984, + "step": 7609 + }, + { + "epoch": 2.29, + "grad_norm": 9.154500007629395, + "learning_rate": 4.746917911195751e-06, + "loss": 1.3324, + "step": 7610 + }, + { + "epoch": 2.29, + "grad_norm": 18.66733169555664, + "learning_rate": 4.744913300591361e-06, + "loss": 0.6866, + "step": 7611 + }, + { + "epoch": 2.29, + "grad_norm": 22.89703369140625, + "learning_rate": 4.74290868998697e-06, + "loss": 2.2213, + "step": 7612 + }, + { + "epoch": 2.29, + "grad_norm": 9.362771987915039, + "learning_rate": 4.740904079382581e-06, + "loss": 1.069, + "step": 7613 + }, + { + "epoch": 2.29, + "grad_norm": 100.19771575927734, + "learning_rate": 4.73889946877819e-06, + "loss": 1.1066, + "step": 7614 + }, + { + "epoch": 2.29, + "grad_norm": 12.872753143310547, + "learning_rate": 4.7368948581738e-06, + "loss": 1.3306, + "step": 7615 + }, + { + "epoch": 2.29, + "grad_norm": 13.48827075958252, + "learning_rate": 4.73489024756941e-06, + "loss": 0.8473, + "step": 7616 + }, + { + "epoch": 2.29, + "grad_norm": 8.565572738647461, + "learning_rate": 4.7328856369650204e-06, + "loss": 0.7963, + "step": 7617 + }, + { + "epoch": 2.29, + "grad_norm": 16.844417572021484, + "learning_rate": 4.73088102636063e-06, + "loss": 0.9599, + "step": 7618 + }, + { + "epoch": 2.29, + "grad_norm": 16.240650177001953, + "learning_rate": 4.72887641575624e-06, + "loss": 1.0792, + "step": 7619 + }, + { + "epoch": 2.29, + "grad_norm": 13.393976211547852, + "learning_rate": 4.72687180515185e-06, + "loss": 1.1918, + "step": 7620 + }, + { + "epoch": 2.29, + "grad_norm": 27.26254653930664, + "learning_rate": 4.724867194547459e-06, + "loss": 1.5554, + "step": 7621 + }, + { + "epoch": 2.29, + "grad_norm": 19.264162063598633, + "learning_rate": 4.722862583943069e-06, + "loss": 1.0877, + "step": 7622 + }, + { + "epoch": 2.29, + "grad_norm": 14.326976776123047, + "learning_rate": 4.720857973338679e-06, + "loss": 1.4076, + "step": 7623 + }, + { + "epoch": 2.29, + "grad_norm": 15.193740844726562, + "learning_rate": 4.718853362734289e-06, + "loss": 1.6355, + "step": 7624 + }, + { + "epoch": 2.29, + "grad_norm": 9.33708381652832, + "learning_rate": 4.7168487521298986e-06, + "loss": 1.4984, + "step": 7625 + }, + { + "epoch": 2.29, + "grad_norm": 84.12486267089844, + "learning_rate": 4.7148441415255095e-06, + "loss": 1.9932, + "step": 7626 + }, + { + "epoch": 2.29, + "grad_norm": 21.830821990966797, + "learning_rate": 4.712839530921119e-06, + "loss": 1.4148, + "step": 7627 + }, + { + "epoch": 2.29, + "grad_norm": 11.36988353729248, + "learning_rate": 4.710834920316729e-06, + "loss": 1.3706, + "step": 7628 + }, + { + "epoch": 2.29, + "grad_norm": 4.701085567474365, + "learning_rate": 4.708830309712339e-06, + "loss": 0.4887, + "step": 7629 + }, + { + "epoch": 2.29, + "grad_norm": 40.99164962768555, + "learning_rate": 4.706825699107949e-06, + "loss": 3.0745, + "step": 7630 + }, + { + "epoch": 2.29, + "grad_norm": 46.76643753051758, + "learning_rate": 4.704821088503558e-06, + "loss": 1.5348, + "step": 7631 + }, + { + "epoch": 2.29, + "grad_norm": 24.31828498840332, + "learning_rate": 4.702816477899168e-06, + "loss": 1.1564, + "step": 7632 + }, + { + "epoch": 2.29, + "grad_norm": 32.74114227294922, + "learning_rate": 4.700811867294778e-06, + "loss": 0.9132, + "step": 7633 + }, + { + "epoch": 2.3, + "grad_norm": 36.050968170166016, + "learning_rate": 4.6988072566903885e-06, + "loss": 1.0512, + "step": 7634 + }, + { + "epoch": 2.3, + "grad_norm": 11.422075271606445, + "learning_rate": 4.6968026460859986e-06, + "loss": 1.2059, + "step": 7635 + }, + { + "epoch": 2.3, + "grad_norm": 35.184654235839844, + "learning_rate": 4.694798035481608e-06, + "loss": 1.4813, + "step": 7636 + }, + { + "epoch": 2.3, + "grad_norm": 45.07157516479492, + "learning_rate": 4.692793424877218e-06, + "loss": 1.2941, + "step": 7637 + }, + { + "epoch": 2.3, + "grad_norm": 37.6901969909668, + "learning_rate": 4.690788814272828e-06, + "loss": 3.1126, + "step": 7638 + }, + { + "epoch": 2.3, + "grad_norm": 21.26262664794922, + "learning_rate": 4.688784203668438e-06, + "loss": 1.3052, + "step": 7639 + }, + { + "epoch": 2.3, + "grad_norm": 26.654037475585938, + "learning_rate": 4.686779593064047e-06, + "loss": 1.6718, + "step": 7640 + }, + { + "epoch": 2.3, + "grad_norm": 19.267364501953125, + "learning_rate": 4.684774982459657e-06, + "loss": 1.1093, + "step": 7641 + }, + { + "epoch": 2.3, + "grad_norm": 11.072561264038086, + "learning_rate": 4.6827703718552674e-06, + "loss": 1.1142, + "step": 7642 + }, + { + "epoch": 2.3, + "grad_norm": 17.212007522583008, + "learning_rate": 4.6807657612508775e-06, + "loss": 1.1688, + "step": 7643 + }, + { + "epoch": 2.3, + "grad_norm": 14.088850021362305, + "learning_rate": 4.678761150646488e-06, + "loss": 1.0786, + "step": 7644 + }, + { + "epoch": 2.3, + "grad_norm": 8.411274909973145, + "learning_rate": 4.676756540042097e-06, + "loss": 1.6672, + "step": 7645 + }, + { + "epoch": 2.3, + "grad_norm": 10.209150314331055, + "learning_rate": 4.674751929437707e-06, + "loss": 0.6725, + "step": 7646 + }, + { + "epoch": 2.3, + "grad_norm": 8.190345764160156, + "learning_rate": 4.672747318833317e-06, + "loss": 0.7018, + "step": 7647 + }, + { + "epoch": 2.3, + "grad_norm": 17.66701316833496, + "learning_rate": 4.670742708228927e-06, + "loss": 1.9426, + "step": 7648 + }, + { + "epoch": 2.3, + "grad_norm": 29.10671043395996, + "learning_rate": 4.668738097624536e-06, + "loss": 1.8796, + "step": 7649 + }, + { + "epoch": 2.3, + "grad_norm": 10.14902114868164, + "learning_rate": 4.666733487020147e-06, + "loss": 0.9493, + "step": 7650 + }, + { + "epoch": 2.3, + "grad_norm": 21.747249603271484, + "learning_rate": 4.6647288764157565e-06, + "loss": 0.8706, + "step": 7651 + }, + { + "epoch": 2.3, + "grad_norm": 52.81174087524414, + "learning_rate": 4.662724265811367e-06, + "loss": 2.5637, + "step": 7652 + }, + { + "epoch": 2.3, + "grad_norm": 21.7281494140625, + "learning_rate": 4.660719655206977e-06, + "loss": 1.5396, + "step": 7653 + }, + { + "epoch": 2.3, + "grad_norm": 9.083329200744629, + "learning_rate": 4.658715044602587e-06, + "loss": 0.701, + "step": 7654 + }, + { + "epoch": 2.3, + "grad_norm": 8.001774787902832, + "learning_rate": 4.656710433998196e-06, + "loss": 0.9427, + "step": 7655 + }, + { + "epoch": 2.3, + "grad_norm": 8.815278053283691, + "learning_rate": 4.654705823393806e-06, + "loss": 1.6543, + "step": 7656 + }, + { + "epoch": 2.3, + "grad_norm": 37.267616271972656, + "learning_rate": 4.652701212789416e-06, + "loss": 0.9176, + "step": 7657 + }, + { + "epoch": 2.3, + "grad_norm": 32.767486572265625, + "learning_rate": 4.650696602185025e-06, + "loss": 1.6066, + "step": 7658 + }, + { + "epoch": 2.3, + "grad_norm": 15.666519165039062, + "learning_rate": 4.648691991580636e-06, + "loss": 1.3438, + "step": 7659 + }, + { + "epoch": 2.3, + "grad_norm": 31.18556785583496, + "learning_rate": 4.646687380976246e-06, + "loss": 2.5968, + "step": 7660 + }, + { + "epoch": 2.3, + "grad_norm": 21.61691665649414, + "learning_rate": 4.644682770371856e-06, + "loss": 1.7081, + "step": 7661 + }, + { + "epoch": 2.3, + "grad_norm": 6.294021129608154, + "learning_rate": 4.642678159767465e-06, + "loss": 0.5644, + "step": 7662 + }, + { + "epoch": 2.3, + "grad_norm": 17.326904296875, + "learning_rate": 4.640673549163076e-06, + "loss": 1.2007, + "step": 7663 + }, + { + "epoch": 2.3, + "grad_norm": 7.6418986320495605, + "learning_rate": 4.638668938558685e-06, + "loss": 1.5581, + "step": 7664 + }, + { + "epoch": 2.3, + "grad_norm": 36.48931121826172, + "learning_rate": 4.636664327954295e-06, + "loss": 1.8396, + "step": 7665 + }, + { + "epoch": 2.3, + "grad_norm": 12.859515190124512, + "learning_rate": 4.634659717349905e-06, + "loss": 1.6651, + "step": 7666 + }, + { + "epoch": 2.31, + "grad_norm": 19.518774032592773, + "learning_rate": 4.632655106745515e-06, + "loss": 1.0046, + "step": 7667 + }, + { + "epoch": 2.31, + "grad_norm": 17.942481994628906, + "learning_rate": 4.6306504961411246e-06, + "loss": 1.7626, + "step": 7668 + }, + { + "epoch": 2.31, + "grad_norm": 17.79322624206543, + "learning_rate": 4.628645885536735e-06, + "loss": 0.8779, + "step": 7669 + }, + { + "epoch": 2.31, + "grad_norm": 17.742807388305664, + "learning_rate": 4.626641274932345e-06, + "loss": 0.9714, + "step": 7670 + }, + { + "epoch": 2.31, + "grad_norm": 14.361837387084961, + "learning_rate": 4.624636664327955e-06, + "loss": 1.1075, + "step": 7671 + }, + { + "epoch": 2.31, + "grad_norm": 19.260719299316406, + "learning_rate": 4.622632053723565e-06, + "loss": 2.0738, + "step": 7672 + }, + { + "epoch": 2.31, + "grad_norm": 19.078481674194336, + "learning_rate": 4.620627443119174e-06, + "loss": 1.3898, + "step": 7673 + }, + { + "epoch": 2.31, + "grad_norm": 11.892518043518066, + "learning_rate": 4.618622832514784e-06, + "loss": 1.4389, + "step": 7674 + }, + { + "epoch": 2.31, + "grad_norm": 37.109336853027344, + "learning_rate": 4.616618221910394e-06, + "loss": 1.5463, + "step": 7675 + }, + { + "epoch": 2.31, + "grad_norm": 35.61430358886719, + "learning_rate": 4.614613611306004e-06, + "loss": 1.2612, + "step": 7676 + }, + { + "epoch": 2.31, + "grad_norm": 10.030492782592773, + "learning_rate": 4.612609000701614e-06, + "loss": 1.0428, + "step": 7677 + }, + { + "epoch": 2.31, + "grad_norm": 32.46242141723633, + "learning_rate": 4.6106043900972246e-06, + "loss": 1.2366, + "step": 7678 + }, + { + "epoch": 2.31, + "grad_norm": 63.76512908935547, + "learning_rate": 4.608599779492834e-06, + "loss": 1.8638, + "step": 7679 + }, + { + "epoch": 2.31, + "grad_norm": 21.762617111206055, + "learning_rate": 4.606595168888444e-06, + "loss": 1.7875, + "step": 7680 + }, + { + "epoch": 2.31, + "eval_loss": 0.16951656341552734, + "eval_runtime": 43.7895, + "eval_samples_per_second": 33.775, + "eval_steps_per_second": 33.775, + "step": 7680 + }, + { + "epoch": 2.31, + "grad_norm": 10.09822940826416, + "learning_rate": 4.604590558284054e-06, + "loss": 1.1142, + "step": 7681 + }, + { + "epoch": 2.31, + "grad_norm": 10.086823463439941, + "learning_rate": 4.602585947679663e-06, + "loss": 0.8318, + "step": 7682 + }, + { + "epoch": 2.31, + "grad_norm": 11.713811874389648, + "learning_rate": 4.600581337075273e-06, + "loss": 0.9897, + "step": 7683 + }, + { + "epoch": 2.31, + "grad_norm": 20.750566482543945, + "learning_rate": 4.598576726470883e-06, + "loss": 1.5894, + "step": 7684 + }, + { + "epoch": 2.31, + "grad_norm": 17.660737991333008, + "learning_rate": 4.5965721158664935e-06, + "loss": 1.4457, + "step": 7685 + }, + { + "epoch": 2.31, + "grad_norm": 11.275466918945312, + "learning_rate": 4.594567505262103e-06, + "loss": 1.0097, + "step": 7686 + }, + { + "epoch": 2.31, + "grad_norm": 9.963224411010742, + "learning_rate": 4.592562894657714e-06, + "loss": 1.7383, + "step": 7687 + }, + { + "epoch": 2.31, + "grad_norm": 5.169731140136719, + "learning_rate": 4.590558284053323e-06, + "loss": 0.5528, + "step": 7688 + }, + { + "epoch": 2.31, + "grad_norm": 10.396340370178223, + "learning_rate": 4.588553673448933e-06, + "loss": 0.967, + "step": 7689 + }, + { + "epoch": 2.31, + "grad_norm": 15.52597427368164, + "learning_rate": 4.586549062844543e-06, + "loss": 0.9266, + "step": 7690 + }, + { + "epoch": 2.31, + "grad_norm": 12.798748016357422, + "learning_rate": 4.584544452240153e-06, + "loss": 1.288, + "step": 7691 + }, + { + "epoch": 2.31, + "grad_norm": 21.744783401489258, + "learning_rate": 4.582539841635762e-06, + "loss": 1.3389, + "step": 7692 + }, + { + "epoch": 2.31, + "grad_norm": 7.416219711303711, + "learning_rate": 4.5805352310313724e-06, + "loss": 0.9147, + "step": 7693 + }, + { + "epoch": 2.31, + "grad_norm": 27.035898208618164, + "learning_rate": 4.5785306204269825e-06, + "loss": 2.5926, + "step": 7694 + }, + { + "epoch": 2.31, + "grad_norm": 18.482648849487305, + "learning_rate": 4.576526009822593e-06, + "loss": 2.276, + "step": 7695 + }, + { + "epoch": 2.31, + "grad_norm": 52.933494567871094, + "learning_rate": 4.574521399218203e-06, + "loss": 2.2703, + "step": 7696 + }, + { + "epoch": 2.31, + "grad_norm": 11.743091583251953, + "learning_rate": 4.572516788613812e-06, + "loss": 0.9347, + "step": 7697 + }, + { + "epoch": 2.31, + "grad_norm": 16.696788787841797, + "learning_rate": 4.570512178009422e-06, + "loss": 1.3391, + "step": 7698 + }, + { + "epoch": 2.31, + "grad_norm": 23.81439971923828, + "learning_rate": 4.568507567405032e-06, + "loss": 2.2063, + "step": 7699 + }, + { + "epoch": 2.32, + "grad_norm": 11.924534797668457, + "learning_rate": 4.566502956800642e-06, + "loss": 1.0567, + "step": 7700 + }, + { + "epoch": 2.32, + "grad_norm": 15.285096168518066, + "learning_rate": 4.564498346196251e-06, + "loss": 1.6988, + "step": 7701 + }, + { + "epoch": 2.32, + "grad_norm": 14.276802062988281, + "learning_rate": 4.562493735591862e-06, + "loss": 0.7809, + "step": 7702 + }, + { + "epoch": 2.32, + "grad_norm": 19.88568687438965, + "learning_rate": 4.560489124987472e-06, + "loss": 1.8969, + "step": 7703 + }, + { + "epoch": 2.32, + "grad_norm": 22.174179077148438, + "learning_rate": 4.558484514383082e-06, + "loss": 1.4157, + "step": 7704 + }, + { + "epoch": 2.32, + "grad_norm": 26.93759536743164, + "learning_rate": 4.556479903778691e-06, + "loss": 0.7634, + "step": 7705 + }, + { + "epoch": 2.32, + "grad_norm": 14.25698184967041, + "learning_rate": 4.554475293174301e-06, + "loss": 0.9706, + "step": 7706 + }, + { + "epoch": 2.32, + "grad_norm": 24.950761795043945, + "learning_rate": 4.552470682569911e-06, + "loss": 1.938, + "step": 7707 + }, + { + "epoch": 2.32, + "grad_norm": 7.129110336303711, + "learning_rate": 4.550466071965521e-06, + "loss": 0.538, + "step": 7708 + }, + { + "epoch": 2.32, + "grad_norm": 10.459171295166016, + "learning_rate": 4.548461461361131e-06, + "loss": 1.3554, + "step": 7709 + }, + { + "epoch": 2.32, + "grad_norm": 26.577316284179688, + "learning_rate": 4.5464568507567405e-06, + "loss": 2.1107, + "step": 7710 + }, + { + "epoch": 2.32, + "grad_norm": 87.70563507080078, + "learning_rate": 4.5444522401523506e-06, + "loss": 1.3952, + "step": 7711 + }, + { + "epoch": 2.32, + "grad_norm": 10.34211254119873, + "learning_rate": 4.542447629547961e-06, + "loss": 0.7507, + "step": 7712 + }, + { + "epoch": 2.32, + "grad_norm": 33.580265045166016, + "learning_rate": 4.540443018943571e-06, + "loss": 1.8236, + "step": 7713 + }, + { + "epoch": 2.32, + "grad_norm": 13.009629249572754, + "learning_rate": 4.53843840833918e-06, + "loss": 1.2249, + "step": 7714 + }, + { + "epoch": 2.32, + "grad_norm": 8.66175365447998, + "learning_rate": 4.536433797734791e-06, + "loss": 0.9531, + "step": 7715 + }, + { + "epoch": 2.32, + "grad_norm": 17.59461784362793, + "learning_rate": 4.5344291871304e-06, + "loss": 1.6012, + "step": 7716 + }, + { + "epoch": 2.32, + "grad_norm": 26.786701202392578, + "learning_rate": 4.53242457652601e-06, + "loss": 1.3363, + "step": 7717 + }, + { + "epoch": 2.32, + "grad_norm": 10.355632781982422, + "learning_rate": 4.53041996592162e-06, + "loss": 0.8374, + "step": 7718 + }, + { + "epoch": 2.32, + "grad_norm": 13.772407531738281, + "learning_rate": 4.52841535531723e-06, + "loss": 1.9609, + "step": 7719 + }, + { + "epoch": 2.32, + "grad_norm": 10.235556602478027, + "learning_rate": 4.52641074471284e-06, + "loss": 1.4712, + "step": 7720 + }, + { + "epoch": 2.32, + "grad_norm": 37.70382308959961, + "learning_rate": 4.52440613410845e-06, + "loss": 1.8212, + "step": 7721 + }, + { + "epoch": 2.32, + "grad_norm": 16.85622215270996, + "learning_rate": 4.52240152350406e-06, + "loss": 1.3294, + "step": 7722 + }, + { + "epoch": 2.32, + "grad_norm": 13.376880645751953, + "learning_rate": 4.520396912899669e-06, + "loss": 0.8711, + "step": 7723 + }, + { + "epoch": 2.32, + "grad_norm": 24.79469871520996, + "learning_rate": 4.51839230229528e-06, + "loss": 1.125, + "step": 7724 + }, + { + "epoch": 2.32, + "grad_norm": 21.333232879638672, + "learning_rate": 4.516387691690889e-06, + "loss": 1.7883, + "step": 7725 + }, + { + "epoch": 2.32, + "grad_norm": 11.4044771194458, + "learning_rate": 4.514383081086499e-06, + "loss": 1.1805, + "step": 7726 + }, + { + "epoch": 2.32, + "grad_norm": 51.108253479003906, + "learning_rate": 4.512378470482109e-06, + "loss": 1.9263, + "step": 7727 + }, + { + "epoch": 2.32, + "grad_norm": 21.30112075805664, + "learning_rate": 4.5103738598777195e-06, + "loss": 1.3295, + "step": 7728 + }, + { + "epoch": 2.32, + "grad_norm": 26.95627784729004, + "learning_rate": 4.508369249273329e-06, + "loss": 2.7558, + "step": 7729 + }, + { + "epoch": 2.32, + "grad_norm": 13.784543991088867, + "learning_rate": 4.506364638668939e-06, + "loss": 2.2716, + "step": 7730 + }, + { + "epoch": 2.32, + "grad_norm": 55.046142578125, + "learning_rate": 4.504360028064549e-06, + "loss": 1.819, + "step": 7731 + }, + { + "epoch": 2.32, + "grad_norm": 10.930252075195312, + "learning_rate": 4.502355417460159e-06, + "loss": 0.8165, + "step": 7732 + }, + { + "epoch": 2.33, + "grad_norm": 23.520112991333008, + "learning_rate": 4.500350806855769e-06, + "loss": 0.9347, + "step": 7733 + }, + { + "epoch": 2.33, + "grad_norm": 8.282185554504395, + "learning_rate": 4.498346196251378e-06, + "loss": 1.1021, + "step": 7734 + }, + { + "epoch": 2.33, + "grad_norm": 15.41140365600586, + "learning_rate": 4.496341585646988e-06, + "loss": 1.5547, + "step": 7735 + }, + { + "epoch": 2.33, + "grad_norm": 26.10579490661621, + "learning_rate": 4.4943369750425984e-06, + "loss": 1.77, + "step": 7736 + }, + { + "epoch": 2.33, + "grad_norm": 11.937204360961914, + "learning_rate": 4.4923323644382085e-06, + "loss": 1.0362, + "step": 7737 + }, + { + "epoch": 2.33, + "grad_norm": 21.467954635620117, + "learning_rate": 4.490327753833818e-06, + "loss": 1.802, + "step": 7738 + }, + { + "epoch": 2.33, + "grad_norm": 17.1977596282959, + "learning_rate": 4.488323143229429e-06, + "loss": 1.2613, + "step": 7739 + }, + { + "epoch": 2.33, + "grad_norm": 11.602217674255371, + "learning_rate": 4.486318532625038e-06, + "loss": 1.5562, + "step": 7740 + }, + { + "epoch": 2.33, + "grad_norm": 50.02082061767578, + "learning_rate": 4.484313922020648e-06, + "loss": 1.2008, + "step": 7741 + }, + { + "epoch": 2.33, + "grad_norm": 16.229761123657227, + "learning_rate": 4.482309311416258e-06, + "loss": 1.5572, + "step": 7742 + }, + { + "epoch": 2.33, + "grad_norm": 16.347227096557617, + "learning_rate": 4.480304700811867e-06, + "loss": 1.4817, + "step": 7743 + }, + { + "epoch": 2.33, + "grad_norm": 9.144948959350586, + "learning_rate": 4.478300090207477e-06, + "loss": 0.6118, + "step": 7744 + }, + { + "epoch": 2.33, + "grad_norm": 45.46603775024414, + "learning_rate": 4.4762954796030875e-06, + "loss": 1.9398, + "step": 7745 + }, + { + "epoch": 2.33, + "grad_norm": 10.924186706542969, + "learning_rate": 4.474290868998698e-06, + "loss": 1.4317, + "step": 7746 + }, + { + "epoch": 2.33, + "grad_norm": 39.43347930908203, + "learning_rate": 4.472286258394307e-06, + "loss": 2.4369, + "step": 7747 + }, + { + "epoch": 2.33, + "grad_norm": 124.04064178466797, + "learning_rate": 4.470281647789917e-06, + "loss": 3.3272, + "step": 7748 + }, + { + "epoch": 2.33, + "grad_norm": 6.773767471313477, + "learning_rate": 4.468277037185527e-06, + "loss": 0.5183, + "step": 7749 + }, + { + "epoch": 2.33, + "grad_norm": 21.174972534179688, + "learning_rate": 4.466272426581137e-06, + "loss": 2.1152, + "step": 7750 + }, + { + "epoch": 2.33, + "grad_norm": 15.640390396118164, + "learning_rate": 4.464267815976746e-06, + "loss": 1.0071, + "step": 7751 + }, + { + "epoch": 2.33, + "grad_norm": 11.41567611694336, + "learning_rate": 4.462263205372357e-06, + "loss": 0.9003, + "step": 7752 + }, + { + "epoch": 2.33, + "grad_norm": 13.2136869430542, + "learning_rate": 4.4602585947679665e-06, + "loss": 1.4521, + "step": 7753 + }, + { + "epoch": 2.33, + "grad_norm": 24.674755096435547, + "learning_rate": 4.4582539841635766e-06, + "loss": 1.1701, + "step": 7754 + }, + { + "epoch": 2.33, + "grad_norm": 27.077539443969727, + "learning_rate": 4.456249373559187e-06, + "loss": 1.3348, + "step": 7755 + }, + { + "epoch": 2.33, + "grad_norm": 15.865095138549805, + "learning_rate": 4.454244762954797e-06, + "loss": 0.9423, + "step": 7756 + }, + { + "epoch": 2.33, + "grad_norm": 10.586139678955078, + "learning_rate": 4.452240152350406e-06, + "loss": 0.9348, + "step": 7757 + }, + { + "epoch": 2.33, + "grad_norm": 14.161200523376465, + "learning_rate": 4.450235541746016e-06, + "loss": 1.5252, + "step": 7758 + }, + { + "epoch": 2.33, + "grad_norm": 23.082622528076172, + "learning_rate": 4.448230931141626e-06, + "loss": 1.3711, + "step": 7759 + }, + { + "epoch": 2.33, + "grad_norm": 19.711462020874023, + "learning_rate": 4.446226320537235e-06, + "loss": 2.655, + "step": 7760 + }, + { + "epoch": 2.33, + "grad_norm": 24.49384880065918, + "learning_rate": 4.444221709932846e-06, + "loss": 1.0757, + "step": 7761 + }, + { + "epoch": 2.33, + "grad_norm": 9.763320922851562, + "learning_rate": 4.4422170993284555e-06, + "loss": 0.9065, + "step": 7762 + }, + { + "epoch": 2.33, + "grad_norm": 8.923637390136719, + "learning_rate": 4.440212488724066e-06, + "loss": 1.1599, + "step": 7763 + }, + { + "epoch": 2.33, + "grad_norm": 193.84384155273438, + "learning_rate": 4.438207878119676e-06, + "loss": 1.5881, + "step": 7764 + }, + { + "epoch": 2.33, + "grad_norm": 4.945940971374512, + "learning_rate": 4.436203267515286e-06, + "loss": 0.698, + "step": 7765 + }, + { + "epoch": 2.33, + "grad_norm": 8.121231079101562, + "learning_rate": 4.434198656910895e-06, + "loss": 0.9452, + "step": 7766 + }, + { + "epoch": 2.34, + "grad_norm": 28.268239974975586, + "learning_rate": 4.432194046306505e-06, + "loss": 1.5929, + "step": 7767 + }, + { + "epoch": 2.34, + "grad_norm": 70.76728057861328, + "learning_rate": 4.430189435702115e-06, + "loss": 1.6534, + "step": 7768 + }, + { + "epoch": 2.34, + "grad_norm": 23.936372756958008, + "learning_rate": 4.428184825097725e-06, + "loss": 1.2137, + "step": 7769 + }, + { + "epoch": 2.34, + "grad_norm": 9.887136459350586, + "learning_rate": 4.426180214493335e-06, + "loss": 0.8383, + "step": 7770 + }, + { + "epoch": 2.34, + "grad_norm": 14.792794227600098, + "learning_rate": 4.424175603888945e-06, + "loss": 0.9405, + "step": 7771 + }, + { + "epoch": 2.34, + "grad_norm": 17.71567726135254, + "learning_rate": 4.422170993284555e-06, + "loss": 1.063, + "step": 7772 + }, + { + "epoch": 2.34, + "grad_norm": 17.9732608795166, + "learning_rate": 4.420166382680165e-06, + "loss": 1.4261, + "step": 7773 + }, + { + "epoch": 2.34, + "grad_norm": 25.94655418395996, + "learning_rate": 4.418161772075775e-06, + "loss": 2.0324, + "step": 7774 + }, + { + "epoch": 2.34, + "grad_norm": 39.89909744262695, + "learning_rate": 4.416157161471384e-06, + "loss": 1.7023, + "step": 7775 + }, + { + "epoch": 2.34, + "grad_norm": 23.909780502319336, + "learning_rate": 4.414152550866995e-06, + "loss": 0.9315, + "step": 7776 + }, + { + "epoch": 2.34, + "grad_norm": 13.213065147399902, + "learning_rate": 4.412147940262604e-06, + "loss": 0.9687, + "step": 7777 + }, + { + "epoch": 2.34, + "grad_norm": 45.419342041015625, + "learning_rate": 4.410143329658214e-06, + "loss": 1.2235, + "step": 7778 + }, + { + "epoch": 2.34, + "grad_norm": 18.985164642333984, + "learning_rate": 4.4081387190538244e-06, + "loss": 1.544, + "step": 7779 + }, + { + "epoch": 2.34, + "grad_norm": 9.602522850036621, + "learning_rate": 4.4061341084494345e-06, + "loss": 0.7917, + "step": 7780 + }, + { + "epoch": 2.34, + "grad_norm": 38.75558090209961, + "learning_rate": 4.404129497845044e-06, + "loss": 1.5868, + "step": 7781 + }, + { + "epoch": 2.34, + "grad_norm": 51.216766357421875, + "learning_rate": 4.402124887240654e-06, + "loss": 1.2511, + "step": 7782 + }, + { + "epoch": 2.34, + "grad_norm": 9.934971809387207, + "learning_rate": 4.400120276636264e-06, + "loss": 1.18, + "step": 7783 + }, + { + "epoch": 2.34, + "grad_norm": 10.260887145996094, + "learning_rate": 4.398115666031873e-06, + "loss": 0.5996, + "step": 7784 + }, + { + "epoch": 2.34, + "grad_norm": 26.906917572021484, + "learning_rate": 4.396111055427484e-06, + "loss": 1.635, + "step": 7785 + }, + { + "epoch": 2.34, + "grad_norm": 11.61650562286377, + "learning_rate": 4.394106444823093e-06, + "loss": 0.8312, + "step": 7786 + }, + { + "epoch": 2.34, + "grad_norm": 40.0263786315918, + "learning_rate": 4.392101834218703e-06, + "loss": 0.9066, + "step": 7787 + }, + { + "epoch": 2.34, + "grad_norm": 31.856773376464844, + "learning_rate": 4.390097223614313e-06, + "loss": 1.9564, + "step": 7788 + }, + { + "epoch": 2.34, + "grad_norm": 33.76646423339844, + "learning_rate": 4.388092613009924e-06, + "loss": 1.9249, + "step": 7789 + }, + { + "epoch": 2.34, + "grad_norm": 24.660625457763672, + "learning_rate": 4.386088002405533e-06, + "loss": 2.0123, + "step": 7790 + }, + { + "epoch": 2.34, + "grad_norm": 16.023160934448242, + "learning_rate": 4.384083391801143e-06, + "loss": 2.5144, + "step": 7791 + }, + { + "epoch": 2.34, + "grad_norm": 18.865354537963867, + "learning_rate": 4.382078781196753e-06, + "loss": 1.4033, + "step": 7792 + }, + { + "epoch": 2.34, + "grad_norm": 17.00137710571289, + "learning_rate": 4.380074170592363e-06, + "loss": 1.7571, + "step": 7793 + }, + { + "epoch": 2.34, + "grad_norm": 22.194595336914062, + "learning_rate": 4.378069559987972e-06, + "loss": 1.5826, + "step": 7794 + }, + { + "epoch": 2.34, + "grad_norm": 24.60336685180664, + "learning_rate": 4.376064949383582e-06, + "loss": 1.7541, + "step": 7795 + }, + { + "epoch": 2.34, + "grad_norm": 14.990224838256836, + "learning_rate": 4.3740603387791925e-06, + "loss": 1.1141, + "step": 7796 + }, + { + "epoch": 2.34, + "grad_norm": 17.370750427246094, + "learning_rate": 4.3720557281748026e-06, + "loss": 0.947, + "step": 7797 + }, + { + "epoch": 2.34, + "grad_norm": 32.88360595703125, + "learning_rate": 4.370051117570413e-06, + "loss": 1.035, + "step": 7798 + }, + { + "epoch": 2.34, + "grad_norm": 13.272078514099121, + "learning_rate": 4.368046506966022e-06, + "loss": 0.8005, + "step": 7799 + }, + { + "epoch": 2.35, + "grad_norm": 28.69300651550293, + "learning_rate": 4.366041896361632e-06, + "loss": 1.6131, + "step": 7800 + }, + { + "epoch": 2.35, + "eval_loss": 0.17263449728488922, + "eval_runtime": 43.5025, + "eval_samples_per_second": 33.998, + "eval_steps_per_second": 33.998, + "step": 7800 + }, + { + "epoch": 2.35, + "grad_norm": 21.5859317779541, + "learning_rate": 4.364037285757242e-06, + "loss": 1.961, + "step": 7801 + }, + { + "epoch": 2.35, + "grad_norm": 45.89167785644531, + "learning_rate": 4.362032675152852e-06, + "loss": 0.887, + "step": 7802 + }, + { + "epoch": 2.35, + "grad_norm": 9.14841079711914, + "learning_rate": 4.360028064548461e-06, + "loss": 0.82, + "step": 7803 + }, + { + "epoch": 2.35, + "grad_norm": 13.917359352111816, + "learning_rate": 4.358023453944072e-06, + "loss": 0.6227, + "step": 7804 + }, + { + "epoch": 2.35, + "grad_norm": 10.889512062072754, + "learning_rate": 4.3560188433396815e-06, + "loss": 0.885, + "step": 7805 + }, + { + "epoch": 2.35, + "grad_norm": 11.779256820678711, + "learning_rate": 4.354014232735292e-06, + "loss": 1.6273, + "step": 7806 + }, + { + "epoch": 2.35, + "grad_norm": 9.816779136657715, + "learning_rate": 4.352009622130902e-06, + "loss": 1.2789, + "step": 7807 + }, + { + "epoch": 2.35, + "grad_norm": 10.623241424560547, + "learning_rate": 4.350005011526511e-06, + "loss": 0.888, + "step": 7808 + }, + { + "epoch": 2.35, + "grad_norm": 15.834614753723145, + "learning_rate": 4.348000400922121e-06, + "loss": 0.8899, + "step": 7809 + }, + { + "epoch": 2.35, + "grad_norm": 8.324065208435059, + "learning_rate": 4.345995790317731e-06, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 2.35, + "grad_norm": 17.0432186126709, + "learning_rate": 4.343991179713341e-06, + "loss": 2.3516, + "step": 7811 + }, + { + "epoch": 2.35, + "grad_norm": 20.02086639404297, + "learning_rate": 4.3419865691089504e-06, + "loss": 0.8566, + "step": 7812 + }, + { + "epoch": 2.35, + "grad_norm": 7.930890083312988, + "learning_rate": 4.339981958504561e-06, + "loss": 1.2351, + "step": 7813 + }, + { + "epoch": 2.35, + "grad_norm": 30.14038848876953, + "learning_rate": 4.337977347900171e-06, + "loss": 1.7222, + "step": 7814 + }, + { + "epoch": 2.35, + "grad_norm": 12.645387649536133, + "learning_rate": 4.335972737295781e-06, + "loss": 1.3162, + "step": 7815 + }, + { + "epoch": 2.35, + "grad_norm": 17.415475845336914, + "learning_rate": 4.333968126691391e-06, + "loss": 1.8934, + "step": 7816 + }, + { + "epoch": 2.35, + "grad_norm": 13.79565143585205, + "learning_rate": 4.331963516087001e-06, + "loss": 1.4519, + "step": 7817 + }, + { + "epoch": 2.35, + "grad_norm": 14.080390930175781, + "learning_rate": 4.32995890548261e-06, + "loss": 0.899, + "step": 7818 + }, + { + "epoch": 2.35, + "grad_norm": 71.52656555175781, + "learning_rate": 4.32795429487822e-06, + "loss": 1.7353, + "step": 7819 + }, + { + "epoch": 2.35, + "grad_norm": 20.801849365234375, + "learning_rate": 4.32594968427383e-06, + "loss": 1.2644, + "step": 7820 + }, + { + "epoch": 2.35, + "grad_norm": 40.78676223754883, + "learning_rate": 4.32394507366944e-06, + "loss": 2.8361, + "step": 7821 + }, + { + "epoch": 2.35, + "grad_norm": 14.759114265441895, + "learning_rate": 4.3219404630650504e-06, + "loss": 1.0354, + "step": 7822 + }, + { + "epoch": 2.35, + "grad_norm": 24.163408279418945, + "learning_rate": 4.31993585246066e-06, + "loss": 1.1916, + "step": 7823 + }, + { + "epoch": 2.35, + "grad_norm": 8.51475715637207, + "learning_rate": 4.31793124185627e-06, + "loss": 1.5931, + "step": 7824 + }, + { + "epoch": 2.35, + "grad_norm": 16.245559692382812, + "learning_rate": 4.31592663125188e-06, + "loss": 1.3994, + "step": 7825 + }, + { + "epoch": 2.35, + "grad_norm": 78.04993438720703, + "learning_rate": 4.31392202064749e-06, + "loss": 2.4156, + "step": 7826 + }, + { + "epoch": 2.35, + "grad_norm": 11.318324089050293, + "learning_rate": 4.311917410043099e-06, + "loss": 1.7852, + "step": 7827 + }, + { + "epoch": 2.35, + "grad_norm": 9.104801177978516, + "learning_rate": 4.309912799438709e-06, + "loss": 0.7264, + "step": 7828 + }, + { + "epoch": 2.35, + "grad_norm": 12.194317817687988, + "learning_rate": 4.307908188834319e-06, + "loss": 1.4084, + "step": 7829 + }, + { + "epoch": 2.35, + "grad_norm": 39.487327575683594, + "learning_rate": 4.305903578229929e-06, + "loss": 1.4993, + "step": 7830 + }, + { + "epoch": 2.35, + "grad_norm": 24.5257511138916, + "learning_rate": 4.303898967625539e-06, + "loss": 1.7682, + "step": 7831 + }, + { + "epoch": 2.35, + "grad_norm": 22.789091110229492, + "learning_rate": 4.301894357021149e-06, + "loss": 2.2842, + "step": 7832 + }, + { + "epoch": 2.36, + "grad_norm": 14.122344017028809, + "learning_rate": 4.299889746416759e-06, + "loss": 0.7481, + "step": 7833 + }, + { + "epoch": 2.36, + "grad_norm": 9.010689735412598, + "learning_rate": 4.297885135812369e-06, + "loss": 1.0806, + "step": 7834 + }, + { + "epoch": 2.36, + "grad_norm": 20.649883270263672, + "learning_rate": 4.295880525207979e-06, + "loss": 1.4924, + "step": 7835 + }, + { + "epoch": 2.36, + "grad_norm": 24.006229400634766, + "learning_rate": 4.293875914603588e-06, + "loss": 1.4348, + "step": 7836 + }, + { + "epoch": 2.36, + "grad_norm": 30.37126350402832, + "learning_rate": 4.291871303999198e-06, + "loss": 1.5881, + "step": 7837 + }, + { + "epoch": 2.36, + "grad_norm": 10.542387962341309, + "learning_rate": 4.289866693394808e-06, + "loss": 0.7792, + "step": 7838 + }, + { + "epoch": 2.36, + "grad_norm": 19.010889053344727, + "learning_rate": 4.2878620827904185e-06, + "loss": 1.4441, + "step": 7839 + }, + { + "epoch": 2.36, + "grad_norm": 13.315712928771973, + "learning_rate": 4.285857472186028e-06, + "loss": 1.0043, + "step": 7840 + }, + { + "epoch": 2.36, + "grad_norm": 11.758505821228027, + "learning_rate": 4.283852861581639e-06, + "loss": 1.304, + "step": 7841 + }, + { + "epoch": 2.36, + "grad_norm": 14.62195873260498, + "learning_rate": 4.281848250977248e-06, + "loss": 1.1863, + "step": 7842 + }, + { + "epoch": 2.36, + "grad_norm": 17.724573135375977, + "learning_rate": 4.279843640372858e-06, + "loss": 1.0244, + "step": 7843 + }, + { + "epoch": 2.36, + "grad_norm": 49.88128662109375, + "learning_rate": 4.277839029768468e-06, + "loss": 1.6886, + "step": 7844 + }, + { + "epoch": 2.36, + "grad_norm": 28.493061065673828, + "learning_rate": 4.275834419164077e-06, + "loss": 1.7018, + "step": 7845 + }, + { + "epoch": 2.36, + "grad_norm": 17.226518630981445, + "learning_rate": 4.273829808559687e-06, + "loss": 1.3155, + "step": 7846 + }, + { + "epoch": 2.36, + "grad_norm": 14.533112525939941, + "learning_rate": 4.2718251979552975e-06, + "loss": 1.6489, + "step": 7847 + }, + { + "epoch": 2.36, + "grad_norm": 154.60177612304688, + "learning_rate": 4.2698205873509075e-06, + "loss": 3.9771, + "step": 7848 + }, + { + "epoch": 2.36, + "grad_norm": 24.33751678466797, + "learning_rate": 4.267815976746517e-06, + "loss": 1.5103, + "step": 7849 + }, + { + "epoch": 2.36, + "grad_norm": 7.450550556182861, + "learning_rate": 4.265811366142128e-06, + "loss": 0.8299, + "step": 7850 + }, + { + "epoch": 2.36, + "grad_norm": 11.310479164123535, + "learning_rate": 4.263806755537737e-06, + "loss": 1.0471, + "step": 7851 + }, + { + "epoch": 2.36, + "grad_norm": 22.815073013305664, + "learning_rate": 4.261802144933347e-06, + "loss": 2.6051, + "step": 7852 + }, + { + "epoch": 2.36, + "grad_norm": 49.23891067504883, + "learning_rate": 4.259797534328957e-06, + "loss": 1.1229, + "step": 7853 + }, + { + "epoch": 2.36, + "grad_norm": 12.846585273742676, + "learning_rate": 4.257792923724567e-06, + "loss": 1.0635, + "step": 7854 + }, + { + "epoch": 2.36, + "grad_norm": 7.844564437866211, + "learning_rate": 4.2557883131201764e-06, + "loss": 0.8797, + "step": 7855 + }, + { + "epoch": 2.36, + "grad_norm": 12.018916130065918, + "learning_rate": 4.2537837025157865e-06, + "loss": 1.2833, + "step": 7856 + }, + { + "epoch": 2.36, + "grad_norm": 15.914363861083984, + "learning_rate": 4.251779091911397e-06, + "loss": 1.2172, + "step": 7857 + }, + { + "epoch": 2.36, + "grad_norm": 20.0510311126709, + "learning_rate": 4.249774481307007e-06, + "loss": 0.7942, + "step": 7858 + }, + { + "epoch": 2.36, + "grad_norm": 45.73860549926758, + "learning_rate": 4.247769870702617e-06, + "loss": 2.2663, + "step": 7859 + }, + { + "epoch": 2.36, + "grad_norm": 18.577871322631836, + "learning_rate": 4.245765260098226e-06, + "loss": 1.8624, + "step": 7860 + }, + { + "epoch": 2.36, + "grad_norm": 14.684171676635742, + "learning_rate": 4.243760649493836e-06, + "loss": 1.2921, + "step": 7861 + }, + { + "epoch": 2.36, + "grad_norm": 20.421449661254883, + "learning_rate": 4.241756038889446e-06, + "loss": 1.2322, + "step": 7862 + }, + { + "epoch": 2.36, + "grad_norm": 11.350706100463867, + "learning_rate": 4.239751428285056e-06, + "loss": 1.3288, + "step": 7863 + }, + { + "epoch": 2.36, + "grad_norm": 21.545028686523438, + "learning_rate": 4.2377468176806655e-06, + "loss": 1.4258, + "step": 7864 + }, + { + "epoch": 2.36, + "grad_norm": 18.22039222717285, + "learning_rate": 4.2357422070762764e-06, + "loss": 1.4584, + "step": 7865 + }, + { + "epoch": 2.37, + "grad_norm": 28.657711029052734, + "learning_rate": 4.233737596471886e-06, + "loss": 2.444, + "step": 7866 + }, + { + "epoch": 2.37, + "grad_norm": 7.091336727142334, + "learning_rate": 4.231732985867496e-06, + "loss": 1.0055, + "step": 7867 + }, + { + "epoch": 2.37, + "grad_norm": 10.07259750366211, + "learning_rate": 4.229728375263106e-06, + "loss": 0.9513, + "step": 7868 + }, + { + "epoch": 2.37, + "grad_norm": 32.32286834716797, + "learning_rate": 4.227723764658715e-06, + "loss": 1.2243, + "step": 7869 + }, + { + "epoch": 2.37, + "grad_norm": 14.63747787475586, + "learning_rate": 4.225719154054325e-06, + "loss": 1.3113, + "step": 7870 + }, + { + "epoch": 2.37, + "grad_norm": 13.216215133666992, + "learning_rate": 4.223714543449935e-06, + "loss": 1.3608, + "step": 7871 + }, + { + "epoch": 2.37, + "grad_norm": 22.928085327148438, + "learning_rate": 4.221709932845545e-06, + "loss": 1.5632, + "step": 7872 + }, + { + "epoch": 2.37, + "grad_norm": 14.180635452270508, + "learning_rate": 4.2197053222411546e-06, + "loss": 1.0618, + "step": 7873 + }, + { + "epoch": 2.37, + "grad_norm": 13.465377807617188, + "learning_rate": 4.217700711636765e-06, + "loss": 2.2106, + "step": 7874 + }, + { + "epoch": 2.37, + "grad_norm": 21.83644676208496, + "learning_rate": 4.215696101032375e-06, + "loss": 1.1141, + "step": 7875 + }, + { + "epoch": 2.37, + "grad_norm": 9.465641021728516, + "learning_rate": 4.213691490427985e-06, + "loss": 0.9889, + "step": 7876 + }, + { + "epoch": 2.37, + "grad_norm": 38.847965240478516, + "learning_rate": 4.211686879823594e-06, + "loss": 2.6969, + "step": 7877 + }, + { + "epoch": 2.37, + "grad_norm": 12.433218002319336, + "learning_rate": 4.209682269219205e-06, + "loss": 0.5288, + "step": 7878 + }, + { + "epoch": 2.37, + "grad_norm": 13.307668685913086, + "learning_rate": 4.207677658614814e-06, + "loss": 0.9523, + "step": 7879 + }, + { + "epoch": 2.37, + "grad_norm": 57.23488235473633, + "learning_rate": 4.205673048010424e-06, + "loss": 3.0293, + "step": 7880 + }, + { + "epoch": 2.37, + "grad_norm": 16.167064666748047, + "learning_rate": 4.203668437406034e-06, + "loss": 1.9897, + "step": 7881 + }, + { + "epoch": 2.37, + "grad_norm": 29.6549072265625, + "learning_rate": 4.2016638268016445e-06, + "loss": 1.3998, + "step": 7882 + }, + { + "epoch": 2.37, + "grad_norm": 10.331962585449219, + "learning_rate": 4.199659216197254e-06, + "loss": 0.8686, + "step": 7883 + }, + { + "epoch": 2.37, + "grad_norm": 16.08567237854004, + "learning_rate": 4.197654605592864e-06, + "loss": 1.3522, + "step": 7884 + }, + { + "epoch": 2.37, + "grad_norm": 21.387054443359375, + "learning_rate": 4.195649994988474e-06, + "loss": 1.0533, + "step": 7885 + }, + { + "epoch": 2.37, + "grad_norm": 18.664920806884766, + "learning_rate": 4.193645384384083e-06, + "loss": 1.7003, + "step": 7886 + }, + { + "epoch": 2.37, + "grad_norm": 12.59611988067627, + "learning_rate": 4.191640773779694e-06, + "loss": 1.0916, + "step": 7887 + }, + { + "epoch": 2.37, + "grad_norm": 11.082344055175781, + "learning_rate": 4.189636163175303e-06, + "loss": 1.3448, + "step": 7888 + }, + { + "epoch": 2.37, + "grad_norm": 15.192607879638672, + "learning_rate": 4.187631552570913e-06, + "loss": 1.1133, + "step": 7889 + }, + { + "epoch": 2.37, + "grad_norm": 12.27822494506836, + "learning_rate": 4.1856269419665235e-06, + "loss": 1.4396, + "step": 7890 + }, + { + "epoch": 2.37, + "grad_norm": 18.4125919342041, + "learning_rate": 4.1836223313621335e-06, + "loss": 0.9025, + "step": 7891 + }, + { + "epoch": 2.37, + "grad_norm": 20.866134643554688, + "learning_rate": 4.181617720757743e-06, + "loss": 1.1648, + "step": 7892 + }, + { + "epoch": 2.37, + "grad_norm": 16.63862419128418, + "learning_rate": 4.179613110153353e-06, + "loss": 1.4421, + "step": 7893 + }, + { + "epoch": 2.37, + "grad_norm": 9.658064842224121, + "learning_rate": 4.177608499548963e-06, + "loss": 0.6854, + "step": 7894 + }, + { + "epoch": 2.37, + "grad_norm": 12.731464385986328, + "learning_rate": 4.175603888944573e-06, + "loss": 1.374, + "step": 7895 + }, + { + "epoch": 2.37, + "grad_norm": 8.585317611694336, + "learning_rate": 4.173599278340183e-06, + "loss": 0.5775, + "step": 7896 + }, + { + "epoch": 2.37, + "grad_norm": 24.50328826904297, + "learning_rate": 4.171594667735792e-06, + "loss": 1.5935, + "step": 7897 + }, + { + "epoch": 2.37, + "grad_norm": 16.08628273010254, + "learning_rate": 4.1695900571314024e-06, + "loss": 1.0627, + "step": 7898 + }, + { + "epoch": 2.37, + "grad_norm": 15.948634147644043, + "learning_rate": 4.1675854465270125e-06, + "loss": 1.4136, + "step": 7899 + }, + { + "epoch": 2.38, + "grad_norm": 31.818986892700195, + "learning_rate": 4.165580835922623e-06, + "loss": 1.8128, + "step": 7900 + }, + { + "epoch": 2.38, + "grad_norm": 37.07505798339844, + "learning_rate": 4.163576225318232e-06, + "loss": 1.9246, + "step": 7901 + }, + { + "epoch": 2.38, + "grad_norm": 15.36788558959961, + "learning_rate": 4.161571614713843e-06, + "loss": 1.9555, + "step": 7902 + }, + { + "epoch": 2.38, + "grad_norm": 17.946441650390625, + "learning_rate": 4.159567004109452e-06, + "loss": 1.1702, + "step": 7903 + }, + { + "epoch": 2.38, + "grad_norm": 12.651650428771973, + "learning_rate": 4.157562393505062e-06, + "loss": 1.0666, + "step": 7904 + }, + { + "epoch": 2.38, + "grad_norm": 31.045000076293945, + "learning_rate": 4.155557782900672e-06, + "loss": 1.553, + "step": 7905 + }, + { + "epoch": 2.38, + "grad_norm": 11.69579029083252, + "learning_rate": 4.153553172296282e-06, + "loss": 1.5206, + "step": 7906 + }, + { + "epoch": 2.38, + "grad_norm": 37.5781135559082, + "learning_rate": 4.1515485616918915e-06, + "loss": 2.1329, + "step": 7907 + }, + { + "epoch": 2.38, + "grad_norm": 21.697532653808594, + "learning_rate": 4.149543951087502e-06, + "loss": 1.4265, + "step": 7908 + }, + { + "epoch": 2.38, + "grad_norm": 9.745403289794922, + "learning_rate": 4.147539340483112e-06, + "loss": 1.1323, + "step": 7909 + }, + { + "epoch": 2.38, + "grad_norm": 15.652579307556152, + "learning_rate": 4.145534729878721e-06, + "loss": 1.3761, + "step": 7910 + }, + { + "epoch": 2.38, + "grad_norm": 12.394492149353027, + "learning_rate": 4.143530119274332e-06, + "loss": 0.9479, + "step": 7911 + }, + { + "epoch": 2.38, + "grad_norm": 19.919248580932617, + "learning_rate": 4.141525508669941e-06, + "loss": 1.2497, + "step": 7912 + }, + { + "epoch": 2.38, + "grad_norm": 13.425742149353027, + "learning_rate": 4.139520898065551e-06, + "loss": 1.0198, + "step": 7913 + }, + { + "epoch": 2.38, + "grad_norm": 17.13833999633789, + "learning_rate": 4.13751628746116e-06, + "loss": 2.0541, + "step": 7914 + }, + { + "epoch": 2.38, + "grad_norm": 13.618558883666992, + "learning_rate": 4.135511676856771e-06, + "loss": 1.3842, + "step": 7915 + }, + { + "epoch": 2.38, + "grad_norm": 5.930107593536377, + "learning_rate": 4.1335070662523806e-06, + "loss": 0.517, + "step": 7916 + }, + { + "epoch": 2.38, + "grad_norm": 4.825802803039551, + "learning_rate": 4.131502455647991e-06, + "loss": 0.3964, + "step": 7917 + }, + { + "epoch": 2.38, + "grad_norm": 25.973148345947266, + "learning_rate": 4.129497845043601e-06, + "loss": 0.9199, + "step": 7918 + }, + { + "epoch": 2.38, + "grad_norm": 9.384675979614258, + "learning_rate": 4.127493234439211e-06, + "loss": 1.1662, + "step": 7919 + }, + { + "epoch": 2.38, + "grad_norm": 59.6272087097168, + "learning_rate": 4.12548862383482e-06, + "loss": 2.4387, + "step": 7920 + }, + { + "epoch": 2.38, + "eval_loss": 0.17472867667675018, + "eval_runtime": 43.5367, + "eval_samples_per_second": 33.971, + "eval_steps_per_second": 33.971, + "step": 7920 + }, + { + "epoch": 2.38, + "grad_norm": 17.422163009643555, + "learning_rate": 4.12348401323043e-06, + "loss": 0.9547, + "step": 7921 + }, + { + "epoch": 2.38, + "grad_norm": 19.566699981689453, + "learning_rate": 4.12147940262604e-06, + "loss": 1.364, + "step": 7922 + }, + { + "epoch": 2.38, + "grad_norm": 13.921435356140137, + "learning_rate": 4.11947479202165e-06, + "loss": 1.5498, + "step": 7923 + }, + { + "epoch": 2.38, + "grad_norm": 46.268829345703125, + "learning_rate": 4.11747018141726e-06, + "loss": 1.7792, + "step": 7924 + }, + { + "epoch": 2.38, + "grad_norm": 37.72319793701172, + "learning_rate": 4.11546557081287e-06, + "loss": 1.582, + "step": 7925 + }, + { + "epoch": 2.38, + "grad_norm": 13.244497299194336, + "learning_rate": 4.11346096020848e-06, + "loss": 1.7341, + "step": 7926 + }, + { + "epoch": 2.38, + "grad_norm": 10.264554977416992, + "learning_rate": 4.11145634960409e-06, + "loss": 0.902, + "step": 7927 + }, + { + "epoch": 2.38, + "grad_norm": 16.762826919555664, + "learning_rate": 4.1094517389997e-06, + "loss": 1.2989, + "step": 7928 + }, + { + "epoch": 2.38, + "grad_norm": 27.864768981933594, + "learning_rate": 4.107447128395309e-06, + "loss": 1.0548, + "step": 7929 + }, + { + "epoch": 2.38, + "grad_norm": 9.768196105957031, + "learning_rate": 4.105442517790919e-06, + "loss": 0.7929, + "step": 7930 + }, + { + "epoch": 2.38, + "grad_norm": 8.759325981140137, + "learning_rate": 4.103437907186529e-06, + "loss": 0.6668, + "step": 7931 + }, + { + "epoch": 2.38, + "grad_norm": 29.228063583374023, + "learning_rate": 4.101433296582139e-06, + "loss": 1.5968, + "step": 7932 + }, + { + "epoch": 2.39, + "grad_norm": 15.066993713378906, + "learning_rate": 4.0994286859777495e-06, + "loss": 1.457, + "step": 7933 + }, + { + "epoch": 2.39, + "grad_norm": 12.962151527404785, + "learning_rate": 4.097424075373359e-06, + "loss": 1.568, + "step": 7934 + }, + { + "epoch": 2.39, + "grad_norm": 9.651278495788574, + "learning_rate": 4.095419464768969e-06, + "loss": 0.9769, + "step": 7935 + }, + { + "epoch": 2.39, + "grad_norm": 27.313953399658203, + "learning_rate": 4.093414854164579e-06, + "loss": 1.1726, + "step": 7936 + }, + { + "epoch": 2.39, + "grad_norm": 30.882871627807617, + "learning_rate": 4.091410243560189e-06, + "loss": 2.3507, + "step": 7937 + }, + { + "epoch": 2.39, + "grad_norm": 16.88389015197754, + "learning_rate": 4.089405632955798e-06, + "loss": 0.9228, + "step": 7938 + }, + { + "epoch": 2.39, + "grad_norm": 54.354469299316406, + "learning_rate": 4.087401022351409e-06, + "loss": 2.5421, + "step": 7939 + }, + { + "epoch": 2.39, + "grad_norm": 18.750852584838867, + "learning_rate": 4.085396411747018e-06, + "loss": 1.2235, + "step": 7940 + }, + { + "epoch": 2.39, + "grad_norm": 7.334155559539795, + "learning_rate": 4.0833918011426284e-06, + "loss": 0.647, + "step": 7941 + }, + { + "epoch": 2.39, + "grad_norm": 14.79666519165039, + "learning_rate": 4.0813871905382385e-06, + "loss": 0.8448, + "step": 7942 + }, + { + "epoch": 2.39, + "grad_norm": 35.3593864440918, + "learning_rate": 4.079382579933849e-06, + "loss": 1.4009, + "step": 7943 + }, + { + "epoch": 2.39, + "grad_norm": 25.9478702545166, + "learning_rate": 4.077377969329458e-06, + "loss": 1.7573, + "step": 7944 + }, + { + "epoch": 2.39, + "grad_norm": 13.793614387512207, + "learning_rate": 4.075373358725068e-06, + "loss": 1.5591, + "step": 7945 + }, + { + "epoch": 2.39, + "grad_norm": 15.494868278503418, + "learning_rate": 4.073368748120678e-06, + "loss": 1.6407, + "step": 7946 + }, + { + "epoch": 2.39, + "grad_norm": 26.147010803222656, + "learning_rate": 4.071364137516287e-06, + "loss": 1.6419, + "step": 7947 + }, + { + "epoch": 2.39, + "grad_norm": 19.630136489868164, + "learning_rate": 4.069359526911898e-06, + "loss": 1.5473, + "step": 7948 + }, + { + "epoch": 2.39, + "grad_norm": 75.69630432128906, + "learning_rate": 4.0673549163075074e-06, + "loss": 2.2357, + "step": 7949 + }, + { + "epoch": 2.39, + "grad_norm": 82.69189453125, + "learning_rate": 4.0653503057031175e-06, + "loss": 1.675, + "step": 7950 + }, + { + "epoch": 2.39, + "grad_norm": 12.869571685791016, + "learning_rate": 4.063345695098728e-06, + "loss": 1.1058, + "step": 7951 + }, + { + "epoch": 2.39, + "grad_norm": 16.09258270263672, + "learning_rate": 4.061341084494338e-06, + "loss": 1.3042, + "step": 7952 + }, + { + "epoch": 2.39, + "grad_norm": 15.368246078491211, + "learning_rate": 4.059336473889947e-06, + "loss": 0.7744, + "step": 7953 + }, + { + "epoch": 2.39, + "grad_norm": 56.09071350097656, + "learning_rate": 4.057331863285557e-06, + "loss": 1.4905, + "step": 7954 + }, + { + "epoch": 2.39, + "grad_norm": 11.899616241455078, + "learning_rate": 4.055327252681167e-06, + "loss": 1.1252, + "step": 7955 + }, + { + "epoch": 2.39, + "grad_norm": 17.915721893310547, + "learning_rate": 4.053322642076777e-06, + "loss": 1.8459, + "step": 7956 + }, + { + "epoch": 2.39, + "grad_norm": 12.431897163391113, + "learning_rate": 4.051318031472386e-06, + "loss": 1.0242, + "step": 7957 + }, + { + "epoch": 2.39, + "grad_norm": 15.987489700317383, + "learning_rate": 4.0493134208679965e-06, + "loss": 2.4057, + "step": 7958 + }, + { + "epoch": 2.39, + "grad_norm": 14.070905685424805, + "learning_rate": 4.0473088102636066e-06, + "loss": 1.2005, + "step": 7959 + }, + { + "epoch": 2.39, + "grad_norm": 26.776086807250977, + "learning_rate": 4.045304199659217e-06, + "loss": 1.1287, + "step": 7960 + }, + { + "epoch": 2.39, + "grad_norm": 21.03402328491211, + "learning_rate": 4.043299589054827e-06, + "loss": 1.6414, + "step": 7961 + }, + { + "epoch": 2.39, + "grad_norm": 16.39931297302246, + "learning_rate": 4.041294978450436e-06, + "loss": 1.7569, + "step": 7962 + }, + { + "epoch": 2.39, + "grad_norm": 22.343650817871094, + "learning_rate": 4.039290367846046e-06, + "loss": 1.0703, + "step": 7963 + }, + { + "epoch": 2.39, + "grad_norm": 21.395599365234375, + "learning_rate": 4.037285757241656e-06, + "loss": 2.1687, + "step": 7964 + }, + { + "epoch": 2.39, + "grad_norm": 53.524776458740234, + "learning_rate": 4.035281146637266e-06, + "loss": 1.6589, + "step": 7965 + }, + { + "epoch": 2.4, + "grad_norm": 12.918240547180176, + "learning_rate": 4.0332765360328755e-06, + "loss": 0.878, + "step": 7966 + }, + { + "epoch": 2.4, + "grad_norm": 10.968676567077637, + "learning_rate": 4.031271925428486e-06, + "loss": 2.1511, + "step": 7967 + }, + { + "epoch": 2.4, + "grad_norm": 21.93174934387207, + "learning_rate": 4.029267314824096e-06, + "loss": 1.5569, + "step": 7968 + }, + { + "epoch": 2.4, + "grad_norm": 18.74276351928711, + "learning_rate": 4.027262704219706e-06, + "loss": 0.7945, + "step": 7969 + }, + { + "epoch": 2.4, + "grad_norm": 9.503168106079102, + "learning_rate": 4.025258093615316e-06, + "loss": 0.8866, + "step": 7970 + }, + { + "epoch": 2.4, + "grad_norm": 39.47118377685547, + "learning_rate": 4.023253483010925e-06, + "loss": 1.4191, + "step": 7971 + }, + { + "epoch": 2.4, + "grad_norm": 11.129521369934082, + "learning_rate": 4.021248872406535e-06, + "loss": 1.1143, + "step": 7972 + }, + { + "epoch": 2.4, + "grad_norm": 13.493324279785156, + "learning_rate": 4.019244261802145e-06, + "loss": 1.5284, + "step": 7973 + }, + { + "epoch": 2.4, + "grad_norm": 12.583949089050293, + "learning_rate": 4.017239651197755e-06, + "loss": 1.6311, + "step": 7974 + }, + { + "epoch": 2.4, + "grad_norm": 7.225354194641113, + "learning_rate": 4.0152350405933645e-06, + "loss": 0.5856, + "step": 7975 + }, + { + "epoch": 2.4, + "grad_norm": 15.300594329833984, + "learning_rate": 4.0132304299889755e-06, + "loss": 1.0028, + "step": 7976 + }, + { + "epoch": 2.4, + "grad_norm": 16.86861228942871, + "learning_rate": 4.011225819384585e-06, + "loss": 1.7745, + "step": 7977 + }, + { + "epoch": 2.4, + "grad_norm": 20.883081436157227, + "learning_rate": 4.009221208780195e-06, + "loss": 1.6959, + "step": 7978 + }, + { + "epoch": 2.4, + "grad_norm": 120.636962890625, + "learning_rate": 4.007216598175805e-06, + "loss": 2.2987, + "step": 7979 + }, + { + "epoch": 2.4, + "grad_norm": 76.55606842041016, + "learning_rate": 4.005211987571415e-06, + "loss": 1.4258, + "step": 7980 + }, + { + "epoch": 2.4, + "grad_norm": 15.42659854888916, + "learning_rate": 4.003207376967024e-06, + "loss": 1.5873, + "step": 7981 + }, + { + "epoch": 2.4, + "grad_norm": 9.378588676452637, + "learning_rate": 4.001202766362634e-06, + "loss": 0.9196, + "step": 7982 + }, + { + "epoch": 2.4, + "grad_norm": 12.483980178833008, + "learning_rate": 3.999198155758244e-06, + "loss": 0.9071, + "step": 7983 + }, + { + "epoch": 2.4, + "grad_norm": 7.117259502410889, + "learning_rate": 3.9971935451538544e-06, + "loss": 0.6292, + "step": 7984 + }, + { + "epoch": 2.4, + "grad_norm": 14.306236267089844, + "learning_rate": 3.9951889345494645e-06, + "loss": 1.0767, + "step": 7985 + }, + { + "epoch": 2.4, + "grad_norm": 46.9137077331543, + "learning_rate": 3.993184323945074e-06, + "loss": 2.4379, + "step": 7986 + }, + { + "epoch": 2.4, + "grad_norm": 17.20838737487793, + "learning_rate": 3.991179713340684e-06, + "loss": 1.2174, + "step": 7987 + }, + { + "epoch": 2.4, + "grad_norm": 9.353306770324707, + "learning_rate": 3.989175102736294e-06, + "loss": 1.0121, + "step": 7988 + }, + { + "epoch": 2.4, + "grad_norm": 17.213781356811523, + "learning_rate": 3.987170492131904e-06, + "loss": 1.4443, + "step": 7989 + }, + { + "epoch": 2.4, + "grad_norm": 6.010331630706787, + "learning_rate": 3.985165881527513e-06, + "loss": 0.8314, + "step": 7990 + }, + { + "epoch": 2.4, + "grad_norm": 12.470282554626465, + "learning_rate": 3.983161270923124e-06, + "loss": 1.9135, + "step": 7991 + }, + { + "epoch": 2.4, + "grad_norm": 25.282251358032227, + "learning_rate": 3.9811566603187334e-06, + "loss": 1.9866, + "step": 7992 + }, + { + "epoch": 2.4, + "grad_norm": 12.670585632324219, + "learning_rate": 3.9791520497143435e-06, + "loss": 0.8544, + "step": 7993 + }, + { + "epoch": 2.4, + "grad_norm": 21.99867057800293, + "learning_rate": 3.977147439109954e-06, + "loss": 0.906, + "step": 7994 + }, + { + "epoch": 2.4, + "grad_norm": 26.105510711669922, + "learning_rate": 3.975142828505563e-06, + "loss": 1.1245, + "step": 7995 + }, + { + "epoch": 2.4, + "grad_norm": 27.73430824279785, + "learning_rate": 3.973138217901173e-06, + "loss": 1.5627, + "step": 7996 + }, + { + "epoch": 2.4, + "grad_norm": 31.076269149780273, + "learning_rate": 3.971133607296783e-06, + "loss": 1.5272, + "step": 7997 + }, + { + "epoch": 2.4, + "grad_norm": 20.252588272094727, + "learning_rate": 3.969128996692393e-06, + "loss": 1.3878, + "step": 7998 + }, + { + "epoch": 2.4, + "grad_norm": 15.601667404174805, + "learning_rate": 3.967124386088002e-06, + "loss": 1.6348, + "step": 7999 + }, + { + "epoch": 2.41, + "grad_norm": 15.273148536682129, + "learning_rate": 3.965119775483612e-06, + "loss": 2.6779, + "step": 8000 + }, + { + "epoch": 2.41, + "grad_norm": 14.738374710083008, + "learning_rate": 3.9631151648792225e-06, + "loss": 1.3222, + "step": 8001 + }, + { + "epoch": 2.41, + "grad_norm": 12.968079566955566, + "learning_rate": 3.9611105542748326e-06, + "loss": 0.8718, + "step": 8002 + }, + { + "epoch": 2.41, + "grad_norm": 15.838987350463867, + "learning_rate": 3.959105943670442e-06, + "loss": 1.1175, + "step": 8003 + }, + { + "epoch": 2.41, + "grad_norm": 11.581687927246094, + "learning_rate": 3.957101333066053e-06, + "loss": 2.2458, + "step": 8004 + }, + { + "epoch": 2.41, + "grad_norm": 13.473747253417969, + "learning_rate": 3.955096722461662e-06, + "loss": 1.0507, + "step": 8005 + }, + { + "epoch": 2.41, + "grad_norm": 22.461040496826172, + "learning_rate": 3.953092111857272e-06, + "loss": 1.8377, + "step": 8006 + }, + { + "epoch": 2.41, + "grad_norm": 9.827531814575195, + "learning_rate": 3.951087501252882e-06, + "loss": 1.3932, + "step": 8007 + }, + { + "epoch": 2.41, + "grad_norm": 17.85820198059082, + "learning_rate": 3.949082890648492e-06, + "loss": 1.0887, + "step": 8008 + }, + { + "epoch": 2.41, + "grad_norm": 10.893664360046387, + "learning_rate": 3.9470782800441015e-06, + "loss": 0.6649, + "step": 8009 + }, + { + "epoch": 2.41, + "grad_norm": 27.924177169799805, + "learning_rate": 3.9450736694397116e-06, + "loss": 1.7343, + "step": 8010 + }, + { + "epoch": 2.41, + "grad_norm": 7.996644973754883, + "learning_rate": 3.943069058835322e-06, + "loss": 1.1344, + "step": 8011 + }, + { + "epoch": 2.41, + "grad_norm": 29.98558807373047, + "learning_rate": 3.941064448230931e-06, + "loss": 1.4769, + "step": 8012 + }, + { + "epoch": 2.41, + "grad_norm": 27.507963180541992, + "learning_rate": 3.939059837626542e-06, + "loss": 0.83, + "step": 8013 + }, + { + "epoch": 2.41, + "grad_norm": 7.844414710998535, + "learning_rate": 3.937055227022151e-06, + "loss": 0.9977, + "step": 8014 + }, + { + "epoch": 2.41, + "grad_norm": 51.909061431884766, + "learning_rate": 3.935050616417761e-06, + "loss": 2.3273, + "step": 8015 + }, + { + "epoch": 2.41, + "grad_norm": 10.7676420211792, + "learning_rate": 3.933046005813371e-06, + "loss": 0.986, + "step": 8016 + }, + { + "epoch": 2.41, + "grad_norm": 14.182157516479492, + "learning_rate": 3.931041395208981e-06, + "loss": 1.3547, + "step": 8017 + }, + { + "epoch": 2.41, + "grad_norm": 15.017866134643555, + "learning_rate": 3.9290367846045905e-06, + "loss": 0.7605, + "step": 8018 + }, + { + "epoch": 2.41, + "grad_norm": 22.285579681396484, + "learning_rate": 3.927032174000201e-06, + "loss": 1.5245, + "step": 8019 + }, + { + "epoch": 2.41, + "grad_norm": 69.15910339355469, + "learning_rate": 3.925027563395811e-06, + "loss": 2.0588, + "step": 8020 + }, + { + "epoch": 2.41, + "grad_norm": 57.589107513427734, + "learning_rate": 3.923022952791421e-06, + "loss": 2.0236, + "step": 8021 + }, + { + "epoch": 2.41, + "grad_norm": 38.282440185546875, + "learning_rate": 3.921018342187031e-06, + "loss": 1.923, + "step": 8022 + }, + { + "epoch": 2.41, + "grad_norm": 16.257158279418945, + "learning_rate": 3.91901373158264e-06, + "loss": 1.2573, + "step": 8023 + }, + { + "epoch": 2.41, + "grad_norm": 13.993424415588379, + "learning_rate": 3.91700912097825e-06, + "loss": 1.1249, + "step": 8024 + }, + { + "epoch": 2.41, + "grad_norm": 18.90182876586914, + "learning_rate": 3.91500451037386e-06, + "loss": 1.4962, + "step": 8025 + }, + { + "epoch": 2.41, + "grad_norm": 19.353958129882812, + "learning_rate": 3.91299989976947e-06, + "loss": 1.0571, + "step": 8026 + }, + { + "epoch": 2.41, + "grad_norm": 31.27533531188965, + "learning_rate": 3.91099528916508e-06, + "loss": 1.6633, + "step": 8027 + }, + { + "epoch": 2.41, + "grad_norm": 15.394329071044922, + "learning_rate": 3.9089906785606905e-06, + "loss": 1.4122, + "step": 8028 + }, + { + "epoch": 2.41, + "grad_norm": 77.46128845214844, + "learning_rate": 3.9069860679563e-06, + "loss": 1.7462, + "step": 8029 + }, + { + "epoch": 2.41, + "grad_norm": 33.42092514038086, + "learning_rate": 3.90498145735191e-06, + "loss": 1.3732, + "step": 8030 + }, + { + "epoch": 2.41, + "grad_norm": 9.593501091003418, + "learning_rate": 3.90297684674752e-06, + "loss": 0.9003, + "step": 8031 + }, + { + "epoch": 2.41, + "grad_norm": 16.690101623535156, + "learning_rate": 3.900972236143129e-06, + "loss": 1.6157, + "step": 8032 + }, + { + "epoch": 2.42, + "grad_norm": 14.943684577941895, + "learning_rate": 3.898967625538739e-06, + "loss": 1.5163, + "step": 8033 + }, + { + "epoch": 2.42, + "grad_norm": 59.907310485839844, + "learning_rate": 3.896963014934349e-06, + "loss": 1.6031, + "step": 8034 + }, + { + "epoch": 2.42, + "grad_norm": 32.62445068359375, + "learning_rate": 3.8949584043299594e-06, + "loss": 1.2945, + "step": 8035 + }, + { + "epoch": 2.42, + "grad_norm": 92.56492614746094, + "learning_rate": 3.892953793725569e-06, + "loss": 1.5538, + "step": 8036 + }, + { + "epoch": 2.42, + "grad_norm": 11.2645902633667, + "learning_rate": 3.890949183121179e-06, + "loss": 1.8437, + "step": 8037 + }, + { + "epoch": 2.42, + "grad_norm": 13.851442337036133, + "learning_rate": 3.888944572516789e-06, + "loss": 0.9473, + "step": 8038 + }, + { + "epoch": 2.42, + "grad_norm": 17.675752639770508, + "learning_rate": 3.886939961912399e-06, + "loss": 1.4754, + "step": 8039 + }, + { + "epoch": 2.42, + "grad_norm": 14.534573554992676, + "learning_rate": 3.884935351308008e-06, + "loss": 1.2699, + "step": 8040 + }, + { + "epoch": 2.42, + "eval_loss": 0.16647757589817047, + "eval_runtime": 43.4766, + "eval_samples_per_second": 34.018, + "eval_steps_per_second": 34.018, + "step": 8040 + }, + { + "epoch": 2.42, + "grad_norm": 10.053756713867188, + "learning_rate": 3.882930740703619e-06, + "loss": 1.1651, + "step": 8041 + }, + { + "epoch": 2.42, + "grad_norm": 15.086660385131836, + "learning_rate": 3.880926130099228e-06, + "loss": 1.2307, + "step": 8042 + }, + { + "epoch": 2.42, + "grad_norm": 45.48273468017578, + "learning_rate": 3.878921519494838e-06, + "loss": 1.7186, + "step": 8043 + }, + { + "epoch": 2.42, + "grad_norm": 34.40294647216797, + "learning_rate": 3.8769169088904485e-06, + "loss": 2.7345, + "step": 8044 + }, + { + "epoch": 2.42, + "grad_norm": 14.807568550109863, + "learning_rate": 3.8749122982860586e-06, + "loss": 0.8994, + "step": 8045 + }, + { + "epoch": 2.42, + "grad_norm": 10.855109214782715, + "learning_rate": 3.872907687681668e-06, + "loss": 1.2174, + "step": 8046 + }, + { + "epoch": 2.42, + "grad_norm": 36.307777404785156, + "learning_rate": 3.870903077077278e-06, + "loss": 2.1917, + "step": 8047 + }, + { + "epoch": 2.42, + "grad_norm": 22.572141647338867, + "learning_rate": 3.868898466472888e-06, + "loss": 0.8395, + "step": 8048 + }, + { + "epoch": 2.42, + "grad_norm": 17.936315536499023, + "learning_rate": 3.866893855868497e-06, + "loss": 2.4869, + "step": 8049 + }, + { + "epoch": 2.42, + "grad_norm": 12.054944038391113, + "learning_rate": 3.864889245264108e-06, + "loss": 1.4992, + "step": 8050 + }, + { + "epoch": 2.42, + "grad_norm": 10.359464645385742, + "learning_rate": 3.862884634659717e-06, + "loss": 1.1123, + "step": 8051 + }, + { + "epoch": 2.42, + "grad_norm": 21.69719886779785, + "learning_rate": 3.8608800240553275e-06, + "loss": 1.9015, + "step": 8052 + }, + { + "epoch": 2.42, + "grad_norm": 9.15548038482666, + "learning_rate": 3.8588754134509376e-06, + "loss": 1.1113, + "step": 8053 + }, + { + "epoch": 2.42, + "grad_norm": 7.168182373046875, + "learning_rate": 3.856870802846548e-06, + "loss": 0.7675, + "step": 8054 + }, + { + "epoch": 2.42, + "grad_norm": 23.34624481201172, + "learning_rate": 3.854866192242157e-06, + "loss": 1.1852, + "step": 8055 + }, + { + "epoch": 2.42, + "grad_norm": 8.906826972961426, + "learning_rate": 3.852861581637767e-06, + "loss": 0.7492, + "step": 8056 + }, + { + "epoch": 2.42, + "grad_norm": 17.47923469543457, + "learning_rate": 3.850856971033377e-06, + "loss": 1.1435, + "step": 8057 + }, + { + "epoch": 2.42, + "grad_norm": 8.960250854492188, + "learning_rate": 3.848852360428987e-06, + "loss": 0.9197, + "step": 8058 + }, + { + "epoch": 2.42, + "grad_norm": 17.509536743164062, + "learning_rate": 3.846847749824597e-06, + "loss": 1.5889, + "step": 8059 + }, + { + "epoch": 2.42, + "grad_norm": 14.029973030090332, + "learning_rate": 3.8448431392202065e-06, + "loss": 1.5197, + "step": 8060 + }, + { + "epoch": 2.42, + "grad_norm": 13.85709285736084, + "learning_rate": 3.8428385286158165e-06, + "loss": 1.1813, + "step": 8061 + }, + { + "epoch": 2.42, + "grad_norm": 11.46907901763916, + "learning_rate": 3.840833918011427e-06, + "loss": 1.1435, + "step": 8062 + }, + { + "epoch": 2.42, + "grad_norm": 11.111536026000977, + "learning_rate": 3.838829307407037e-06, + "loss": 1.6378, + "step": 8063 + }, + { + "epoch": 2.42, + "grad_norm": 13.92691421508789, + "learning_rate": 3.836824696802646e-06, + "loss": 1.0666, + "step": 8064 + }, + { + "epoch": 2.42, + "grad_norm": 11.448002815246582, + "learning_rate": 3.834820086198257e-06, + "loss": 1.2318, + "step": 8065 + }, + { + "epoch": 2.43, + "grad_norm": 13.583879470825195, + "learning_rate": 3.832815475593866e-06, + "loss": 0.7262, + "step": 8066 + }, + { + "epoch": 2.43, + "grad_norm": 17.306129455566406, + "learning_rate": 3.830810864989476e-06, + "loss": 1.5805, + "step": 8067 + }, + { + "epoch": 2.43, + "grad_norm": 17.71708106994629, + "learning_rate": 3.828806254385086e-06, + "loss": 1.5627, + "step": 8068 + }, + { + "epoch": 2.43, + "grad_norm": 13.716277122497559, + "learning_rate": 3.826801643780696e-06, + "loss": 1.223, + "step": 8069 + }, + { + "epoch": 2.43, + "grad_norm": 9.647749900817871, + "learning_rate": 3.824797033176306e-06, + "loss": 2.1291, + "step": 8070 + }, + { + "epoch": 2.43, + "grad_norm": 15.30841064453125, + "learning_rate": 3.822792422571916e-06, + "loss": 1.1331, + "step": 8071 + }, + { + "epoch": 2.43, + "grad_norm": 18.855783462524414, + "learning_rate": 3.820787811967526e-06, + "loss": 1.7297, + "step": 8072 + }, + { + "epoch": 2.43, + "grad_norm": 35.978248596191406, + "learning_rate": 3.818783201363135e-06, + "loss": 0.7591, + "step": 8073 + }, + { + "epoch": 2.43, + "grad_norm": 19.79545021057129, + "learning_rate": 3.816778590758746e-06, + "loss": 1.878, + "step": 8074 + }, + { + "epoch": 2.43, + "grad_norm": 33.37739562988281, + "learning_rate": 3.814773980154355e-06, + "loss": 1.5872, + "step": 8075 + }, + { + "epoch": 2.43, + "grad_norm": 19.29977035522461, + "learning_rate": 3.8127693695499653e-06, + "loss": 0.9943, + "step": 8076 + }, + { + "epoch": 2.43, + "grad_norm": 22.01044273376465, + "learning_rate": 3.8107647589455753e-06, + "loss": 1.9346, + "step": 8077 + }, + { + "epoch": 2.43, + "grad_norm": 24.812864303588867, + "learning_rate": 3.808760148341185e-06, + "loss": 1.4286, + "step": 8078 + }, + { + "epoch": 2.43, + "grad_norm": 16.758087158203125, + "learning_rate": 3.8067555377367947e-06, + "loss": 0.6716, + "step": 8079 + }, + { + "epoch": 2.43, + "grad_norm": 10.366033554077148, + "learning_rate": 3.8047509271324047e-06, + "loss": 1.3467, + "step": 8080 + }, + { + "epoch": 2.43, + "grad_norm": 20.55314064025879, + "learning_rate": 3.802746316528015e-06, + "loss": 1.4071, + "step": 8081 + }, + { + "epoch": 2.43, + "grad_norm": 31.28022003173828, + "learning_rate": 3.8007417059236245e-06, + "loss": 1.7199, + "step": 8082 + }, + { + "epoch": 2.43, + "grad_norm": 6.907041072845459, + "learning_rate": 3.798737095319234e-06, + "loss": 0.3853, + "step": 8083 + }, + { + "epoch": 2.43, + "grad_norm": 24.64528465270996, + "learning_rate": 3.7967324847148447e-06, + "loss": 1.4183, + "step": 8084 + }, + { + "epoch": 2.43, + "grad_norm": 10.540509223937988, + "learning_rate": 3.7947278741104543e-06, + "loss": 1.0129, + "step": 8085 + }, + { + "epoch": 2.43, + "grad_norm": 13.344951629638672, + "learning_rate": 3.792723263506064e-06, + "loss": 0.989, + "step": 8086 + }, + { + "epoch": 2.43, + "grad_norm": 8.0833158493042, + "learning_rate": 3.7907186529016745e-06, + "loss": 0.8597, + "step": 8087 + }, + { + "epoch": 2.43, + "grad_norm": 19.457054138183594, + "learning_rate": 3.788714042297284e-06, + "loss": 1.3676, + "step": 8088 + }, + { + "epoch": 2.43, + "grad_norm": 16.89392852783203, + "learning_rate": 3.786709431692894e-06, + "loss": 1.1124, + "step": 8089 + }, + { + "epoch": 2.43, + "grad_norm": 9.098607063293457, + "learning_rate": 3.784704821088504e-06, + "loss": 0.5794, + "step": 8090 + }, + { + "epoch": 2.43, + "grad_norm": 41.21145248413086, + "learning_rate": 3.7827002104841136e-06, + "loss": 1.9724, + "step": 8091 + }, + { + "epoch": 2.43, + "grad_norm": 5.302923679351807, + "learning_rate": 3.7806955998797236e-06, + "loss": 0.579, + "step": 8092 + }, + { + "epoch": 2.43, + "grad_norm": 14.981736183166504, + "learning_rate": 3.7786909892753337e-06, + "loss": 2.1375, + "step": 8093 + }, + { + "epoch": 2.43, + "grad_norm": 20.563865661621094, + "learning_rate": 3.7766863786709434e-06, + "loss": 1.137, + "step": 8094 + }, + { + "epoch": 2.43, + "grad_norm": 10.908076286315918, + "learning_rate": 3.774681768066553e-06, + "loss": 1.2199, + "step": 8095 + }, + { + "epoch": 2.43, + "grad_norm": 79.2155532836914, + "learning_rate": 3.7726771574621636e-06, + "loss": 2.0151, + "step": 8096 + }, + { + "epoch": 2.43, + "grad_norm": 187.99632263183594, + "learning_rate": 3.7706725468577732e-06, + "loss": 2.0555, + "step": 8097 + }, + { + "epoch": 2.43, + "grad_norm": 38.596275329589844, + "learning_rate": 3.768667936253383e-06, + "loss": 2.0002, + "step": 8098 + }, + { + "epoch": 2.44, + "grad_norm": 8.95095157623291, + "learning_rate": 3.766663325648993e-06, + "loss": 1.0331, + "step": 8099 + }, + { + "epoch": 2.44, + "grad_norm": 17.133882522583008, + "learning_rate": 3.764658715044603e-06, + "loss": 1.5345, + "step": 8100 + }, + { + "epoch": 2.44, + "grad_norm": 8.687376976013184, + "learning_rate": 3.7626541044402127e-06, + "loss": 0.7787, + "step": 8101 + }, + { + "epoch": 2.44, + "grad_norm": 69.27308654785156, + "learning_rate": 3.760649493835823e-06, + "loss": 1.252, + "step": 8102 + }, + { + "epoch": 2.44, + "grad_norm": 23.108619689941406, + "learning_rate": 3.7586448832314325e-06, + "loss": 1.3771, + "step": 8103 + }, + { + "epoch": 2.44, + "grad_norm": 33.36818313598633, + "learning_rate": 3.7566402726270425e-06, + "loss": 1.6731, + "step": 8104 + }, + { + "epoch": 2.44, + "grad_norm": 13.890486717224121, + "learning_rate": 3.7546356620226526e-06, + "loss": 1.39, + "step": 8105 + }, + { + "epoch": 2.44, + "grad_norm": 9.70341682434082, + "learning_rate": 3.7526310514182623e-06, + "loss": 0.7873, + "step": 8106 + }, + { + "epoch": 2.44, + "grad_norm": 15.94449234008789, + "learning_rate": 3.750626440813872e-06, + "loss": 0.9446, + "step": 8107 + }, + { + "epoch": 2.44, + "grad_norm": 19.849061965942383, + "learning_rate": 3.7486218302094825e-06, + "loss": 1.6883, + "step": 8108 + }, + { + "epoch": 2.44, + "grad_norm": 19.96930503845215, + "learning_rate": 3.746617219605092e-06, + "loss": 1.2279, + "step": 8109 + }, + { + "epoch": 2.44, + "grad_norm": 12.898889541625977, + "learning_rate": 3.7446126090007018e-06, + "loss": 1.6165, + "step": 8110 + }, + { + "epoch": 2.44, + "grad_norm": 12.426019668579102, + "learning_rate": 3.742607998396312e-06, + "loss": 1.4589, + "step": 8111 + }, + { + "epoch": 2.44, + "grad_norm": 20.18886375427246, + "learning_rate": 3.740603387791922e-06, + "loss": 1.7677, + "step": 8112 + }, + { + "epoch": 2.44, + "grad_norm": 19.88523292541504, + "learning_rate": 3.7385987771875316e-06, + "loss": 2.5334, + "step": 8113 + }, + { + "epoch": 2.44, + "grad_norm": 39.80482482910156, + "learning_rate": 3.7365941665831417e-06, + "loss": 1.2999, + "step": 8114 + }, + { + "epoch": 2.44, + "grad_norm": 20.459688186645508, + "learning_rate": 3.7345895559787513e-06, + "loss": 1.1679, + "step": 8115 + }, + { + "epoch": 2.44, + "grad_norm": 6.700068950653076, + "learning_rate": 3.7325849453743614e-06, + "loss": 0.7833, + "step": 8116 + }, + { + "epoch": 2.44, + "grad_norm": 10.873579978942871, + "learning_rate": 3.7305803347699715e-06, + "loss": 1.1187, + "step": 8117 + }, + { + "epoch": 2.44, + "grad_norm": 11.668147087097168, + "learning_rate": 3.728575724165581e-06, + "loss": 1.436, + "step": 8118 + }, + { + "epoch": 2.44, + "grad_norm": 24.042890548706055, + "learning_rate": 3.726571113561191e-06, + "loss": 1.6814, + "step": 8119 + }, + { + "epoch": 2.44, + "grad_norm": 7.250650882720947, + "learning_rate": 3.7245665029568013e-06, + "loss": 1.062, + "step": 8120 + }, + { + "epoch": 2.44, + "grad_norm": 8.547741889953613, + "learning_rate": 3.722561892352411e-06, + "loss": 1.0638, + "step": 8121 + }, + { + "epoch": 2.44, + "grad_norm": 9.865437507629395, + "learning_rate": 3.7205572817480207e-06, + "loss": 0.8298, + "step": 8122 + }, + { + "epoch": 2.44, + "grad_norm": 11.00238037109375, + "learning_rate": 3.7185526711436303e-06, + "loss": 0.7298, + "step": 8123 + }, + { + "epoch": 2.44, + "grad_norm": 10.733601570129395, + "learning_rate": 3.716548060539241e-06, + "loss": 0.9721, + "step": 8124 + }, + { + "epoch": 2.44, + "grad_norm": 32.854774475097656, + "learning_rate": 3.7145434499348505e-06, + "loss": 1.4885, + "step": 8125 + }, + { + "epoch": 2.44, + "grad_norm": 11.409563064575195, + "learning_rate": 3.71253883933046e-06, + "loss": 0.8726, + "step": 8126 + }, + { + "epoch": 2.44, + "grad_norm": 37.438106536865234, + "learning_rate": 3.7105342287260702e-06, + "loss": 1.3614, + "step": 8127 + }, + { + "epoch": 2.44, + "grad_norm": 28.72161865234375, + "learning_rate": 3.70852961812168e-06, + "loss": 1.0996, + "step": 8128 + }, + { + "epoch": 2.44, + "grad_norm": 10.785248756408691, + "learning_rate": 3.70652500751729e-06, + "loss": 0.7049, + "step": 8129 + }, + { + "epoch": 2.44, + "grad_norm": 9.802728652954102, + "learning_rate": 3.7045203969129e-06, + "loss": 1.0367, + "step": 8130 + }, + { + "epoch": 2.44, + "grad_norm": 14.082221984863281, + "learning_rate": 3.7025157863085097e-06, + "loss": 1.0125, + "step": 8131 + }, + { + "epoch": 2.44, + "grad_norm": 63.74949645996094, + "learning_rate": 3.7005111757041194e-06, + "loss": 2.3996, + "step": 8132 + }, + { + "epoch": 2.45, + "grad_norm": 9.598031044006348, + "learning_rate": 3.69850656509973e-06, + "loss": 1.0888, + "step": 8133 + }, + { + "epoch": 2.45, + "grad_norm": 15.858400344848633, + "learning_rate": 3.6965019544953396e-06, + "loss": 1.055, + "step": 8134 + }, + { + "epoch": 2.45, + "grad_norm": 10.83476734161377, + "learning_rate": 3.6944973438909492e-06, + "loss": 1.9326, + "step": 8135 + }, + { + "epoch": 2.45, + "grad_norm": 10.07654857635498, + "learning_rate": 3.6924927332865597e-06, + "loss": 1.5472, + "step": 8136 + }, + { + "epoch": 2.45, + "grad_norm": 16.537139892578125, + "learning_rate": 3.6904881226821694e-06, + "loss": 1.2108, + "step": 8137 + }, + { + "epoch": 2.45, + "grad_norm": 15.194998741149902, + "learning_rate": 3.688483512077779e-06, + "loss": 1.0623, + "step": 8138 + }, + { + "epoch": 2.45, + "grad_norm": 43.36076736450195, + "learning_rate": 3.686478901473389e-06, + "loss": 1.2945, + "step": 8139 + }, + { + "epoch": 2.45, + "grad_norm": 9.480472564697266, + "learning_rate": 3.684474290868999e-06, + "loss": 0.5474, + "step": 8140 + }, + { + "epoch": 2.45, + "grad_norm": 36.88868713378906, + "learning_rate": 3.682469680264609e-06, + "loss": 1.0721, + "step": 8141 + }, + { + "epoch": 2.45, + "grad_norm": 42.60389709472656, + "learning_rate": 3.680465069660219e-06, + "loss": 1.4549, + "step": 8142 + }, + { + "epoch": 2.45, + "grad_norm": 22.67746353149414, + "learning_rate": 3.6784604590558286e-06, + "loss": 2.1135, + "step": 8143 + }, + { + "epoch": 2.45, + "grad_norm": 8.67175006866455, + "learning_rate": 3.6764558484514383e-06, + "loss": 1.0506, + "step": 8144 + }, + { + "epoch": 2.45, + "grad_norm": 18.72547149658203, + "learning_rate": 3.674451237847049e-06, + "loss": 0.891, + "step": 8145 + }, + { + "epoch": 2.45, + "grad_norm": 32.578861236572266, + "learning_rate": 3.6724466272426585e-06, + "loss": 1.6625, + "step": 8146 + }, + { + "epoch": 2.45, + "grad_norm": 9.158719062805176, + "learning_rate": 3.670442016638268e-06, + "loss": 0.8982, + "step": 8147 + }, + { + "epoch": 2.45, + "grad_norm": 15.746097564697266, + "learning_rate": 3.6684374060338786e-06, + "loss": 1.349, + "step": 8148 + }, + { + "epoch": 2.45, + "grad_norm": 12.427379608154297, + "learning_rate": 3.6664327954294883e-06, + "loss": 1.283, + "step": 8149 + }, + { + "epoch": 2.45, + "grad_norm": 30.601055145263672, + "learning_rate": 3.664428184825098e-06, + "loss": 0.9753, + "step": 8150 + }, + { + "epoch": 2.45, + "grad_norm": 15.714460372924805, + "learning_rate": 3.662423574220708e-06, + "loss": 1.1643, + "step": 8151 + }, + { + "epoch": 2.45, + "grad_norm": 27.35734748840332, + "learning_rate": 3.6604189636163177e-06, + "loss": 1.7191, + "step": 8152 + }, + { + "epoch": 2.45, + "grad_norm": 16.40424919128418, + "learning_rate": 3.6584143530119278e-06, + "loss": 1.4443, + "step": 8153 + }, + { + "epoch": 2.45, + "grad_norm": 9.086562156677246, + "learning_rate": 3.656409742407538e-06, + "loss": 1.0836, + "step": 8154 + }, + { + "epoch": 2.45, + "grad_norm": 16.159156799316406, + "learning_rate": 3.6544051318031475e-06, + "loss": 1.0811, + "step": 8155 + }, + { + "epoch": 2.45, + "grad_norm": 32.5626220703125, + "learning_rate": 3.652400521198757e-06, + "loss": 1.5753, + "step": 8156 + }, + { + "epoch": 2.45, + "grad_norm": 10.6674165725708, + "learning_rate": 3.6503959105943677e-06, + "loss": 0.9974, + "step": 8157 + }, + { + "epoch": 2.45, + "grad_norm": 11.11431884765625, + "learning_rate": 3.6483912999899773e-06, + "loss": 1.3574, + "step": 8158 + }, + { + "epoch": 2.45, + "grad_norm": 24.535797119140625, + "learning_rate": 3.646386689385587e-06, + "loss": 1.4364, + "step": 8159 + }, + { + "epoch": 2.45, + "grad_norm": 49.98130416870117, + "learning_rate": 3.6443820787811975e-06, + "loss": 1.0015, + "step": 8160 + }, + { + "epoch": 2.45, + "eval_loss": 0.16650927066802979, + "eval_runtime": 44.1127, + "eval_samples_per_second": 33.528, + "eval_steps_per_second": 33.528, + "step": 8160 + }, + { + "epoch": 2.45, + "grad_norm": 9.75783634185791, + "learning_rate": 3.642377468176807e-06, + "loss": 1.1418, + "step": 8161 + }, + { + "epoch": 2.45, + "grad_norm": 15.587701797485352, + "learning_rate": 3.640372857572417e-06, + "loss": 0.7225, + "step": 8162 + }, + { + "epoch": 2.45, + "grad_norm": 17.10492706298828, + "learning_rate": 3.6383682469680265e-06, + "loss": 1.1976, + "step": 8163 + }, + { + "epoch": 2.45, + "grad_norm": 23.952566146850586, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.2036, + "step": 8164 + }, + { + "epoch": 2.45, + "grad_norm": 17.270137786865234, + "learning_rate": 3.6343590257592467e-06, + "loss": 1.6207, + "step": 8165 + }, + { + "epoch": 2.46, + "grad_norm": 16.140460968017578, + "learning_rate": 3.6323544151548563e-06, + "loss": 0.9803, + "step": 8166 + }, + { + "epoch": 2.46, + "grad_norm": 42.045997619628906, + "learning_rate": 3.6303498045504664e-06, + "loss": 2.4721, + "step": 8167 + }, + { + "epoch": 2.46, + "grad_norm": 13.348779678344727, + "learning_rate": 3.628345193946076e-06, + "loss": 1.1113, + "step": 8168 + }, + { + "epoch": 2.46, + "grad_norm": 36.904014587402344, + "learning_rate": 3.6263405833416857e-06, + "loss": 2.5081, + "step": 8169 + }, + { + "epoch": 2.46, + "grad_norm": 23.20426368713379, + "learning_rate": 3.6243359727372962e-06, + "loss": 1.0172, + "step": 8170 + }, + { + "epoch": 2.46, + "grad_norm": 16.895856857299805, + "learning_rate": 3.622331362132906e-06, + "loss": 1.0231, + "step": 8171 + }, + { + "epoch": 2.46, + "grad_norm": 37.31813049316406, + "learning_rate": 3.6203267515285156e-06, + "loss": 1.3518, + "step": 8172 + }, + { + "epoch": 2.46, + "grad_norm": 19.724748611450195, + "learning_rate": 3.618322140924126e-06, + "loss": 1.2338, + "step": 8173 + }, + { + "epoch": 2.46, + "grad_norm": 19.960813522338867, + "learning_rate": 3.6163175303197357e-06, + "loss": 1.4858, + "step": 8174 + }, + { + "epoch": 2.46, + "grad_norm": 11.546113967895508, + "learning_rate": 3.6143129197153454e-06, + "loss": 1.2553, + "step": 8175 + }, + { + "epoch": 2.46, + "grad_norm": 18.865678787231445, + "learning_rate": 3.6123083091109555e-06, + "loss": 1.2506, + "step": 8176 + }, + { + "epoch": 2.46, + "grad_norm": 22.531909942626953, + "learning_rate": 3.6103036985065656e-06, + "loss": 1.6922, + "step": 8177 + }, + { + "epoch": 2.46, + "grad_norm": 20.21176528930664, + "learning_rate": 3.6082990879021752e-06, + "loss": 0.8544, + "step": 8178 + }, + { + "epoch": 2.46, + "grad_norm": 15.008464813232422, + "learning_rate": 3.6062944772977853e-06, + "loss": 1.3876, + "step": 8179 + }, + { + "epoch": 2.46, + "grad_norm": 24.066457748413086, + "learning_rate": 3.604289866693395e-06, + "loss": 1.4522, + "step": 8180 + }, + { + "epoch": 2.46, + "grad_norm": 7.842005729675293, + "learning_rate": 3.6022852560890046e-06, + "loss": 0.5896, + "step": 8181 + }, + { + "epoch": 2.46, + "grad_norm": 13.021004676818848, + "learning_rate": 3.600280645484615e-06, + "loss": 0.7843, + "step": 8182 + }, + { + "epoch": 2.46, + "grad_norm": 21.92922592163086, + "learning_rate": 3.598276034880225e-06, + "loss": 2.2506, + "step": 8183 + }, + { + "epoch": 2.46, + "grad_norm": 11.565603256225586, + "learning_rate": 3.5962714242758345e-06, + "loss": 1.0158, + "step": 8184 + }, + { + "epoch": 2.46, + "grad_norm": 18.74056053161621, + "learning_rate": 3.594266813671445e-06, + "loss": 0.9444, + "step": 8185 + }, + { + "epoch": 2.46, + "grad_norm": 14.562174797058105, + "learning_rate": 3.5922622030670546e-06, + "loss": 0.7885, + "step": 8186 + }, + { + "epoch": 2.46, + "grad_norm": 37.87458038330078, + "learning_rate": 3.5902575924626643e-06, + "loss": 1.1931, + "step": 8187 + }, + { + "epoch": 2.46, + "grad_norm": 14.50774097442627, + "learning_rate": 3.5882529818582744e-06, + "loss": 1.1908, + "step": 8188 + }, + { + "epoch": 2.46, + "grad_norm": 37.05324172973633, + "learning_rate": 3.5862483712538845e-06, + "loss": 2.294, + "step": 8189 + }, + { + "epoch": 2.46, + "grad_norm": 7.687615394592285, + "learning_rate": 3.584243760649494e-06, + "loss": 0.7946, + "step": 8190 + }, + { + "epoch": 2.46, + "grad_norm": 10.446063995361328, + "learning_rate": 3.582239150045104e-06, + "loss": 1.2933, + "step": 8191 + }, + { + "epoch": 2.46, + "grad_norm": 25.05036735534668, + "learning_rate": 3.580234539440714e-06, + "loss": 1.8224, + "step": 8192 + }, + { + "epoch": 2.46, + "grad_norm": 12.898480415344238, + "learning_rate": 3.5782299288363235e-06, + "loss": 1.3031, + "step": 8193 + }, + { + "epoch": 2.46, + "grad_norm": 23.968446731567383, + "learning_rate": 3.576225318231934e-06, + "loss": 2.3017, + "step": 8194 + }, + { + "epoch": 2.46, + "grad_norm": 16.672903060913086, + "learning_rate": 3.5742207076275437e-06, + "loss": 0.9025, + "step": 8195 + }, + { + "epoch": 2.46, + "grad_norm": 10.29025936126709, + "learning_rate": 3.5722160970231534e-06, + "loss": 1.0269, + "step": 8196 + }, + { + "epoch": 2.46, + "grad_norm": 11.360820770263672, + "learning_rate": 3.570211486418764e-06, + "loss": 1.7671, + "step": 8197 + }, + { + "epoch": 2.46, + "grad_norm": 28.387712478637695, + "learning_rate": 3.5682068758143735e-06, + "loss": 0.9867, + "step": 8198 + }, + { + "epoch": 2.47, + "grad_norm": 37.20726776123047, + "learning_rate": 3.566202265209983e-06, + "loss": 2.4591, + "step": 8199 + }, + { + "epoch": 2.47, + "grad_norm": 8.600688934326172, + "learning_rate": 3.5641976546055933e-06, + "loss": 0.8608, + "step": 8200 + }, + { + "epoch": 2.47, + "grad_norm": 42.92110824584961, + "learning_rate": 3.562193044001203e-06, + "loss": 1.5033, + "step": 8201 + }, + { + "epoch": 2.47, + "grad_norm": 9.241715431213379, + "learning_rate": 3.560188433396813e-06, + "loss": 0.8824, + "step": 8202 + }, + { + "epoch": 2.47, + "grad_norm": 18.05348777770996, + "learning_rate": 3.558183822792423e-06, + "loss": 0.8519, + "step": 8203 + }, + { + "epoch": 2.47, + "grad_norm": 13.041126251220703, + "learning_rate": 3.5561792121880328e-06, + "loss": 0.9648, + "step": 8204 + }, + { + "epoch": 2.47, + "grad_norm": 15.101016998291016, + "learning_rate": 3.5541746015836424e-06, + "loss": 1.7081, + "step": 8205 + }, + { + "epoch": 2.47, + "grad_norm": 21.521345138549805, + "learning_rate": 3.5521699909792525e-06, + "loss": 2.7192, + "step": 8206 + }, + { + "epoch": 2.47, + "grad_norm": 27.694257736206055, + "learning_rate": 3.5501653803748626e-06, + "loss": 1.5167, + "step": 8207 + }, + { + "epoch": 2.47, + "grad_norm": 10.351465225219727, + "learning_rate": 3.5481607697704722e-06, + "loss": 1.3463, + "step": 8208 + }, + { + "epoch": 2.47, + "grad_norm": 29.087156295776367, + "learning_rate": 3.546156159166082e-06, + "loss": 1.397, + "step": 8209 + }, + { + "epoch": 2.47, + "grad_norm": 14.024823188781738, + "learning_rate": 3.5441515485616924e-06, + "loss": 0.8097, + "step": 8210 + }, + { + "epoch": 2.47, + "grad_norm": 11.973261833190918, + "learning_rate": 3.542146937957302e-06, + "loss": 0.7829, + "step": 8211 + }, + { + "epoch": 2.47, + "grad_norm": 8.828713417053223, + "learning_rate": 3.5401423273529117e-06, + "loss": 0.7569, + "step": 8212 + }, + { + "epoch": 2.47, + "grad_norm": 32.27656173706055, + "learning_rate": 3.538137716748522e-06, + "loss": 0.9883, + "step": 8213 + }, + { + "epoch": 2.47, + "grad_norm": 54.36296463012695, + "learning_rate": 3.536133106144132e-06, + "loss": 2.0888, + "step": 8214 + }, + { + "epoch": 2.47, + "grad_norm": 30.008255004882812, + "learning_rate": 3.5341284955397416e-06, + "loss": 1.1675, + "step": 8215 + }, + { + "epoch": 2.47, + "grad_norm": 20.43587875366211, + "learning_rate": 3.5321238849353517e-06, + "loss": 1.0271, + "step": 8216 + }, + { + "epoch": 2.47, + "grad_norm": 9.914734840393066, + "learning_rate": 3.5301192743309613e-06, + "loss": 1.4436, + "step": 8217 + }, + { + "epoch": 2.47, + "grad_norm": 10.097274780273438, + "learning_rate": 3.5281146637265714e-06, + "loss": 1.2491, + "step": 8218 + }, + { + "epoch": 2.47, + "grad_norm": 13.336429595947266, + "learning_rate": 3.5261100531221815e-06, + "loss": 1.1802, + "step": 8219 + }, + { + "epoch": 2.47, + "grad_norm": 14.391281127929688, + "learning_rate": 3.524105442517791e-06, + "loss": 1.0317, + "step": 8220 + }, + { + "epoch": 2.47, + "grad_norm": 21.83466339111328, + "learning_rate": 3.522100831913401e-06, + "loss": 2.0406, + "step": 8221 + }, + { + "epoch": 2.47, + "grad_norm": 13.69311809539795, + "learning_rate": 3.5200962213090113e-06, + "loss": 1.0287, + "step": 8222 + }, + { + "epoch": 2.47, + "grad_norm": 14.4343900680542, + "learning_rate": 3.518091610704621e-06, + "loss": 1.1169, + "step": 8223 + }, + { + "epoch": 2.47, + "grad_norm": 19.60431671142578, + "learning_rate": 3.5160870001002306e-06, + "loss": 1.2294, + "step": 8224 + }, + { + "epoch": 2.47, + "grad_norm": 8.042855262756348, + "learning_rate": 3.5140823894958407e-06, + "loss": 0.8052, + "step": 8225 + }, + { + "epoch": 2.47, + "grad_norm": 13.775824546813965, + "learning_rate": 3.512077778891451e-06, + "loss": 1.0722, + "step": 8226 + }, + { + "epoch": 2.47, + "grad_norm": 15.00515079498291, + "learning_rate": 3.5100731682870605e-06, + "loss": 1.0372, + "step": 8227 + }, + { + "epoch": 2.47, + "grad_norm": 52.58286666870117, + "learning_rate": 3.5080685576826705e-06, + "loss": 4.7955, + "step": 8228 + }, + { + "epoch": 2.47, + "grad_norm": 24.154563903808594, + "learning_rate": 3.50606394707828e-06, + "loss": 1.4256, + "step": 8229 + }, + { + "epoch": 2.47, + "grad_norm": 13.502425193786621, + "learning_rate": 3.50405933647389e-06, + "loss": 1.381, + "step": 8230 + }, + { + "epoch": 2.47, + "grad_norm": 9.207113265991211, + "learning_rate": 3.5020547258695004e-06, + "loss": 0.4176, + "step": 8231 + }, + { + "epoch": 2.48, + "grad_norm": 21.608287811279297, + "learning_rate": 3.50005011526511e-06, + "loss": 0.7372, + "step": 8232 + }, + { + "epoch": 2.48, + "grad_norm": 13.105839729309082, + "learning_rate": 3.4980455046607197e-06, + "loss": 0.719, + "step": 8233 + }, + { + "epoch": 2.48, + "grad_norm": 17.119613647460938, + "learning_rate": 3.49604089405633e-06, + "loss": 1.2089, + "step": 8234 + }, + { + "epoch": 2.48, + "grad_norm": 129.32228088378906, + "learning_rate": 3.49403628345194e-06, + "loss": 2.5109, + "step": 8235 + }, + { + "epoch": 2.48, + "grad_norm": 11.509007453918457, + "learning_rate": 3.4920316728475495e-06, + "loss": 1.1199, + "step": 8236 + }, + { + "epoch": 2.48, + "grad_norm": 17.33429718017578, + "learning_rate": 3.4900270622431596e-06, + "loss": 1.3015, + "step": 8237 + }, + { + "epoch": 2.48, + "grad_norm": 28.055233001708984, + "learning_rate": 3.4880224516387697e-06, + "loss": 2.121, + "step": 8238 + }, + { + "epoch": 2.48, + "grad_norm": 22.95208740234375, + "learning_rate": 3.4860178410343794e-06, + "loss": 1.7331, + "step": 8239 + }, + { + "epoch": 2.48, + "grad_norm": 12.567434310913086, + "learning_rate": 3.4840132304299894e-06, + "loss": 1.5196, + "step": 8240 + }, + { + "epoch": 2.48, + "grad_norm": 40.08758544921875, + "learning_rate": 3.482008619825599e-06, + "loss": 1.1594, + "step": 8241 + }, + { + "epoch": 2.48, + "grad_norm": 17.78892707824707, + "learning_rate": 3.4800040092212088e-06, + "loss": 1.0566, + "step": 8242 + }, + { + "epoch": 2.48, + "grad_norm": 28.924680709838867, + "learning_rate": 3.4779993986168193e-06, + "loss": 1.722, + "step": 8243 + }, + { + "epoch": 2.48, + "grad_norm": 13.244800567626953, + "learning_rate": 3.475994788012429e-06, + "loss": 1.5066, + "step": 8244 + }, + { + "epoch": 2.48, + "grad_norm": 46.74547576904297, + "learning_rate": 3.4739901774080386e-06, + "loss": 1.802, + "step": 8245 + }, + { + "epoch": 2.48, + "grad_norm": 19.008142471313477, + "learning_rate": 3.4719855668036482e-06, + "loss": 1.4982, + "step": 8246 + }, + { + "epoch": 2.48, + "grad_norm": 16.53547477722168, + "learning_rate": 3.4699809561992588e-06, + "loss": 1.4899, + "step": 8247 + }, + { + "epoch": 2.48, + "grad_norm": 18.363264083862305, + "learning_rate": 3.4679763455948684e-06, + "loss": 1.022, + "step": 8248 + }, + { + "epoch": 2.48, + "grad_norm": 7.702117443084717, + "learning_rate": 3.465971734990478e-06, + "loss": 1.3944, + "step": 8249 + }, + { + "epoch": 2.48, + "grad_norm": 8.90109634399414, + "learning_rate": 3.4639671243860886e-06, + "loss": 0.8569, + "step": 8250 + }, + { + "epoch": 2.48, + "grad_norm": 34.664485931396484, + "learning_rate": 3.4619625137816982e-06, + "loss": 1.817, + "step": 8251 + }, + { + "epoch": 2.48, + "grad_norm": 12.355849266052246, + "learning_rate": 3.459957903177308e-06, + "loss": 0.9937, + "step": 8252 + }, + { + "epoch": 2.48, + "grad_norm": 12.493422508239746, + "learning_rate": 3.457953292572918e-06, + "loss": 1.1074, + "step": 8253 + }, + { + "epoch": 2.48, + "grad_norm": 20.661190032958984, + "learning_rate": 3.4559486819685277e-06, + "loss": 2.2912, + "step": 8254 + }, + { + "epoch": 2.48, + "grad_norm": 19.867286682128906, + "learning_rate": 3.4539440713641377e-06, + "loss": 1.6991, + "step": 8255 + }, + { + "epoch": 2.48, + "grad_norm": 13.034092903137207, + "learning_rate": 3.451939460759748e-06, + "loss": 0.8973, + "step": 8256 + }, + { + "epoch": 2.48, + "grad_norm": 14.20175552368164, + "learning_rate": 3.4499348501553575e-06, + "loss": 1.1049, + "step": 8257 + }, + { + "epoch": 2.48, + "grad_norm": 12.550244331359863, + "learning_rate": 3.447930239550967e-06, + "loss": 1.1492, + "step": 8258 + }, + { + "epoch": 2.48, + "grad_norm": 13.553984642028809, + "learning_rate": 3.4459256289465777e-06, + "loss": 1.1702, + "step": 8259 + }, + { + "epoch": 2.48, + "grad_norm": 17.68621826171875, + "learning_rate": 3.4439210183421873e-06, + "loss": 1.4533, + "step": 8260 + }, + { + "epoch": 2.48, + "grad_norm": 19.83187484741211, + "learning_rate": 3.441916407737797e-06, + "loss": 1.4019, + "step": 8261 + }, + { + "epoch": 2.48, + "grad_norm": 11.274413108825684, + "learning_rate": 3.4399117971334075e-06, + "loss": 1.7506, + "step": 8262 + }, + { + "epoch": 2.48, + "grad_norm": 27.54680061340332, + "learning_rate": 3.437907186529017e-06, + "loss": 1.5858, + "step": 8263 + }, + { + "epoch": 2.48, + "grad_norm": 9.336912155151367, + "learning_rate": 3.435902575924627e-06, + "loss": 0.7806, + "step": 8264 + }, + { + "epoch": 2.48, + "grad_norm": 53.21189498901367, + "learning_rate": 3.433897965320237e-06, + "loss": 1.7986, + "step": 8265 + }, + { + "epoch": 2.49, + "grad_norm": 9.917101860046387, + "learning_rate": 3.4318933547158465e-06, + "loss": 0.8816, + "step": 8266 + }, + { + "epoch": 2.49, + "grad_norm": 8.793371200561523, + "learning_rate": 3.4298887441114566e-06, + "loss": 0.8211, + "step": 8267 + }, + { + "epoch": 2.49, + "grad_norm": 6.4236626625061035, + "learning_rate": 3.4278841335070667e-06, + "loss": 0.7913, + "step": 8268 + }, + { + "epoch": 2.49, + "grad_norm": 22.39311408996582, + "learning_rate": 3.4258795229026764e-06, + "loss": 1.1071, + "step": 8269 + }, + { + "epoch": 2.49, + "grad_norm": 15.763972282409668, + "learning_rate": 3.423874912298286e-06, + "loss": 0.7142, + "step": 8270 + }, + { + "epoch": 2.49, + "grad_norm": 16.8847713470459, + "learning_rate": 3.4218703016938965e-06, + "loss": 1.2469, + "step": 8271 + }, + { + "epoch": 2.49, + "grad_norm": 9.058008193969727, + "learning_rate": 3.419865691089506e-06, + "loss": 1.3107, + "step": 8272 + }, + { + "epoch": 2.49, + "grad_norm": 19.072185516357422, + "learning_rate": 3.417861080485116e-06, + "loss": 1.2581, + "step": 8273 + }, + { + "epoch": 2.49, + "grad_norm": 30.158096313476562, + "learning_rate": 3.4158564698807264e-06, + "loss": 1.2928, + "step": 8274 + }, + { + "epoch": 2.49, + "grad_norm": 11.686331748962402, + "learning_rate": 3.413851859276336e-06, + "loss": 1.0867, + "step": 8275 + }, + { + "epoch": 2.49, + "grad_norm": 13.757401466369629, + "learning_rate": 3.4118472486719457e-06, + "loss": 1.5826, + "step": 8276 + }, + { + "epoch": 2.49, + "grad_norm": 18.740379333496094, + "learning_rate": 3.4098426380675558e-06, + "loss": 1.2056, + "step": 8277 + }, + { + "epoch": 2.49, + "grad_norm": 32.20988082885742, + "learning_rate": 3.4078380274631654e-06, + "loss": 1.3858, + "step": 8278 + }, + { + "epoch": 2.49, + "grad_norm": 13.55139446258545, + "learning_rate": 3.4058334168587755e-06, + "loss": 1.0321, + "step": 8279 + }, + { + "epoch": 2.49, + "grad_norm": 10.795275688171387, + "learning_rate": 3.4038288062543856e-06, + "loss": 1.2745, + "step": 8280 + }, + { + "epoch": 2.49, + "eval_loss": 0.17804668843746185, + "eval_runtime": 43.714, + "eval_samples_per_second": 33.834, + "eval_steps_per_second": 33.834, + "step": 8280 + }, + { + "epoch": 2.49, + "grad_norm": 11.243696212768555, + "learning_rate": 3.4018241956499953e-06, + "loss": 0.8101, + "step": 8281 + }, + { + "epoch": 2.49, + "grad_norm": 10.823609352111816, + "learning_rate": 3.399819585045605e-06, + "loss": 2.0621, + "step": 8282 + }, + { + "epoch": 2.49, + "grad_norm": 52.87884521484375, + "learning_rate": 3.3978149744412154e-06, + "loss": 2.0929, + "step": 8283 + }, + { + "epoch": 2.49, + "grad_norm": 11.563464164733887, + "learning_rate": 3.395810363836825e-06, + "loss": 0.8609, + "step": 8284 + }, + { + "epoch": 2.49, + "grad_norm": 39.08991241455078, + "learning_rate": 3.3938057532324348e-06, + "loss": 1.3453, + "step": 8285 + }, + { + "epoch": 2.49, + "grad_norm": 72.77201843261719, + "learning_rate": 3.391801142628045e-06, + "loss": 1.5782, + "step": 8286 + }, + { + "epoch": 2.49, + "grad_norm": 44.493431091308594, + "learning_rate": 3.389796532023655e-06, + "loss": 1.312, + "step": 8287 + }, + { + "epoch": 2.49, + "grad_norm": 49.612998962402344, + "learning_rate": 3.3877919214192646e-06, + "loss": 2.2307, + "step": 8288 + }, + { + "epoch": 2.49, + "grad_norm": 30.34926986694336, + "learning_rate": 3.3857873108148743e-06, + "loss": 1.1781, + "step": 8289 + }, + { + "epoch": 2.49, + "grad_norm": 9.271419525146484, + "learning_rate": 3.3837827002104843e-06, + "loss": 0.9995, + "step": 8290 + }, + { + "epoch": 2.49, + "grad_norm": 16.43708610534668, + "learning_rate": 3.3817780896060944e-06, + "loss": 1.193, + "step": 8291 + }, + { + "epoch": 2.49, + "grad_norm": 15.72551441192627, + "learning_rate": 3.379773479001704e-06, + "loss": 1.3098, + "step": 8292 + }, + { + "epoch": 2.49, + "grad_norm": 7.720631122589111, + "learning_rate": 3.377768868397314e-06, + "loss": 0.8755, + "step": 8293 + }, + { + "epoch": 2.49, + "grad_norm": 27.38102149963379, + "learning_rate": 3.375764257792924e-06, + "loss": 1.4355, + "step": 8294 + }, + { + "epoch": 2.49, + "grad_norm": 12.947731018066406, + "learning_rate": 3.3737596471885335e-06, + "loss": 1.0127, + "step": 8295 + }, + { + "epoch": 2.49, + "grad_norm": 18.314491271972656, + "learning_rate": 3.371755036584144e-06, + "loss": 1.1366, + "step": 8296 + }, + { + "epoch": 2.49, + "grad_norm": 20.400217056274414, + "learning_rate": 3.3697504259797537e-06, + "loss": 1.2425, + "step": 8297 + }, + { + "epoch": 2.49, + "grad_norm": 21.475561141967773, + "learning_rate": 3.3677458153753633e-06, + "loss": 0.9925, + "step": 8298 + }, + { + "epoch": 2.5, + "grad_norm": 21.971511840820312, + "learning_rate": 3.365741204770974e-06, + "loss": 1.3472, + "step": 8299 + }, + { + "epoch": 2.5, + "grad_norm": 11.167675018310547, + "learning_rate": 3.3637365941665835e-06, + "loss": 1.0643, + "step": 8300 + }, + { + "epoch": 2.5, + "grad_norm": 55.813114166259766, + "learning_rate": 3.361731983562193e-06, + "loss": 2.2609, + "step": 8301 + }, + { + "epoch": 2.5, + "grad_norm": 14.630870819091797, + "learning_rate": 3.3597273729578032e-06, + "loss": 0.6388, + "step": 8302 + }, + { + "epoch": 2.5, + "grad_norm": 21.16858673095703, + "learning_rate": 3.357722762353413e-06, + "loss": 1.4621, + "step": 8303 + }, + { + "epoch": 2.5, + "grad_norm": 25.93169593811035, + "learning_rate": 3.355718151749023e-06, + "loss": 1.6922, + "step": 8304 + }, + { + "epoch": 2.5, + "grad_norm": 17.35527229309082, + "learning_rate": 3.353713541144633e-06, + "loss": 1.1722, + "step": 8305 + }, + { + "epoch": 2.5, + "grad_norm": 33.93075942993164, + "learning_rate": 3.3517089305402427e-06, + "loss": 1.9865, + "step": 8306 + }, + { + "epoch": 2.5, + "grad_norm": 39.736629486083984, + "learning_rate": 3.3497043199358524e-06, + "loss": 2.1434, + "step": 8307 + }, + { + "epoch": 2.5, + "grad_norm": 16.60782814025879, + "learning_rate": 3.347699709331463e-06, + "loss": 1.4811, + "step": 8308 + }, + { + "epoch": 2.5, + "grad_norm": 23.1959285736084, + "learning_rate": 3.3456950987270725e-06, + "loss": 1.8844, + "step": 8309 + }, + { + "epoch": 2.5, + "grad_norm": 15.47692584991455, + "learning_rate": 3.343690488122682e-06, + "loss": 1.1461, + "step": 8310 + }, + { + "epoch": 2.5, + "grad_norm": 10.405765533447266, + "learning_rate": 3.3416858775182927e-06, + "loss": 1.0609, + "step": 8311 + }, + { + "epoch": 2.5, + "grad_norm": 7.443643569946289, + "learning_rate": 3.3396812669139024e-06, + "loss": 0.5891, + "step": 8312 + }, + { + "epoch": 2.5, + "grad_norm": 40.757137298583984, + "learning_rate": 3.337676656309512e-06, + "loss": 1.1556, + "step": 8313 + }, + { + "epoch": 2.5, + "grad_norm": 23.71717071533203, + "learning_rate": 3.335672045705122e-06, + "loss": 1.3438, + "step": 8314 + }, + { + "epoch": 2.5, + "grad_norm": 9.829071998596191, + "learning_rate": 3.3336674351007318e-06, + "loss": 1.1778, + "step": 8315 + }, + { + "epoch": 2.5, + "grad_norm": 36.342872619628906, + "learning_rate": 3.331662824496342e-06, + "loss": 1.5935, + "step": 8316 + }, + { + "epoch": 2.5, + "grad_norm": 34.38883590698242, + "learning_rate": 3.329658213891952e-06, + "loss": 1.1699, + "step": 8317 + }, + { + "epoch": 2.5, + "grad_norm": 13.522592544555664, + "learning_rate": 3.3276536032875616e-06, + "loss": 1.3218, + "step": 8318 + }, + { + "epoch": 2.5, + "grad_norm": 18.872207641601562, + "learning_rate": 3.3256489926831713e-06, + "loss": 1.0661, + "step": 8319 + }, + { + "epoch": 2.5, + "grad_norm": 48.556331634521484, + "learning_rate": 3.3236443820787818e-06, + "loss": 2.3702, + "step": 8320 + }, + { + "epoch": 2.5, + "grad_norm": 26.391420364379883, + "learning_rate": 3.3216397714743914e-06, + "loss": 1.0047, + "step": 8321 + }, + { + "epoch": 2.5, + "grad_norm": 10.691487312316895, + "learning_rate": 3.319635160870001e-06, + "loss": 0.6147, + "step": 8322 + }, + { + "epoch": 2.5, + "grad_norm": 11.963746070861816, + "learning_rate": 3.3176305502656116e-06, + "loss": 1.4303, + "step": 8323 + }, + { + "epoch": 2.5, + "grad_norm": 14.355230331420898, + "learning_rate": 3.3156259396612213e-06, + "loss": 1.6003, + "step": 8324 + }, + { + "epoch": 2.5, + "grad_norm": 14.952115058898926, + "learning_rate": 3.313621329056831e-06, + "loss": 1.2936, + "step": 8325 + }, + { + "epoch": 2.5, + "grad_norm": 11.36012077331543, + "learning_rate": 3.311616718452441e-06, + "loss": 1.3233, + "step": 8326 + }, + { + "epoch": 2.5, + "grad_norm": 73.88568115234375, + "learning_rate": 3.3096121078480507e-06, + "loss": 2.0371, + "step": 8327 + }, + { + "epoch": 2.5, + "grad_norm": 7.392289638519287, + "learning_rate": 3.3076074972436608e-06, + "loss": 0.817, + "step": 8328 + }, + { + "epoch": 2.5, + "grad_norm": 26.323123931884766, + "learning_rate": 3.305602886639271e-06, + "loss": 1.2377, + "step": 8329 + }, + { + "epoch": 2.5, + "grad_norm": 6.049129486083984, + "learning_rate": 3.3035982760348805e-06, + "loss": 0.725, + "step": 8330 + }, + { + "epoch": 2.5, + "grad_norm": 18.688451766967773, + "learning_rate": 3.30159366543049e-06, + "loss": 1.0841, + "step": 8331 + }, + { + "epoch": 2.51, + "grad_norm": 19.779504776000977, + "learning_rate": 3.2995890548261e-06, + "loss": 1.6004, + "step": 8332 + }, + { + "epoch": 2.51, + "grad_norm": 37.673744201660156, + "learning_rate": 3.2975844442217103e-06, + "loss": 1.7093, + "step": 8333 + }, + { + "epoch": 2.51, + "grad_norm": 15.679644584655762, + "learning_rate": 3.29557983361732e-06, + "loss": 1.4718, + "step": 8334 + }, + { + "epoch": 2.51, + "grad_norm": 17.381481170654297, + "learning_rate": 3.2935752230129297e-06, + "loss": 2.5925, + "step": 8335 + }, + { + "epoch": 2.51, + "grad_norm": 9.106160163879395, + "learning_rate": 3.29157061240854e-06, + "loss": 0.8652, + "step": 8336 + }, + { + "epoch": 2.51, + "grad_norm": 25.941999435424805, + "learning_rate": 3.28956600180415e-06, + "loss": 0.8842, + "step": 8337 + }, + { + "epoch": 2.51, + "grad_norm": 9.563976287841797, + "learning_rate": 3.2875613911997595e-06, + "loss": 1.6659, + "step": 8338 + }, + { + "epoch": 2.51, + "grad_norm": 46.76715087890625, + "learning_rate": 3.2855567805953696e-06, + "loss": 1.8769, + "step": 8339 + }, + { + "epoch": 2.51, + "grad_norm": 20.058713912963867, + "learning_rate": 3.2835521699909797e-06, + "loss": 1.3938, + "step": 8340 + }, + { + "epoch": 2.51, + "grad_norm": 41.1780891418457, + "learning_rate": 3.2815475593865893e-06, + "loss": 1.9891, + "step": 8341 + }, + { + "epoch": 2.51, + "grad_norm": 14.344354629516602, + "learning_rate": 3.2795429487821994e-06, + "loss": 1.4836, + "step": 8342 + }, + { + "epoch": 2.51, + "grad_norm": 12.271919250488281, + "learning_rate": 3.277538338177809e-06, + "loss": 1.1799, + "step": 8343 + }, + { + "epoch": 2.51, + "grad_norm": 10.539395332336426, + "learning_rate": 3.2755337275734187e-06, + "loss": 1.0998, + "step": 8344 + }, + { + "epoch": 2.51, + "grad_norm": 18.392601013183594, + "learning_rate": 3.2735291169690292e-06, + "loss": 1.2702, + "step": 8345 + }, + { + "epoch": 2.51, + "grad_norm": 9.823795318603516, + "learning_rate": 3.271524506364639e-06, + "loss": 1.4675, + "step": 8346 + }, + { + "epoch": 2.51, + "grad_norm": 43.700782775878906, + "learning_rate": 3.2695198957602486e-06, + "loss": 2.5785, + "step": 8347 + }, + { + "epoch": 2.51, + "grad_norm": 18.977764129638672, + "learning_rate": 3.267515285155859e-06, + "loss": 2.1341, + "step": 8348 + }, + { + "epoch": 2.51, + "grad_norm": 11.805031776428223, + "learning_rate": 3.2655106745514687e-06, + "loss": 1.0303, + "step": 8349 + }, + { + "epoch": 2.51, + "grad_norm": 35.22747802734375, + "learning_rate": 3.2635060639470784e-06, + "loss": 1.0836, + "step": 8350 + }, + { + "epoch": 2.51, + "grad_norm": 48.45168685913086, + "learning_rate": 3.2615014533426885e-06, + "loss": 1.3489, + "step": 8351 + }, + { + "epoch": 2.51, + "grad_norm": 22.869895935058594, + "learning_rate": 3.2594968427382986e-06, + "loss": 2.6811, + "step": 8352 + }, + { + "epoch": 2.51, + "grad_norm": 18.353343963623047, + "learning_rate": 3.257492232133908e-06, + "loss": 0.976, + "step": 8353 + }, + { + "epoch": 2.51, + "grad_norm": 29.93402862548828, + "learning_rate": 3.2554876215295183e-06, + "loss": 1.1663, + "step": 8354 + }, + { + "epoch": 2.51, + "grad_norm": 13.383641242980957, + "learning_rate": 3.253483010925128e-06, + "loss": 0.7903, + "step": 8355 + }, + { + "epoch": 2.51, + "grad_norm": 6.8472747802734375, + "learning_rate": 3.2514784003207376e-06, + "loss": 1.1989, + "step": 8356 + }, + { + "epoch": 2.51, + "grad_norm": 32.73374938964844, + "learning_rate": 3.249473789716348e-06, + "loss": 0.9284, + "step": 8357 + }, + { + "epoch": 2.51, + "grad_norm": 11.246085166931152, + "learning_rate": 3.2474691791119578e-06, + "loss": 1.5323, + "step": 8358 + }, + { + "epoch": 2.51, + "grad_norm": 8.259552955627441, + "learning_rate": 3.2454645685075674e-06, + "loss": 0.8713, + "step": 8359 + }, + { + "epoch": 2.51, + "grad_norm": 28.066495895385742, + "learning_rate": 3.243459957903178e-06, + "loss": 1.1289, + "step": 8360 + }, + { + "epoch": 2.51, + "grad_norm": 16.770734786987305, + "learning_rate": 3.2414553472987876e-06, + "loss": 1.1323, + "step": 8361 + }, + { + "epoch": 2.51, + "grad_norm": 10.900749206542969, + "learning_rate": 3.2394507366943973e-06, + "loss": 0.7672, + "step": 8362 + }, + { + "epoch": 2.51, + "grad_norm": 21.28752326965332, + "learning_rate": 3.2374461260900074e-06, + "loss": 1.4755, + "step": 8363 + }, + { + "epoch": 2.51, + "grad_norm": 15.661283493041992, + "learning_rate": 3.2354415154856174e-06, + "loss": 1.3512, + "step": 8364 + }, + { + "epoch": 2.52, + "grad_norm": 16.886972427368164, + "learning_rate": 3.233436904881227e-06, + "loss": 1.4028, + "step": 8365 + }, + { + "epoch": 2.52, + "grad_norm": 17.953754425048828, + "learning_rate": 3.231432294276837e-06, + "loss": 1.2938, + "step": 8366 + }, + { + "epoch": 2.52, + "grad_norm": 9.600143432617188, + "learning_rate": 3.229427683672447e-06, + "loss": 0.7052, + "step": 8367 + }, + { + "epoch": 2.52, + "grad_norm": 19.288686752319336, + "learning_rate": 3.2274230730680565e-06, + "loss": 1.588, + "step": 8368 + }, + { + "epoch": 2.52, + "grad_norm": 12.002143859863281, + "learning_rate": 3.225418462463667e-06, + "loss": 0.5881, + "step": 8369 + }, + { + "epoch": 2.52, + "grad_norm": 18.510684967041016, + "learning_rate": 3.2234138518592767e-06, + "loss": 1.543, + "step": 8370 + }, + { + "epoch": 2.52, + "grad_norm": 31.459373474121094, + "learning_rate": 3.2214092412548863e-06, + "loss": 1.0949, + "step": 8371 + }, + { + "epoch": 2.52, + "grad_norm": 7.364177703857422, + "learning_rate": 3.219404630650496e-06, + "loss": 0.7282, + "step": 8372 + }, + { + "epoch": 2.52, + "grad_norm": 10.49648666381836, + "learning_rate": 3.2174000200461065e-06, + "loss": 0.6402, + "step": 8373 + }, + { + "epoch": 2.52, + "grad_norm": 25.187376022338867, + "learning_rate": 3.215395409441716e-06, + "loss": 1.4291, + "step": 8374 + }, + { + "epoch": 2.52, + "grad_norm": 26.098617553710938, + "learning_rate": 3.213390798837326e-06, + "loss": 1.861, + "step": 8375 + }, + { + "epoch": 2.52, + "grad_norm": 46.278839111328125, + "learning_rate": 3.2113861882329363e-06, + "loss": 1.8303, + "step": 8376 + }, + { + "epoch": 2.52, + "grad_norm": 12.867042541503906, + "learning_rate": 3.209381577628546e-06, + "loss": 1.0696, + "step": 8377 + }, + { + "epoch": 2.52, + "grad_norm": 20.583019256591797, + "learning_rate": 3.2073769670241557e-06, + "loss": 0.992, + "step": 8378 + }, + { + "epoch": 2.52, + "grad_norm": 56.57286071777344, + "learning_rate": 3.2053723564197657e-06, + "loss": 1.8542, + "step": 8379 + }, + { + "epoch": 2.52, + "grad_norm": 13.195068359375, + "learning_rate": 3.2033677458153754e-06, + "loss": 1.3004, + "step": 8380 + }, + { + "epoch": 2.52, + "grad_norm": 16.263944625854492, + "learning_rate": 3.2013631352109855e-06, + "loss": 1.7439, + "step": 8381 + }, + { + "epoch": 2.52, + "grad_norm": 11.452669143676758, + "learning_rate": 3.1993585246065956e-06, + "loss": 0.7889, + "step": 8382 + }, + { + "epoch": 2.52, + "grad_norm": 17.5844669342041, + "learning_rate": 3.1973539140022052e-06, + "loss": 1.2866, + "step": 8383 + }, + { + "epoch": 2.52, + "grad_norm": 12.583413124084473, + "learning_rate": 3.195349303397815e-06, + "loss": 0.9804, + "step": 8384 + }, + { + "epoch": 2.52, + "grad_norm": 13.724156379699707, + "learning_rate": 3.1933446927934254e-06, + "loss": 0.7936, + "step": 8385 + }, + { + "epoch": 2.52, + "grad_norm": 11.498400688171387, + "learning_rate": 3.191340082189035e-06, + "loss": 1.1136, + "step": 8386 + }, + { + "epoch": 2.52, + "grad_norm": 41.235206604003906, + "learning_rate": 3.1893354715846447e-06, + "loss": 1.1562, + "step": 8387 + }, + { + "epoch": 2.52, + "grad_norm": 26.88475799560547, + "learning_rate": 3.187330860980255e-06, + "loss": 2.1271, + "step": 8388 + }, + { + "epoch": 2.52, + "grad_norm": 16.73887825012207, + "learning_rate": 3.185326250375865e-06, + "loss": 1.5196, + "step": 8389 + }, + { + "epoch": 2.52, + "grad_norm": 20.1945858001709, + "learning_rate": 3.1833216397714746e-06, + "loss": 0.9805, + "step": 8390 + }, + { + "epoch": 2.52, + "grad_norm": 34.976627349853516, + "learning_rate": 3.1813170291670846e-06, + "loss": 1.1922, + "step": 8391 + }, + { + "epoch": 2.52, + "grad_norm": 36.179351806640625, + "learning_rate": 3.1793124185626943e-06, + "loss": 4.2282, + "step": 8392 + }, + { + "epoch": 2.52, + "grad_norm": 9.375896453857422, + "learning_rate": 3.1773078079583044e-06, + "loss": 0.9379, + "step": 8393 + }, + { + "epoch": 2.52, + "grad_norm": 132.8348846435547, + "learning_rate": 3.1753031973539145e-06, + "loss": 3.0371, + "step": 8394 + }, + { + "epoch": 2.52, + "grad_norm": 30.87514877319336, + "learning_rate": 3.173298586749524e-06, + "loss": 2.5639, + "step": 8395 + }, + { + "epoch": 2.52, + "grad_norm": 12.229470252990723, + "learning_rate": 3.171293976145134e-06, + "loss": 0.9845, + "step": 8396 + }, + { + "epoch": 2.52, + "grad_norm": 26.513559341430664, + "learning_rate": 3.1692893655407443e-06, + "loss": 1.1088, + "step": 8397 + }, + { + "epoch": 2.52, + "grad_norm": 7.9161553382873535, + "learning_rate": 3.167284754936354e-06, + "loss": 0.8013, + "step": 8398 + }, + { + "epoch": 2.53, + "grad_norm": 12.535621643066406, + "learning_rate": 3.1652801443319636e-06, + "loss": 1.0497, + "step": 8399 + }, + { + "epoch": 2.53, + "grad_norm": 9.409601211547852, + "learning_rate": 3.1632755337275737e-06, + "loss": 1.132, + "step": 8400 + }, + { + "epoch": 2.53, + "eval_loss": 0.17680850625038147, + "eval_runtime": 44.0695, + "eval_samples_per_second": 33.561, + "eval_steps_per_second": 33.561, + "step": 8400 + }, + { + "epoch": 2.53, + "grad_norm": 38.185997009277344, + "learning_rate": 3.1612709231231838e-06, + "loss": 1.8298, + "step": 8401 + }, + { + "epoch": 2.53, + "grad_norm": 103.94436645507812, + "learning_rate": 3.1592663125187934e-06, + "loss": 0.8306, + "step": 8402 + }, + { + "epoch": 2.53, + "grad_norm": 105.07266998291016, + "learning_rate": 3.1572617019144035e-06, + "loss": 1.827, + "step": 8403 + }, + { + "epoch": 2.53, + "grad_norm": 19.526813507080078, + "learning_rate": 3.155257091310013e-06, + "loss": 1.4366, + "step": 8404 + }, + { + "epoch": 2.53, + "grad_norm": 10.135507583618164, + "learning_rate": 3.153252480705623e-06, + "loss": 0.6992, + "step": 8405 + }, + { + "epoch": 2.53, + "grad_norm": 35.69138717651367, + "learning_rate": 3.1512478701012334e-06, + "loss": 1.5329, + "step": 8406 + }, + { + "epoch": 2.53, + "grad_norm": 74.143798828125, + "learning_rate": 3.149243259496843e-06, + "loss": 0.9668, + "step": 8407 + }, + { + "epoch": 2.53, + "grad_norm": 15.071893692016602, + "learning_rate": 3.1472386488924527e-06, + "loss": 1.1024, + "step": 8408 + }, + { + "epoch": 2.53, + "grad_norm": 11.287890434265137, + "learning_rate": 3.145234038288063e-06, + "loss": 1.2887, + "step": 8409 + }, + { + "epoch": 2.53, + "grad_norm": 63.98174285888672, + "learning_rate": 3.143229427683673e-06, + "loss": 1.8663, + "step": 8410 + }, + { + "epoch": 2.53, + "grad_norm": 15.564671516418457, + "learning_rate": 3.1412248170792825e-06, + "loss": 1.1972, + "step": 8411 + }, + { + "epoch": 2.53, + "grad_norm": 26.284500122070312, + "learning_rate": 3.1392202064748926e-06, + "loss": 1.1945, + "step": 8412 + }, + { + "epoch": 2.53, + "grad_norm": 81.35211181640625, + "learning_rate": 3.1372155958705027e-06, + "loss": 1.7933, + "step": 8413 + }, + { + "epoch": 2.53, + "grad_norm": 20.28238296508789, + "learning_rate": 3.1352109852661123e-06, + "loss": 1.6545, + "step": 8414 + }, + { + "epoch": 2.53, + "grad_norm": 11.033011436462402, + "learning_rate": 3.133206374661722e-06, + "loss": 0.8211, + "step": 8415 + }, + { + "epoch": 2.53, + "grad_norm": 17.557846069335938, + "learning_rate": 3.131201764057332e-06, + "loss": 1.2009, + "step": 8416 + }, + { + "epoch": 2.53, + "grad_norm": 17.877410888671875, + "learning_rate": 3.1291971534529417e-06, + "loss": 1.1617, + "step": 8417 + }, + { + "epoch": 2.53, + "grad_norm": 17.008520126342773, + "learning_rate": 3.127192542848552e-06, + "loss": 1.2213, + "step": 8418 + }, + { + "epoch": 2.53, + "grad_norm": 13.909770965576172, + "learning_rate": 3.125187932244162e-06, + "loss": 0.9544, + "step": 8419 + }, + { + "epoch": 2.53, + "grad_norm": 12.314075469970703, + "learning_rate": 3.1231833216397716e-06, + "loss": 1.1267, + "step": 8420 + }, + { + "epoch": 2.53, + "grad_norm": 8.14324951171875, + "learning_rate": 3.1211787110353812e-06, + "loss": 1.052, + "step": 8421 + }, + { + "epoch": 2.53, + "grad_norm": 9.060734748840332, + "learning_rate": 3.1191741004309917e-06, + "loss": 0.7107, + "step": 8422 + }, + { + "epoch": 2.53, + "grad_norm": 12.2525634765625, + "learning_rate": 3.1171694898266014e-06, + "loss": 0.9515, + "step": 8423 + }, + { + "epoch": 2.53, + "grad_norm": 18.487070083618164, + "learning_rate": 3.115164879222211e-06, + "loss": 0.9164, + "step": 8424 + }, + { + "epoch": 2.53, + "grad_norm": 21.11574363708496, + "learning_rate": 3.1131602686178216e-06, + "loss": 1.1415, + "step": 8425 + }, + { + "epoch": 2.53, + "grad_norm": 10.710432052612305, + "learning_rate": 3.1111556580134312e-06, + "loss": 0.6223, + "step": 8426 + }, + { + "epoch": 2.53, + "grad_norm": 12.434354782104492, + "learning_rate": 3.109151047409041e-06, + "loss": 0.7547, + "step": 8427 + }, + { + "epoch": 2.53, + "grad_norm": 5.420670032501221, + "learning_rate": 3.107146436804651e-06, + "loss": 0.913, + "step": 8428 + }, + { + "epoch": 2.53, + "grad_norm": 23.501205444335938, + "learning_rate": 3.1051418262002606e-06, + "loss": 1.7807, + "step": 8429 + }, + { + "epoch": 2.53, + "grad_norm": 17.01691436767578, + "learning_rate": 3.1031372155958707e-06, + "loss": 1.2807, + "step": 8430 + }, + { + "epoch": 2.53, + "grad_norm": 15.571081161499023, + "learning_rate": 3.101132604991481e-06, + "loss": 1.397, + "step": 8431 + }, + { + "epoch": 2.54, + "grad_norm": 32.079612731933594, + "learning_rate": 3.0991279943870905e-06, + "loss": 1.4421, + "step": 8432 + }, + { + "epoch": 2.54, + "grad_norm": 13.537861824035645, + "learning_rate": 3.0971233837827e-06, + "loss": 1.4324, + "step": 8433 + }, + { + "epoch": 2.54, + "grad_norm": 72.99842834472656, + "learning_rate": 3.0951187731783106e-06, + "loss": 2.2397, + "step": 8434 + }, + { + "epoch": 2.54, + "grad_norm": 32.24034118652344, + "learning_rate": 3.0931141625739203e-06, + "loss": 1.8425, + "step": 8435 + }, + { + "epoch": 2.54, + "grad_norm": 14.41550350189209, + "learning_rate": 3.09110955196953e-06, + "loss": 1.0483, + "step": 8436 + }, + { + "epoch": 2.54, + "grad_norm": 47.05146789550781, + "learning_rate": 3.0891049413651405e-06, + "loss": 1.3004, + "step": 8437 + }, + { + "epoch": 2.54, + "grad_norm": 25.591800689697266, + "learning_rate": 3.08710033076075e-06, + "loss": 1.1771, + "step": 8438 + }, + { + "epoch": 2.54, + "grad_norm": 9.942156791687012, + "learning_rate": 3.08509572015636e-06, + "loss": 0.5432, + "step": 8439 + }, + { + "epoch": 2.54, + "grad_norm": 22.738941192626953, + "learning_rate": 3.08309110955197e-06, + "loss": 2.3535, + "step": 8440 + }, + { + "epoch": 2.54, + "grad_norm": 14.746498107910156, + "learning_rate": 3.0810864989475795e-06, + "loss": 1.2366, + "step": 8441 + }, + { + "epoch": 2.54, + "grad_norm": 98.85680389404297, + "learning_rate": 3.0790818883431896e-06, + "loss": 2.4007, + "step": 8442 + }, + { + "epoch": 2.54, + "grad_norm": 23.468538284301758, + "learning_rate": 3.0770772777387997e-06, + "loss": 1.0986, + "step": 8443 + }, + { + "epoch": 2.54, + "grad_norm": 8.700865745544434, + "learning_rate": 3.0750726671344094e-06, + "loss": 1.1621, + "step": 8444 + }, + { + "epoch": 2.54, + "grad_norm": 22.677888870239258, + "learning_rate": 3.073068056530019e-06, + "loss": 1.2522, + "step": 8445 + }, + { + "epoch": 2.54, + "grad_norm": 21.384296417236328, + "learning_rate": 3.0710634459256295e-06, + "loss": 1.6951, + "step": 8446 + }, + { + "epoch": 2.54, + "grad_norm": 30.282161712646484, + "learning_rate": 3.069058835321239e-06, + "loss": 1.8486, + "step": 8447 + }, + { + "epoch": 2.54, + "grad_norm": 11.835555076599121, + "learning_rate": 3.067054224716849e-06, + "loss": 1.0966, + "step": 8448 + }, + { + "epoch": 2.54, + "grad_norm": 16.40022850036621, + "learning_rate": 3.0650496141124594e-06, + "loss": 1.55, + "step": 8449 + }, + { + "epoch": 2.54, + "grad_norm": 34.12657928466797, + "learning_rate": 3.063045003508069e-06, + "loss": 1.6477, + "step": 8450 + }, + { + "epoch": 2.54, + "grad_norm": 43.224571228027344, + "learning_rate": 3.0610403929036787e-06, + "loss": 1.4323, + "step": 8451 + }, + { + "epoch": 2.54, + "grad_norm": 21.87476921081543, + "learning_rate": 3.0590357822992888e-06, + "loss": 1.117, + "step": 8452 + }, + { + "epoch": 2.54, + "grad_norm": 12.986775398254395, + "learning_rate": 3.0570311716948984e-06, + "loss": 0.865, + "step": 8453 + }, + { + "epoch": 2.54, + "grad_norm": 12.08977222442627, + "learning_rate": 3.0550265610905085e-06, + "loss": 0.7838, + "step": 8454 + }, + { + "epoch": 2.54, + "grad_norm": 50.07052230834961, + "learning_rate": 3.053021950486118e-06, + "loss": 1.4847, + "step": 8455 + }, + { + "epoch": 2.54, + "grad_norm": 7.173743724822998, + "learning_rate": 3.0510173398817283e-06, + "loss": 0.6167, + "step": 8456 + }, + { + "epoch": 2.54, + "grad_norm": 38.1131591796875, + "learning_rate": 3.049012729277338e-06, + "loss": 2.517, + "step": 8457 + }, + { + "epoch": 2.54, + "grad_norm": 9.692411422729492, + "learning_rate": 3.0470081186729476e-06, + "loss": 1.0573, + "step": 8458 + }, + { + "epoch": 2.54, + "grad_norm": 41.6327018737793, + "learning_rate": 3.045003508068558e-06, + "loss": 1.4349, + "step": 8459 + }, + { + "epoch": 2.54, + "grad_norm": 29.187610626220703, + "learning_rate": 3.0429988974641677e-06, + "loss": 1.5777, + "step": 8460 + }, + { + "epoch": 2.54, + "grad_norm": 29.35192108154297, + "learning_rate": 3.0409942868597774e-06, + "loss": 1.8016, + "step": 8461 + }, + { + "epoch": 2.54, + "grad_norm": 20.445228576660156, + "learning_rate": 3.038989676255388e-06, + "loss": 0.7974, + "step": 8462 + }, + { + "epoch": 2.54, + "grad_norm": 18.133628845214844, + "learning_rate": 3.0369850656509976e-06, + "loss": 1.1352, + "step": 8463 + }, + { + "epoch": 2.54, + "grad_norm": 16.889158248901367, + "learning_rate": 3.0349804550466072e-06, + "loss": 1.4672, + "step": 8464 + }, + { + "epoch": 2.55, + "grad_norm": 20.985919952392578, + "learning_rate": 3.0329758444422173e-06, + "loss": 1.192, + "step": 8465 + }, + { + "epoch": 2.55, + "grad_norm": 9.734439849853516, + "learning_rate": 3.0309712338378274e-06, + "loss": 0.7422, + "step": 8466 + }, + { + "epoch": 2.55, + "grad_norm": 23.414813995361328, + "learning_rate": 3.028966623233437e-06, + "loss": 1.8114, + "step": 8467 + }, + { + "epoch": 2.55, + "grad_norm": 12.681184768676758, + "learning_rate": 3.026962012629047e-06, + "loss": 1.1328, + "step": 8468 + }, + { + "epoch": 2.55, + "grad_norm": 23.120763778686523, + "learning_rate": 3.024957402024657e-06, + "loss": 1.1875, + "step": 8469 + }, + { + "epoch": 2.55, + "grad_norm": 19.008502960205078, + "learning_rate": 3.0229527914202665e-06, + "loss": 1.2166, + "step": 8470 + }, + { + "epoch": 2.55, + "grad_norm": 22.20121192932129, + "learning_rate": 3.020948180815877e-06, + "loss": 1.4729, + "step": 8471 + }, + { + "epoch": 2.55, + "grad_norm": 16.53184700012207, + "learning_rate": 3.0189435702114866e-06, + "loss": 1.8034, + "step": 8472 + }, + { + "epoch": 2.55, + "grad_norm": 21.981178283691406, + "learning_rate": 3.0169389596070963e-06, + "loss": 2.2006, + "step": 8473 + }, + { + "epoch": 2.55, + "grad_norm": 36.195106506347656, + "learning_rate": 3.014934349002707e-06, + "loss": 2.1031, + "step": 8474 + }, + { + "epoch": 2.55, + "grad_norm": 33.1042366027832, + "learning_rate": 3.0129297383983165e-06, + "loss": 1.5522, + "step": 8475 + }, + { + "epoch": 2.55, + "grad_norm": 14.721949577331543, + "learning_rate": 3.010925127793926e-06, + "loss": 1.5655, + "step": 8476 + }, + { + "epoch": 2.55, + "grad_norm": 14.444605827331543, + "learning_rate": 3.0089205171895362e-06, + "loss": 1.2094, + "step": 8477 + }, + { + "epoch": 2.55, + "grad_norm": 14.284456253051758, + "learning_rate": 3.0069159065851463e-06, + "loss": 0.986, + "step": 8478 + }, + { + "epoch": 2.55, + "grad_norm": 12.816495895385742, + "learning_rate": 3.004911295980756e-06, + "loss": 1.0418, + "step": 8479 + }, + { + "epoch": 2.55, + "grad_norm": 67.11431884765625, + "learning_rate": 3.002906685376366e-06, + "loss": 1.5002, + "step": 8480 + }, + { + "epoch": 2.55, + "grad_norm": 24.34355926513672, + "learning_rate": 3.0009020747719757e-06, + "loss": 1.329, + "step": 8481 + }, + { + "epoch": 2.55, + "grad_norm": 11.59719467163086, + "learning_rate": 2.9988974641675854e-06, + "loss": 1.4363, + "step": 8482 + }, + { + "epoch": 2.55, + "grad_norm": 32.02231216430664, + "learning_rate": 2.996892853563196e-06, + "loss": 1.2263, + "step": 8483 + }, + { + "epoch": 2.55, + "grad_norm": 9.977734565734863, + "learning_rate": 2.9948882429588055e-06, + "loss": 0.6075, + "step": 8484 + }, + { + "epoch": 2.55, + "grad_norm": 48.880313873291016, + "learning_rate": 2.992883632354415e-06, + "loss": 1.5793, + "step": 8485 + }, + { + "epoch": 2.55, + "grad_norm": 7.916026592254639, + "learning_rate": 2.9908790217500257e-06, + "loss": 0.7116, + "step": 8486 + }, + { + "epoch": 2.55, + "grad_norm": 29.04728889465332, + "learning_rate": 2.9888744111456354e-06, + "loss": 1.2325, + "step": 8487 + }, + { + "epoch": 2.55, + "grad_norm": 13.980238914489746, + "learning_rate": 2.986869800541245e-06, + "loss": 1.496, + "step": 8488 + }, + { + "epoch": 2.55, + "grad_norm": 15.269109725952148, + "learning_rate": 2.984865189936855e-06, + "loss": 1.4313, + "step": 8489 + }, + { + "epoch": 2.55, + "grad_norm": 20.857440948486328, + "learning_rate": 2.9828605793324648e-06, + "loss": 1.5363, + "step": 8490 + }, + { + "epoch": 2.55, + "grad_norm": 14.627253532409668, + "learning_rate": 2.980855968728075e-06, + "loss": 1.3487, + "step": 8491 + }, + { + "epoch": 2.55, + "grad_norm": 7.229565620422363, + "learning_rate": 2.978851358123685e-06, + "loss": 0.8008, + "step": 8492 + }, + { + "epoch": 2.55, + "grad_norm": 29.895063400268555, + "learning_rate": 2.9768467475192946e-06, + "loss": 1.7014, + "step": 8493 + }, + { + "epoch": 2.55, + "grad_norm": 26.894834518432617, + "learning_rate": 2.9748421369149043e-06, + "loss": 1.8372, + "step": 8494 + }, + { + "epoch": 2.55, + "grad_norm": 50.57379150390625, + "learning_rate": 2.9728375263105148e-06, + "loss": 1.3365, + "step": 8495 + }, + { + "epoch": 2.55, + "grad_norm": 12.997958183288574, + "learning_rate": 2.9708329157061244e-06, + "loss": 1.9539, + "step": 8496 + }, + { + "epoch": 2.55, + "grad_norm": 11.23823356628418, + "learning_rate": 2.968828305101734e-06, + "loss": 1.2944, + "step": 8497 + }, + { + "epoch": 2.56, + "grad_norm": 21.072771072387695, + "learning_rate": 2.9668236944973438e-06, + "loss": 1.4985, + "step": 8498 + }, + { + "epoch": 2.56, + "grad_norm": 61.82595443725586, + "learning_rate": 2.9648190838929543e-06, + "loss": 2.2333, + "step": 8499 + }, + { + "epoch": 2.56, + "grad_norm": 26.898548126220703, + "learning_rate": 2.962814473288564e-06, + "loss": 1.2997, + "step": 8500 + }, + { + "epoch": 2.56, + "grad_norm": 14.761646270751953, + "learning_rate": 2.9608098626841736e-06, + "loss": 1.4087, + "step": 8501 + }, + { + "epoch": 2.56, + "grad_norm": 26.489471435546875, + "learning_rate": 2.9588052520797837e-06, + "loss": 1.6874, + "step": 8502 + }, + { + "epoch": 2.56, + "grad_norm": 25.354101181030273, + "learning_rate": 2.9568006414753938e-06, + "loss": 0.7809, + "step": 8503 + }, + { + "epoch": 2.56, + "grad_norm": 8.844145774841309, + "learning_rate": 2.9547960308710034e-06, + "loss": 0.9255, + "step": 8504 + }, + { + "epoch": 2.56, + "grad_norm": 18.32469940185547, + "learning_rate": 2.9527914202666135e-06, + "loss": 1.0907, + "step": 8505 + }, + { + "epoch": 2.56, + "grad_norm": 14.326888084411621, + "learning_rate": 2.950786809662223e-06, + "loss": 1.1902, + "step": 8506 + }, + { + "epoch": 2.56, + "grad_norm": 19.863540649414062, + "learning_rate": 2.948782199057833e-06, + "loss": 1.5195, + "step": 8507 + }, + { + "epoch": 2.56, + "grad_norm": 9.05129623413086, + "learning_rate": 2.9467775884534433e-06, + "loss": 0.7649, + "step": 8508 + }, + { + "epoch": 2.56, + "grad_norm": 11.893647193908691, + "learning_rate": 2.944772977849053e-06, + "loss": 1.4486, + "step": 8509 + }, + { + "epoch": 2.56, + "grad_norm": 18.236352920532227, + "learning_rate": 2.9427683672446626e-06, + "loss": 1.0206, + "step": 8510 + }, + { + "epoch": 2.56, + "grad_norm": 32.056034088134766, + "learning_rate": 2.940763756640273e-06, + "loss": 2.3092, + "step": 8511 + }, + { + "epoch": 2.56, + "grad_norm": 72.33465576171875, + "learning_rate": 2.938759146035883e-06, + "loss": 2.1425, + "step": 8512 + }, + { + "epoch": 2.56, + "grad_norm": 56.66347122192383, + "learning_rate": 2.9367545354314925e-06, + "loss": 1.1673, + "step": 8513 + }, + { + "epoch": 2.56, + "grad_norm": 14.660852432250977, + "learning_rate": 2.9347499248271026e-06, + "loss": 1.1077, + "step": 8514 + }, + { + "epoch": 2.56, + "grad_norm": 45.072975158691406, + "learning_rate": 2.9327453142227126e-06, + "loss": 1.7196, + "step": 8515 + }, + { + "epoch": 2.56, + "grad_norm": 15.052632331848145, + "learning_rate": 2.9307407036183223e-06, + "loss": 1.245, + "step": 8516 + }, + { + "epoch": 2.56, + "grad_norm": 17.311857223510742, + "learning_rate": 2.9287360930139324e-06, + "loss": 1.241, + "step": 8517 + }, + { + "epoch": 2.56, + "grad_norm": 71.18388366699219, + "learning_rate": 2.926731482409542e-06, + "loss": 1.4936, + "step": 8518 + }, + { + "epoch": 2.56, + "grad_norm": 14.947158813476562, + "learning_rate": 2.9247268718051517e-06, + "loss": 1.164, + "step": 8519 + }, + { + "epoch": 2.56, + "grad_norm": 39.9619140625, + "learning_rate": 2.9227222612007622e-06, + "loss": 2.3687, + "step": 8520 + }, + { + "epoch": 2.56, + "eval_loss": 0.17769810557365417, + "eval_runtime": 44.5571, + "eval_samples_per_second": 33.193, + "eval_steps_per_second": 33.193, + "step": 8520 + }, + { + "epoch": 2.56, + "grad_norm": 17.896486282348633, + "learning_rate": 2.920717650596372e-06, + "loss": 1.4077, + "step": 8521 + }, + { + "epoch": 2.56, + "grad_norm": 32.224422454833984, + "learning_rate": 2.9187130399919815e-06, + "loss": 1.8564, + "step": 8522 + }, + { + "epoch": 2.56, + "grad_norm": 20.0201358795166, + "learning_rate": 2.916708429387592e-06, + "loss": 0.6378, + "step": 8523 + }, + { + "epoch": 2.56, + "grad_norm": 8.714916229248047, + "learning_rate": 2.9147038187832017e-06, + "loss": 0.8691, + "step": 8524 + }, + { + "epoch": 2.56, + "grad_norm": 13.311516761779785, + "learning_rate": 2.9126992081788114e-06, + "loss": 1.0312, + "step": 8525 + }, + { + "epoch": 2.56, + "grad_norm": 23.491960525512695, + "learning_rate": 2.9106945975744215e-06, + "loss": 1.2836, + "step": 8526 + }, + { + "epoch": 2.56, + "grad_norm": 9.72035026550293, + "learning_rate": 2.9086899869700315e-06, + "loss": 1.1779, + "step": 8527 + }, + { + "epoch": 2.56, + "grad_norm": 14.758877754211426, + "learning_rate": 2.906685376365641e-06, + "loss": 1.3364, + "step": 8528 + }, + { + "epoch": 2.56, + "grad_norm": 9.038101196289062, + "learning_rate": 2.9046807657612513e-06, + "loss": 0.7359, + "step": 8529 + }, + { + "epoch": 2.56, + "grad_norm": 9.687514305114746, + "learning_rate": 2.902676155156861e-06, + "loss": 0.6275, + "step": 8530 + }, + { + "epoch": 2.56, + "grad_norm": 21.076562881469727, + "learning_rate": 2.9006715445524706e-06, + "loss": 1.3599, + "step": 8531 + }, + { + "epoch": 2.57, + "grad_norm": 8.555130004882812, + "learning_rate": 2.898666933948081e-06, + "loss": 0.9138, + "step": 8532 + }, + { + "epoch": 2.57, + "grad_norm": 30.38118553161621, + "learning_rate": 2.8966623233436908e-06, + "loss": 1.4925, + "step": 8533 + }, + { + "epoch": 2.57, + "grad_norm": 34.965641021728516, + "learning_rate": 2.8946577127393004e-06, + "loss": 1.3331, + "step": 8534 + }, + { + "epoch": 2.57, + "grad_norm": 10.746747016906738, + "learning_rate": 2.892653102134911e-06, + "loss": 1.2565, + "step": 8535 + }, + { + "epoch": 2.57, + "grad_norm": 62.186431884765625, + "learning_rate": 2.8906484915305206e-06, + "loss": 2.5688, + "step": 8536 + }, + { + "epoch": 2.57, + "grad_norm": 9.051580429077148, + "learning_rate": 2.8886438809261303e-06, + "loss": 0.7643, + "step": 8537 + }, + { + "epoch": 2.57, + "grad_norm": 22.37142562866211, + "learning_rate": 2.88663927032174e-06, + "loss": 1.8968, + "step": 8538 + }, + { + "epoch": 2.57, + "grad_norm": 15.524712562561035, + "learning_rate": 2.8846346597173504e-06, + "loss": 0.9424, + "step": 8539 + }, + { + "epoch": 2.57, + "grad_norm": 14.384515762329102, + "learning_rate": 2.88263004911296e-06, + "loss": 1.0429, + "step": 8540 + }, + { + "epoch": 2.57, + "grad_norm": 55.4445686340332, + "learning_rate": 2.8806254385085698e-06, + "loss": 2.7088, + "step": 8541 + }, + { + "epoch": 2.57, + "grad_norm": 11.720520973205566, + "learning_rate": 2.87862082790418e-06, + "loss": 0.7067, + "step": 8542 + }, + { + "epoch": 2.57, + "grad_norm": 18.14213752746582, + "learning_rate": 2.8766162172997895e-06, + "loss": 1.0006, + "step": 8543 + }, + { + "epoch": 2.57, + "grad_norm": 9.360933303833008, + "learning_rate": 2.8746116066953996e-06, + "loss": 1.1814, + "step": 8544 + }, + { + "epoch": 2.57, + "grad_norm": 54.820091247558594, + "learning_rate": 2.8726069960910097e-06, + "loss": 1.0313, + "step": 8545 + }, + { + "epoch": 2.57, + "grad_norm": 17.7171688079834, + "learning_rate": 2.8706023854866193e-06, + "loss": 0.5955, + "step": 8546 + }, + { + "epoch": 2.57, + "grad_norm": 26.265403747558594, + "learning_rate": 2.868597774882229e-06, + "loss": 1.2873, + "step": 8547 + }, + { + "epoch": 2.57, + "grad_norm": 51.058250427246094, + "learning_rate": 2.8665931642778395e-06, + "loss": 1.3523, + "step": 8548 + }, + { + "epoch": 2.57, + "grad_norm": 14.056093215942383, + "learning_rate": 2.864588553673449e-06, + "loss": 0.9431, + "step": 8549 + }, + { + "epoch": 2.57, + "grad_norm": 55.49951171875, + "learning_rate": 2.862583943069059e-06, + "loss": 1.6338, + "step": 8550 + }, + { + "epoch": 2.57, + "grad_norm": 12.471390724182129, + "learning_rate": 2.8605793324646693e-06, + "loss": 1.1413, + "step": 8551 + }, + { + "epoch": 2.57, + "grad_norm": 19.35071563720703, + "learning_rate": 2.858574721860279e-06, + "loss": 0.9613, + "step": 8552 + }, + { + "epoch": 2.57, + "grad_norm": 16.208166122436523, + "learning_rate": 2.8565701112558886e-06, + "loss": 1.5786, + "step": 8553 + }, + { + "epoch": 2.57, + "grad_norm": 24.832500457763672, + "learning_rate": 2.8545655006514987e-06, + "loss": 1.6098, + "step": 8554 + }, + { + "epoch": 2.57, + "grad_norm": 51.225799560546875, + "learning_rate": 2.8525608900471084e-06, + "loss": 1.4513, + "step": 8555 + }, + { + "epoch": 2.57, + "grad_norm": 13.274884223937988, + "learning_rate": 2.8505562794427185e-06, + "loss": 0.8535, + "step": 8556 + }, + { + "epoch": 2.57, + "grad_norm": 31.125516891479492, + "learning_rate": 2.8485516688383286e-06, + "loss": 1.388, + "step": 8557 + }, + { + "epoch": 2.57, + "grad_norm": 17.432456970214844, + "learning_rate": 2.8465470582339382e-06, + "loss": 1.5913, + "step": 8558 + }, + { + "epoch": 2.57, + "grad_norm": 24.500856399536133, + "learning_rate": 2.844542447629548e-06, + "loss": 2.2924, + "step": 8559 + }, + { + "epoch": 2.57, + "grad_norm": 14.529585838317871, + "learning_rate": 2.8425378370251584e-06, + "loss": 0.5999, + "step": 8560 + }, + { + "epoch": 2.57, + "grad_norm": 12.131428718566895, + "learning_rate": 2.840533226420768e-06, + "loss": 1.439, + "step": 8561 + }, + { + "epoch": 2.57, + "grad_norm": 8.231968879699707, + "learning_rate": 2.8385286158163777e-06, + "loss": 0.9683, + "step": 8562 + }, + { + "epoch": 2.57, + "grad_norm": 7.67575740814209, + "learning_rate": 2.836524005211988e-06, + "loss": 0.6245, + "step": 8563 + }, + { + "epoch": 2.57, + "grad_norm": 12.068075180053711, + "learning_rate": 2.834519394607598e-06, + "loss": 0.7527, + "step": 8564 + }, + { + "epoch": 2.58, + "grad_norm": 14.110102653503418, + "learning_rate": 2.8325147840032075e-06, + "loss": 1.7017, + "step": 8565 + }, + { + "epoch": 2.58, + "grad_norm": 38.348731994628906, + "learning_rate": 2.8305101733988176e-06, + "loss": 2.0444, + "step": 8566 + }, + { + "epoch": 2.58, + "grad_norm": 9.310153007507324, + "learning_rate": 2.8285055627944273e-06, + "loss": 1.0862, + "step": 8567 + }, + { + "epoch": 2.58, + "grad_norm": 18.77411460876465, + "learning_rate": 2.8265009521900374e-06, + "loss": 1.0017, + "step": 8568 + }, + { + "epoch": 2.58, + "grad_norm": 9.130143165588379, + "learning_rate": 2.8244963415856475e-06, + "loss": 1.1588, + "step": 8569 + }, + { + "epoch": 2.58, + "grad_norm": 37.241661071777344, + "learning_rate": 2.822491730981257e-06, + "loss": 1.1469, + "step": 8570 + }, + { + "epoch": 2.58, + "grad_norm": 19.131694793701172, + "learning_rate": 2.8204871203768668e-06, + "loss": 1.8664, + "step": 8571 + }, + { + "epoch": 2.58, + "grad_norm": 16.91837501525879, + "learning_rate": 2.8184825097724773e-06, + "loss": 1.2044, + "step": 8572 + }, + { + "epoch": 2.58, + "grad_norm": 12.339491844177246, + "learning_rate": 2.816477899168087e-06, + "loss": 0.8794, + "step": 8573 + }, + { + "epoch": 2.58, + "grad_norm": 41.453243255615234, + "learning_rate": 2.8144732885636966e-06, + "loss": 1.4408, + "step": 8574 + }, + { + "epoch": 2.58, + "grad_norm": 36.32587814331055, + "learning_rate": 2.8124686779593067e-06, + "loss": 0.8383, + "step": 8575 + }, + { + "epoch": 2.58, + "grad_norm": 19.15566635131836, + "learning_rate": 2.8104640673549168e-06, + "loss": 1.0883, + "step": 8576 + }, + { + "epoch": 2.58, + "grad_norm": 18.432010650634766, + "learning_rate": 2.8084594567505264e-06, + "loss": 1.314, + "step": 8577 + }, + { + "epoch": 2.58, + "grad_norm": 51.9474983215332, + "learning_rate": 2.8064548461461365e-06, + "loss": 1.7958, + "step": 8578 + }, + { + "epoch": 2.58, + "grad_norm": 21.26700210571289, + "learning_rate": 2.804450235541746e-06, + "loss": 1.878, + "step": 8579 + }, + { + "epoch": 2.58, + "grad_norm": 23.973962783813477, + "learning_rate": 2.8024456249373563e-06, + "loss": 1.2876, + "step": 8580 + }, + { + "epoch": 2.58, + "grad_norm": 11.906970024108887, + "learning_rate": 2.800441014332966e-06, + "loss": 0.5872, + "step": 8581 + }, + { + "epoch": 2.58, + "grad_norm": 14.538961410522461, + "learning_rate": 2.798436403728576e-06, + "loss": 0.9155, + "step": 8582 + }, + { + "epoch": 2.58, + "grad_norm": 19.3804988861084, + "learning_rate": 2.7964317931241857e-06, + "loss": 1.2437, + "step": 8583 + }, + { + "epoch": 2.58, + "grad_norm": 21.384546279907227, + "learning_rate": 2.7944271825197953e-06, + "loss": 0.8738, + "step": 8584 + }, + { + "epoch": 2.58, + "grad_norm": 5.975277423858643, + "learning_rate": 2.792422571915406e-06, + "loss": 0.4514, + "step": 8585 + }, + { + "epoch": 2.58, + "grad_norm": 13.506431579589844, + "learning_rate": 2.7904179613110155e-06, + "loss": 1.3215, + "step": 8586 + }, + { + "epoch": 2.58, + "grad_norm": 16.181753158569336, + "learning_rate": 2.788413350706625e-06, + "loss": 1.9854, + "step": 8587 + }, + { + "epoch": 2.58, + "grad_norm": 23.374975204467773, + "learning_rate": 2.7864087401022357e-06, + "loss": 1.9213, + "step": 8588 + }, + { + "epoch": 2.58, + "grad_norm": 29.04400634765625, + "learning_rate": 2.7844041294978453e-06, + "loss": 1.8084, + "step": 8589 + }, + { + "epoch": 2.58, + "grad_norm": 9.371678352355957, + "learning_rate": 2.782399518893455e-06, + "loss": 1.0408, + "step": 8590 + }, + { + "epoch": 2.58, + "grad_norm": 11.659550666809082, + "learning_rate": 2.780394908289065e-06, + "loss": 1.3009, + "step": 8591 + }, + { + "epoch": 2.58, + "grad_norm": 27.285043716430664, + "learning_rate": 2.7783902976846747e-06, + "loss": 1.9198, + "step": 8592 + }, + { + "epoch": 2.58, + "grad_norm": 30.421232223510742, + "learning_rate": 2.776385687080285e-06, + "loss": 1.3893, + "step": 8593 + }, + { + "epoch": 2.58, + "grad_norm": 14.901866912841797, + "learning_rate": 2.774381076475895e-06, + "loss": 1.0787, + "step": 8594 + }, + { + "epoch": 2.58, + "grad_norm": 10.58189868927002, + "learning_rate": 2.7723764658715046e-06, + "loss": 1.3617, + "step": 8595 + }, + { + "epoch": 2.58, + "grad_norm": 83.2371826171875, + "learning_rate": 2.7703718552671142e-06, + "loss": 1.7557, + "step": 8596 + }, + { + "epoch": 2.58, + "grad_norm": 23.80765724182129, + "learning_rate": 2.7683672446627247e-06, + "loss": 2.3199, + "step": 8597 + }, + { + "epoch": 2.59, + "grad_norm": 15.422316551208496, + "learning_rate": 2.7663626340583344e-06, + "loss": 1.1275, + "step": 8598 + }, + { + "epoch": 2.59, + "grad_norm": 20.54278564453125, + "learning_rate": 2.764358023453944e-06, + "loss": 1.7601, + "step": 8599 + }, + { + "epoch": 2.59, + "grad_norm": 14.661285400390625, + "learning_rate": 2.7623534128495546e-06, + "loss": 1.0741, + "step": 8600 + }, + { + "epoch": 2.59, + "grad_norm": 33.70438766479492, + "learning_rate": 2.7603488022451642e-06, + "loss": 1.0387, + "step": 8601 + }, + { + "epoch": 2.59, + "grad_norm": 15.016576766967773, + "learning_rate": 2.758344191640774e-06, + "loss": 1.7023, + "step": 8602 + }, + { + "epoch": 2.59, + "grad_norm": 7.491636753082275, + "learning_rate": 2.756339581036384e-06, + "loss": 1.374, + "step": 8603 + }, + { + "epoch": 2.59, + "grad_norm": 13.837325096130371, + "learning_rate": 2.7543349704319936e-06, + "loss": 0.8613, + "step": 8604 + }, + { + "epoch": 2.59, + "grad_norm": 17.60556983947754, + "learning_rate": 2.7523303598276037e-06, + "loss": 1.1599, + "step": 8605 + }, + { + "epoch": 2.59, + "grad_norm": 13.817652702331543, + "learning_rate": 2.750325749223214e-06, + "loss": 1.1847, + "step": 8606 + }, + { + "epoch": 2.59, + "grad_norm": 39.440921783447266, + "learning_rate": 2.7483211386188235e-06, + "loss": 1.5714, + "step": 8607 + }, + { + "epoch": 2.59, + "grad_norm": 13.094484329223633, + "learning_rate": 2.746316528014433e-06, + "loss": 1.4254, + "step": 8608 + }, + { + "epoch": 2.59, + "grad_norm": 12.363558769226074, + "learning_rate": 2.7443119174100436e-06, + "loss": 1.4503, + "step": 8609 + }, + { + "epoch": 2.59, + "grad_norm": 81.3821792602539, + "learning_rate": 2.7423073068056533e-06, + "loss": 1.6868, + "step": 8610 + }, + { + "epoch": 2.59, + "grad_norm": 28.759248733520508, + "learning_rate": 2.740302696201263e-06, + "loss": 1.2274, + "step": 8611 + }, + { + "epoch": 2.59, + "grad_norm": 22.243206024169922, + "learning_rate": 2.7382980855968735e-06, + "loss": 1.5054, + "step": 8612 + }, + { + "epoch": 2.59, + "grad_norm": 16.99969482421875, + "learning_rate": 2.736293474992483e-06, + "loss": 1.3755, + "step": 8613 + }, + { + "epoch": 2.59, + "grad_norm": 8.332833290100098, + "learning_rate": 2.7342888643880928e-06, + "loss": 1.0886, + "step": 8614 + }, + { + "epoch": 2.59, + "grad_norm": 17.875459671020508, + "learning_rate": 2.732284253783703e-06, + "loss": 1.2963, + "step": 8615 + }, + { + "epoch": 2.59, + "grad_norm": 17.195663452148438, + "learning_rate": 2.7302796431793125e-06, + "loss": 0.9124, + "step": 8616 + }, + { + "epoch": 2.59, + "grad_norm": 45.631954193115234, + "learning_rate": 2.7282750325749226e-06, + "loss": 2.5502, + "step": 8617 + }, + { + "epoch": 2.59, + "grad_norm": 72.2505874633789, + "learning_rate": 2.7262704219705327e-06, + "loss": 1.9153, + "step": 8618 + }, + { + "epoch": 2.59, + "grad_norm": 19.540000915527344, + "learning_rate": 2.7242658113661424e-06, + "loss": 1.3127, + "step": 8619 + }, + { + "epoch": 2.59, + "grad_norm": 16.321380615234375, + "learning_rate": 2.722261200761752e-06, + "loss": 1.1336, + "step": 8620 + }, + { + "epoch": 2.59, + "grad_norm": 35.40462875366211, + "learning_rate": 2.7202565901573625e-06, + "loss": 1.4144, + "step": 8621 + }, + { + "epoch": 2.59, + "grad_norm": 18.681198120117188, + "learning_rate": 2.718251979552972e-06, + "loss": 0.8571, + "step": 8622 + }, + { + "epoch": 2.59, + "grad_norm": 37.2798957824707, + "learning_rate": 2.716247368948582e-06, + "loss": 1.1746, + "step": 8623 + }, + { + "epoch": 2.59, + "grad_norm": 17.78194808959961, + "learning_rate": 2.7142427583441915e-06, + "loss": 1.0923, + "step": 8624 + }, + { + "epoch": 2.59, + "grad_norm": 18.684160232543945, + "learning_rate": 2.712238147739802e-06, + "loss": 1.3139, + "step": 8625 + }, + { + "epoch": 2.59, + "grad_norm": 25.179645538330078, + "learning_rate": 2.7102335371354117e-06, + "loss": 1.5291, + "step": 8626 + }, + { + "epoch": 2.59, + "grad_norm": 9.155866622924805, + "learning_rate": 2.7082289265310213e-06, + "loss": 0.6078, + "step": 8627 + }, + { + "epoch": 2.59, + "grad_norm": 25.500167846679688, + "learning_rate": 2.7062243159266314e-06, + "loss": 1.8845, + "step": 8628 + }, + { + "epoch": 2.59, + "grad_norm": 14.735610008239746, + "learning_rate": 2.7042197053222415e-06, + "loss": 1.3812, + "step": 8629 + }, + { + "epoch": 2.59, + "grad_norm": 13.087080001831055, + "learning_rate": 2.702215094717851e-06, + "loss": 2.3241, + "step": 8630 + }, + { + "epoch": 2.6, + "grad_norm": 15.85035228729248, + "learning_rate": 2.7002104841134612e-06, + "loss": 2.1271, + "step": 8631 + }, + { + "epoch": 2.6, + "grad_norm": 16.836761474609375, + "learning_rate": 2.698205873509071e-06, + "loss": 0.9277, + "step": 8632 + }, + { + "epoch": 2.6, + "grad_norm": 17.871294021606445, + "learning_rate": 2.6962012629046806e-06, + "loss": 1.3209, + "step": 8633 + }, + { + "epoch": 2.6, + "grad_norm": 10.736555099487305, + "learning_rate": 2.694196652300291e-06, + "loss": 0.9247, + "step": 8634 + }, + { + "epoch": 2.6, + "grad_norm": 4.654946804046631, + "learning_rate": 2.6921920416959007e-06, + "loss": 0.6192, + "step": 8635 + }, + { + "epoch": 2.6, + "grad_norm": 62.396331787109375, + "learning_rate": 2.6901874310915104e-06, + "loss": 2.6783, + "step": 8636 + }, + { + "epoch": 2.6, + "grad_norm": 37.43707275390625, + "learning_rate": 2.688182820487121e-06, + "loss": 0.6253, + "step": 8637 + }, + { + "epoch": 2.6, + "grad_norm": 10.426443099975586, + "learning_rate": 2.6861782098827306e-06, + "loss": 1.1717, + "step": 8638 + }, + { + "epoch": 2.6, + "grad_norm": 77.32369995117188, + "learning_rate": 2.6841735992783402e-06, + "loss": 2.797, + "step": 8639 + }, + { + "epoch": 2.6, + "grad_norm": 18.485462188720703, + "learning_rate": 2.6821689886739503e-06, + "loss": 0.9191, + "step": 8640 + }, + { + "epoch": 2.6, + "eval_loss": 0.17308995127677917, + "eval_runtime": 43.9129, + "eval_samples_per_second": 33.68, + "eval_steps_per_second": 33.68, + "step": 8640 + }, + { + "epoch": 2.6, + "grad_norm": 10.55259895324707, + "learning_rate": 2.6801643780695604e-06, + "loss": 1.4176, + "step": 8641 + }, + { + "epoch": 2.6, + "grad_norm": 14.04641056060791, + "learning_rate": 2.67815976746517e-06, + "loss": 1.3492, + "step": 8642 + }, + { + "epoch": 2.6, + "grad_norm": 11.673070907592773, + "learning_rate": 2.67615515686078e-06, + "loss": 1.3232, + "step": 8643 + }, + { + "epoch": 2.6, + "grad_norm": 22.69013023376465, + "learning_rate": 2.67415054625639e-06, + "loss": 1.535, + "step": 8644 + }, + { + "epoch": 2.6, + "grad_norm": 31.60176658630371, + "learning_rate": 2.6721459356519995e-06, + "loss": 2.397, + "step": 8645 + }, + { + "epoch": 2.6, + "grad_norm": 72.22872161865234, + "learning_rate": 2.67014132504761e-06, + "loss": 3.1373, + "step": 8646 + }, + { + "epoch": 2.6, + "grad_norm": 12.671222686767578, + "learning_rate": 2.6681367144432196e-06, + "loss": 0.9173, + "step": 8647 + }, + { + "epoch": 2.6, + "grad_norm": 24.99388885498047, + "learning_rate": 2.6661321038388293e-06, + "loss": 1.3955, + "step": 8648 + }, + { + "epoch": 2.6, + "grad_norm": 28.980268478393555, + "learning_rate": 2.66412749323444e-06, + "loss": 1.3139, + "step": 8649 + }, + { + "epoch": 2.6, + "grad_norm": 14.720972061157227, + "learning_rate": 2.6621228826300495e-06, + "loss": 0.8373, + "step": 8650 + }, + { + "epoch": 2.6, + "grad_norm": 63.49948501586914, + "learning_rate": 2.660118272025659e-06, + "loss": 1.9721, + "step": 8651 + }, + { + "epoch": 2.6, + "grad_norm": 11.087202072143555, + "learning_rate": 2.658113661421269e-06, + "loss": 0.9022, + "step": 8652 + }, + { + "epoch": 2.6, + "grad_norm": 21.9641170501709, + "learning_rate": 2.6561090508168793e-06, + "loss": 1.9667, + "step": 8653 + }, + { + "epoch": 2.6, + "grad_norm": 16.303483963012695, + "learning_rate": 2.654104440212489e-06, + "loss": 1.1034, + "step": 8654 + }, + { + "epoch": 2.6, + "grad_norm": 15.40654468536377, + "learning_rate": 2.652099829608099e-06, + "loss": 0.9308, + "step": 8655 + }, + { + "epoch": 2.6, + "grad_norm": 37.456077575683594, + "learning_rate": 2.6500952190037087e-06, + "loss": 1.5839, + "step": 8656 + }, + { + "epoch": 2.6, + "grad_norm": 189.6568145751953, + "learning_rate": 2.6480906083993184e-06, + "loss": 1.1821, + "step": 8657 + }, + { + "epoch": 2.6, + "grad_norm": 14.535365104675293, + "learning_rate": 2.646085997794929e-06, + "loss": 1.221, + "step": 8658 + }, + { + "epoch": 2.6, + "grad_norm": 15.61431884765625, + "learning_rate": 2.6440813871905385e-06, + "loss": 0.8718, + "step": 8659 + }, + { + "epoch": 2.6, + "grad_norm": 16.955713272094727, + "learning_rate": 2.642076776586148e-06, + "loss": 1.0111, + "step": 8660 + }, + { + "epoch": 2.6, + "grad_norm": 17.068771362304688, + "learning_rate": 2.6400721659817587e-06, + "loss": 1.5695, + "step": 8661 + }, + { + "epoch": 2.6, + "grad_norm": 10.242925643920898, + "learning_rate": 2.6380675553773684e-06, + "loss": 1.4754, + "step": 8662 + }, + { + "epoch": 2.6, + "grad_norm": 30.505434036254883, + "learning_rate": 2.636062944772978e-06, + "loss": 1.1623, + "step": 8663 + }, + { + "epoch": 2.6, + "grad_norm": 44.106178283691406, + "learning_rate": 2.6340583341685877e-06, + "loss": 2.1524, + "step": 8664 + }, + { + "epoch": 2.61, + "grad_norm": 15.738425254821777, + "learning_rate": 2.6320537235641978e-06, + "loss": 1.3509, + "step": 8665 + }, + { + "epoch": 2.61, + "grad_norm": 37.473567962646484, + "learning_rate": 2.630049112959808e-06, + "loss": 1.1996, + "step": 8666 + }, + { + "epoch": 2.61, + "grad_norm": 20.072607040405273, + "learning_rate": 2.6280445023554175e-06, + "loss": 1.1615, + "step": 8667 + }, + { + "epoch": 2.61, + "grad_norm": 14.636398315429688, + "learning_rate": 2.6260398917510276e-06, + "loss": 0.863, + "step": 8668 + }, + { + "epoch": 2.61, + "grad_norm": 9.129220008850098, + "learning_rate": 2.6240352811466373e-06, + "loss": 1.157, + "step": 8669 + }, + { + "epoch": 2.61, + "grad_norm": 19.694377899169922, + "learning_rate": 2.6220306705422473e-06, + "loss": 0.8906, + "step": 8670 + }, + { + "epoch": 2.61, + "grad_norm": 13.88318157196045, + "learning_rate": 2.6200260599378574e-06, + "loss": 1.343, + "step": 8671 + }, + { + "epoch": 2.61, + "grad_norm": 11.803475379943848, + "learning_rate": 2.618021449333467e-06, + "loss": 1.4005, + "step": 8672 + }, + { + "epoch": 2.61, + "grad_norm": 18.531572341918945, + "learning_rate": 2.6160168387290767e-06, + "loss": 1.3321, + "step": 8673 + }, + { + "epoch": 2.61, + "grad_norm": 6.691076278686523, + "learning_rate": 2.6140122281246872e-06, + "loss": 0.7632, + "step": 8674 + }, + { + "epoch": 2.61, + "grad_norm": 8.980220794677734, + "learning_rate": 2.612007617520297e-06, + "loss": 0.7735, + "step": 8675 + }, + { + "epoch": 2.61, + "grad_norm": 8.376172065734863, + "learning_rate": 2.6100030069159066e-06, + "loss": 1.0655, + "step": 8676 + }, + { + "epoch": 2.61, + "grad_norm": 10.30313491821289, + "learning_rate": 2.6079983963115167e-06, + "loss": 1.6599, + "step": 8677 + }, + { + "epoch": 2.61, + "grad_norm": 23.8471622467041, + "learning_rate": 2.6059937857071267e-06, + "loss": 0.579, + "step": 8678 + }, + { + "epoch": 2.61, + "grad_norm": 25.849637985229492, + "learning_rate": 2.6039891751027364e-06, + "loss": 1.8009, + "step": 8679 + }, + { + "epoch": 2.61, + "grad_norm": 26.335845947265625, + "learning_rate": 2.6019845644983465e-06, + "loss": 1.6794, + "step": 8680 + }, + { + "epoch": 2.61, + "grad_norm": 21.52005386352539, + "learning_rate": 2.599979953893956e-06, + "loss": 1.3504, + "step": 8681 + }, + { + "epoch": 2.61, + "grad_norm": 24.20670509338379, + "learning_rate": 2.5979753432895662e-06, + "loss": 0.9255, + "step": 8682 + }, + { + "epoch": 2.61, + "grad_norm": 105.50646209716797, + "learning_rate": 2.5959707326851763e-06, + "loss": 2.7274, + "step": 8683 + }, + { + "epoch": 2.61, + "grad_norm": 28.476221084594727, + "learning_rate": 2.593966122080786e-06, + "loss": 2.7293, + "step": 8684 + }, + { + "epoch": 2.61, + "grad_norm": 23.278079986572266, + "learning_rate": 2.5919615114763956e-06, + "loss": 2.1648, + "step": 8685 + }, + { + "epoch": 2.61, + "grad_norm": 25.680274963378906, + "learning_rate": 2.589956900872006e-06, + "loss": 1.5944, + "step": 8686 + }, + { + "epoch": 2.61, + "grad_norm": 16.657413482666016, + "learning_rate": 2.587952290267616e-06, + "loss": 0.8337, + "step": 8687 + }, + { + "epoch": 2.61, + "grad_norm": 24.16915512084961, + "learning_rate": 2.5859476796632255e-06, + "loss": 1.168, + "step": 8688 + }, + { + "epoch": 2.61, + "grad_norm": 13.357244491577148, + "learning_rate": 2.5839430690588355e-06, + "loss": 0.9006, + "step": 8689 + }, + { + "epoch": 2.61, + "grad_norm": 17.03959083557129, + "learning_rate": 2.5819384584544456e-06, + "loss": 1.3754, + "step": 8690 + }, + { + "epoch": 2.61, + "grad_norm": 14.713468551635742, + "learning_rate": 2.5799338478500553e-06, + "loss": 1.3571, + "step": 8691 + }, + { + "epoch": 2.61, + "grad_norm": 48.19596481323242, + "learning_rate": 2.5779292372456654e-06, + "loss": 1.8712, + "step": 8692 + }, + { + "epoch": 2.61, + "grad_norm": 42.44739532470703, + "learning_rate": 2.575924626641275e-06, + "loss": 1.2746, + "step": 8693 + }, + { + "epoch": 2.61, + "grad_norm": 35.6419792175293, + "learning_rate": 2.5739200160368847e-06, + "loss": 1.5702, + "step": 8694 + }, + { + "epoch": 2.61, + "grad_norm": 19.48603630065918, + "learning_rate": 2.571915405432495e-06, + "loss": 1.9149, + "step": 8695 + }, + { + "epoch": 2.61, + "grad_norm": 15.384188652038574, + "learning_rate": 2.569910794828105e-06, + "loss": 1.4403, + "step": 8696 + }, + { + "epoch": 2.61, + "grad_norm": 18.034732818603516, + "learning_rate": 2.5679061842237145e-06, + "loss": 1.2765, + "step": 8697 + }, + { + "epoch": 2.62, + "grad_norm": 12.634345054626465, + "learning_rate": 2.565901573619325e-06, + "loss": 1.0044, + "step": 8698 + }, + { + "epoch": 2.62, + "grad_norm": 12.71899127960205, + "learning_rate": 2.5638969630149347e-06, + "loss": 1.3239, + "step": 8699 + }, + { + "epoch": 2.62, + "grad_norm": 597.7841186523438, + "learning_rate": 2.5618923524105444e-06, + "loss": 1.6701, + "step": 8700 + }, + { + "epoch": 2.62, + "grad_norm": 16.543411254882812, + "learning_rate": 2.5598877418061544e-06, + "loss": 1.4786, + "step": 8701 + }, + { + "epoch": 2.62, + "grad_norm": 48.39625930786133, + "learning_rate": 2.5578831312017645e-06, + "loss": 1.538, + "step": 8702 + }, + { + "epoch": 2.62, + "grad_norm": 10.015215873718262, + "learning_rate": 2.555878520597374e-06, + "loss": 1.4655, + "step": 8703 + }, + { + "epoch": 2.62, + "grad_norm": 18.111173629760742, + "learning_rate": 2.5538739099929843e-06, + "loss": 1.539, + "step": 8704 + }, + { + "epoch": 2.62, + "grad_norm": 34.77145767211914, + "learning_rate": 2.551869299388594e-06, + "loss": 1.5545, + "step": 8705 + }, + { + "epoch": 2.62, + "grad_norm": 17.934062957763672, + "learning_rate": 2.5498646887842036e-06, + "loss": 1.3262, + "step": 8706 + }, + { + "epoch": 2.62, + "grad_norm": 14.223177909851074, + "learning_rate": 2.5478600781798137e-06, + "loss": 2.0243, + "step": 8707 + }, + { + "epoch": 2.62, + "grad_norm": 21.776586532592773, + "learning_rate": 2.5458554675754238e-06, + "loss": 1.3405, + "step": 8708 + }, + { + "epoch": 2.62, + "grad_norm": 10.819275856018066, + "learning_rate": 2.5438508569710334e-06, + "loss": 1.488, + "step": 8709 + }, + { + "epoch": 2.62, + "grad_norm": 12.452916145324707, + "learning_rate": 2.541846246366643e-06, + "loss": 1.3601, + "step": 8710 + }, + { + "epoch": 2.62, + "grad_norm": 13.645332336425781, + "learning_rate": 2.5398416357622536e-06, + "loss": 1.2305, + "step": 8711 + }, + { + "epoch": 2.62, + "grad_norm": 157.49203491210938, + "learning_rate": 2.5378370251578633e-06, + "loss": 1.5407, + "step": 8712 + }, + { + "epoch": 2.62, + "grad_norm": 15.182050704956055, + "learning_rate": 2.535832414553473e-06, + "loss": 0.874, + "step": 8713 + }, + { + "epoch": 2.62, + "grad_norm": 16.068603515625, + "learning_rate": 2.5338278039490834e-06, + "loss": 1.3392, + "step": 8714 + }, + { + "epoch": 2.62, + "grad_norm": 11.687101364135742, + "learning_rate": 2.531823193344693e-06, + "loss": 0.9513, + "step": 8715 + }, + { + "epoch": 2.62, + "grad_norm": 16.209623336791992, + "learning_rate": 2.5298185827403027e-06, + "loss": 0.8897, + "step": 8716 + }, + { + "epoch": 2.62, + "grad_norm": 42.321136474609375, + "learning_rate": 2.527813972135913e-06, + "loss": 2.2684, + "step": 8717 + }, + { + "epoch": 2.62, + "grad_norm": 19.06346321105957, + "learning_rate": 2.5258093615315225e-06, + "loss": 1.3514, + "step": 8718 + }, + { + "epoch": 2.62, + "grad_norm": 10.905227661132812, + "learning_rate": 2.5238047509271326e-06, + "loss": 0.6565, + "step": 8719 + }, + { + "epoch": 2.62, + "grad_norm": 17.082691192626953, + "learning_rate": 2.5218001403227427e-06, + "loss": 2.4891, + "step": 8720 + }, + { + "epoch": 2.62, + "grad_norm": 22.291873931884766, + "learning_rate": 2.5197955297183523e-06, + "loss": 2.0741, + "step": 8721 + }, + { + "epoch": 2.62, + "grad_norm": 9.077491760253906, + "learning_rate": 2.517790919113962e-06, + "loss": 1.115, + "step": 8722 + }, + { + "epoch": 2.62, + "grad_norm": 10.916313171386719, + "learning_rate": 2.5157863085095725e-06, + "loss": 1.4171, + "step": 8723 + }, + { + "epoch": 2.62, + "grad_norm": 54.628211975097656, + "learning_rate": 2.513781697905182e-06, + "loss": 0.9545, + "step": 8724 + }, + { + "epoch": 2.62, + "grad_norm": 11.69243049621582, + "learning_rate": 2.511777087300792e-06, + "loss": 0.7757, + "step": 8725 + }, + { + "epoch": 2.62, + "grad_norm": 23.144771575927734, + "learning_rate": 2.5097724766964023e-06, + "loss": 1.4352, + "step": 8726 + }, + { + "epoch": 2.62, + "grad_norm": 23.521812438964844, + "learning_rate": 2.507767866092012e-06, + "loss": 1.276, + "step": 8727 + }, + { + "epoch": 2.62, + "grad_norm": 29.313697814941406, + "learning_rate": 2.5057632554876216e-06, + "loss": 1.4241, + "step": 8728 + }, + { + "epoch": 2.62, + "grad_norm": 5.5264716148376465, + "learning_rate": 2.5037586448832317e-06, + "loss": 0.6715, + "step": 8729 + }, + { + "epoch": 2.62, + "grad_norm": 107.75334167480469, + "learning_rate": 2.5017540342788414e-06, + "loss": 2.3175, + "step": 8730 + }, + { + "epoch": 2.63, + "grad_norm": 16.5670108795166, + "learning_rate": 2.4997494236744515e-06, + "loss": 1.7357, + "step": 8731 + }, + { + "epoch": 2.63, + "grad_norm": 25.61155128479004, + "learning_rate": 2.497744813070061e-06, + "loss": 1.3919, + "step": 8732 + }, + { + "epoch": 2.63, + "grad_norm": 12.228818893432617, + "learning_rate": 2.495740202465671e-06, + "loss": 2.4026, + "step": 8733 + }, + { + "epoch": 2.63, + "grad_norm": 16.512880325317383, + "learning_rate": 2.4937355918612813e-06, + "loss": 1.9608, + "step": 8734 + }, + { + "epoch": 2.63, + "grad_norm": 34.81612014770508, + "learning_rate": 2.491730981256891e-06, + "loss": 1.0885, + "step": 8735 + }, + { + "epoch": 2.63, + "grad_norm": 31.149940490722656, + "learning_rate": 2.489726370652501e-06, + "loss": 1.0149, + "step": 8736 + }, + { + "epoch": 2.63, + "grad_norm": 13.102104187011719, + "learning_rate": 2.4877217600481107e-06, + "loss": 1.5993, + "step": 8737 + }, + { + "epoch": 2.63, + "grad_norm": 18.59217071533203, + "learning_rate": 2.4857171494437208e-06, + "loss": 1.3812, + "step": 8738 + }, + { + "epoch": 2.63, + "grad_norm": 15.023920059204102, + "learning_rate": 2.483712538839331e-06, + "loss": 1.0019, + "step": 8739 + }, + { + "epoch": 2.63, + "grad_norm": 15.331181526184082, + "learning_rate": 2.4817079282349405e-06, + "loss": 1.5826, + "step": 8740 + }, + { + "epoch": 2.63, + "grad_norm": 32.79924774169922, + "learning_rate": 2.4797033176305506e-06, + "loss": 2.171, + "step": 8741 + }, + { + "epoch": 2.63, + "grad_norm": 11.717575073242188, + "learning_rate": 2.4776987070261603e-06, + "loss": 0.7835, + "step": 8742 + }, + { + "epoch": 2.63, + "grad_norm": 19.555763244628906, + "learning_rate": 2.4756940964217704e-06, + "loss": 2.7394, + "step": 8743 + }, + { + "epoch": 2.63, + "grad_norm": 37.40543746948242, + "learning_rate": 2.47368948581738e-06, + "loss": 2.1193, + "step": 8744 + }, + { + "epoch": 2.63, + "grad_norm": 18.472028732299805, + "learning_rate": 2.47168487521299e-06, + "loss": 1.0426, + "step": 8745 + }, + { + "epoch": 2.63, + "grad_norm": 15.350752830505371, + "learning_rate": 2.4696802646085998e-06, + "loss": 1.0137, + "step": 8746 + }, + { + "epoch": 2.63, + "grad_norm": 12.595955848693848, + "learning_rate": 2.46767565400421e-06, + "loss": 1.6324, + "step": 8747 + }, + { + "epoch": 2.63, + "grad_norm": 29.87061309814453, + "learning_rate": 2.46567104339982e-06, + "loss": 1.5554, + "step": 8748 + }, + { + "epoch": 2.63, + "grad_norm": 17.269176483154297, + "learning_rate": 2.4636664327954296e-06, + "loss": 1.6175, + "step": 8749 + }, + { + "epoch": 2.63, + "grad_norm": 16.87700653076172, + "learning_rate": 2.4616618221910397e-06, + "loss": 1.0128, + "step": 8750 + }, + { + "epoch": 2.63, + "grad_norm": 10.00948429107666, + "learning_rate": 2.4596572115866498e-06, + "loss": 0.9691, + "step": 8751 + }, + { + "epoch": 2.63, + "grad_norm": 21.35502815246582, + "learning_rate": 2.4576526009822594e-06, + "loss": 1.3124, + "step": 8752 + }, + { + "epoch": 2.63, + "grad_norm": 11.689464569091797, + "learning_rate": 2.4556479903778695e-06, + "loss": 1.3952, + "step": 8753 + }, + { + "epoch": 2.63, + "grad_norm": 141.92120361328125, + "learning_rate": 2.453643379773479e-06, + "loss": 2.629, + "step": 8754 + }, + { + "epoch": 2.63, + "grad_norm": 26.330507278442383, + "learning_rate": 2.4516387691690893e-06, + "loss": 3.4614, + "step": 8755 + }, + { + "epoch": 2.63, + "grad_norm": 10.556838035583496, + "learning_rate": 2.449634158564699e-06, + "loss": 1.7251, + "step": 8756 + }, + { + "epoch": 2.63, + "grad_norm": 45.57450485229492, + "learning_rate": 2.4476295479603086e-06, + "loss": 1.7645, + "step": 8757 + }, + { + "epoch": 2.63, + "grad_norm": 11.817948341369629, + "learning_rate": 2.4456249373559187e-06, + "loss": 1.0033, + "step": 8758 + }, + { + "epoch": 2.63, + "grad_norm": 7.083280563354492, + "learning_rate": 2.4436203267515287e-06, + "loss": 0.7292, + "step": 8759 + }, + { + "epoch": 2.63, + "grad_norm": 18.485204696655273, + "learning_rate": 2.4416157161471384e-06, + "loss": 1.5034, + "step": 8760 + }, + { + "epoch": 2.63, + "eval_loss": 0.163481205701828, + "eval_runtime": 43.3055, + "eval_samples_per_second": 34.153, + "eval_steps_per_second": 34.153, + "step": 8760 + }, + { + "epoch": 2.63, + "grad_norm": 21.252248764038086, + "learning_rate": 2.4396111055427485e-06, + "loss": 1.1208, + "step": 8761 + }, + { + "epoch": 2.63, + "grad_norm": 16.638259887695312, + "learning_rate": 2.4376064949383586e-06, + "loss": 1.8795, + "step": 8762 + }, + { + "epoch": 2.63, + "grad_norm": 20.396753311157227, + "learning_rate": 2.4356018843339682e-06, + "loss": 1.5712, + "step": 8763 + }, + { + "epoch": 2.63, + "grad_norm": 7.8299455642700195, + "learning_rate": 2.4335972737295783e-06, + "loss": 0.7808, + "step": 8764 + }, + { + "epoch": 2.64, + "grad_norm": 11.673842430114746, + "learning_rate": 2.4315926631251884e-06, + "loss": 0.8884, + "step": 8765 + }, + { + "epoch": 2.64, + "grad_norm": 40.24734115600586, + "learning_rate": 2.429588052520798e-06, + "loss": 2.0446, + "step": 8766 + }, + { + "epoch": 2.64, + "grad_norm": 29.379608154296875, + "learning_rate": 2.4275834419164077e-06, + "loss": 1.6934, + "step": 8767 + }, + { + "epoch": 2.64, + "grad_norm": 66.81684112548828, + "learning_rate": 2.425578831312018e-06, + "loss": 1.426, + "step": 8768 + }, + { + "epoch": 2.64, + "grad_norm": 81.16704559326172, + "learning_rate": 2.4235742207076275e-06, + "loss": 1.3078, + "step": 8769 + }, + { + "epoch": 2.64, + "grad_norm": 13.74372386932373, + "learning_rate": 2.4215696101032376e-06, + "loss": 1.543, + "step": 8770 + }, + { + "epoch": 2.64, + "grad_norm": 9.779702186584473, + "learning_rate": 2.4195649994988476e-06, + "loss": 1.0268, + "step": 8771 + }, + { + "epoch": 2.64, + "grad_norm": 14.754984855651855, + "learning_rate": 2.4175603888944573e-06, + "loss": 1.7506, + "step": 8772 + }, + { + "epoch": 2.64, + "grad_norm": 29.609939575195312, + "learning_rate": 2.4155557782900674e-06, + "loss": 2.8884, + "step": 8773 + }, + { + "epoch": 2.64, + "grad_norm": 100.57302856445312, + "learning_rate": 2.4135511676856775e-06, + "loss": 1.1484, + "step": 8774 + }, + { + "epoch": 2.64, + "grad_norm": 29.211971282958984, + "learning_rate": 2.411546557081287e-06, + "loss": 2.1784, + "step": 8775 + }, + { + "epoch": 2.64, + "grad_norm": 31.128265380859375, + "learning_rate": 2.409541946476897e-06, + "loss": 1.6176, + "step": 8776 + }, + { + "epoch": 2.64, + "grad_norm": 16.750701904296875, + "learning_rate": 2.4075373358725073e-06, + "loss": 0.8013, + "step": 8777 + }, + { + "epoch": 2.64, + "grad_norm": 65.17308044433594, + "learning_rate": 2.405532725268117e-06, + "loss": 1.9283, + "step": 8778 + }, + { + "epoch": 2.64, + "grad_norm": 18.673601150512695, + "learning_rate": 2.4035281146637266e-06, + "loss": 1.3132, + "step": 8779 + }, + { + "epoch": 2.64, + "grad_norm": 12.052127838134766, + "learning_rate": 2.4015235040593367e-06, + "loss": 0.8893, + "step": 8780 + }, + { + "epoch": 2.64, + "grad_norm": 33.23358917236328, + "learning_rate": 2.3995188934549464e-06, + "loss": 1.5921, + "step": 8781 + }, + { + "epoch": 2.64, + "grad_norm": 32.95631408691406, + "learning_rate": 2.3975142828505564e-06, + "loss": 1.5091, + "step": 8782 + }, + { + "epoch": 2.64, + "grad_norm": 21.939970016479492, + "learning_rate": 2.395509672246166e-06, + "loss": 1.1536, + "step": 8783 + }, + { + "epoch": 2.64, + "grad_norm": 19.46248435974121, + "learning_rate": 2.393505061641776e-06, + "loss": 1.7254, + "step": 8784 + }, + { + "epoch": 2.64, + "grad_norm": 20.779407501220703, + "learning_rate": 2.3915004510373863e-06, + "loss": 0.6734, + "step": 8785 + }, + { + "epoch": 2.64, + "grad_norm": 18.915145874023438, + "learning_rate": 2.389495840432996e-06, + "loss": 1.4362, + "step": 8786 + }, + { + "epoch": 2.64, + "grad_norm": 15.002533912658691, + "learning_rate": 2.387491229828606e-06, + "loss": 1.3399, + "step": 8787 + }, + { + "epoch": 2.64, + "grad_norm": 205.16641235351562, + "learning_rate": 2.385486619224216e-06, + "loss": 1.7637, + "step": 8788 + }, + { + "epoch": 2.64, + "grad_norm": 16.24716567993164, + "learning_rate": 2.3834820086198258e-06, + "loss": 2.3097, + "step": 8789 + }, + { + "epoch": 2.64, + "grad_norm": 27.199005126953125, + "learning_rate": 2.381477398015436e-06, + "loss": 2.0923, + "step": 8790 + }, + { + "epoch": 2.64, + "grad_norm": 7.843896865844727, + "learning_rate": 2.3794727874110455e-06, + "loss": 0.556, + "step": 8791 + }, + { + "epoch": 2.64, + "grad_norm": 15.323102951049805, + "learning_rate": 2.3774681768066556e-06, + "loss": 0.9552, + "step": 8792 + }, + { + "epoch": 2.64, + "grad_norm": 14.061708450317383, + "learning_rate": 2.3754635662022653e-06, + "loss": 1.4372, + "step": 8793 + }, + { + "epoch": 2.64, + "grad_norm": 12.933516502380371, + "learning_rate": 2.3734589555978753e-06, + "loss": 1.2338, + "step": 8794 + }, + { + "epoch": 2.64, + "grad_norm": 7.49314546585083, + "learning_rate": 2.371454344993485e-06, + "loss": 0.731, + "step": 8795 + }, + { + "epoch": 2.64, + "grad_norm": 15.913431167602539, + "learning_rate": 2.369449734389095e-06, + "loss": 1.2261, + "step": 8796 + }, + { + "epoch": 2.64, + "grad_norm": 12.59531307220459, + "learning_rate": 2.367445123784705e-06, + "loss": 1.0611, + "step": 8797 + }, + { + "epoch": 2.65, + "grad_norm": 16.93478012084961, + "learning_rate": 2.365440513180315e-06, + "loss": 1.6995, + "step": 8798 + }, + { + "epoch": 2.65, + "grad_norm": 11.499807357788086, + "learning_rate": 2.363435902575925e-06, + "loss": 1.3401, + "step": 8799 + }, + { + "epoch": 2.65, + "grad_norm": 33.20011901855469, + "learning_rate": 2.3614312919715346e-06, + "loss": 2.086, + "step": 8800 + }, + { + "epoch": 2.65, + "grad_norm": 10.58979606628418, + "learning_rate": 2.3594266813671447e-06, + "loss": 1.1518, + "step": 8801 + }, + { + "epoch": 2.65, + "grad_norm": 15.411652565002441, + "learning_rate": 2.3574220707627547e-06, + "loss": 0.9793, + "step": 8802 + }, + { + "epoch": 2.65, + "grad_norm": 68.01510620117188, + "learning_rate": 2.3554174601583644e-06, + "loss": 2.0531, + "step": 8803 + }, + { + "epoch": 2.65, + "grad_norm": 56.63874053955078, + "learning_rate": 2.3534128495539745e-06, + "loss": 1.1724, + "step": 8804 + }, + { + "epoch": 2.65, + "grad_norm": 60.3148193359375, + "learning_rate": 2.351408238949584e-06, + "loss": 2.7668, + "step": 8805 + }, + { + "epoch": 2.65, + "grad_norm": 38.45302963256836, + "learning_rate": 2.3494036283451942e-06, + "loss": 1.0094, + "step": 8806 + }, + { + "epoch": 2.65, + "grad_norm": 10.030008316040039, + "learning_rate": 2.347399017740804e-06, + "loss": 1.0792, + "step": 8807 + }, + { + "epoch": 2.65, + "grad_norm": 10.08592700958252, + "learning_rate": 2.345394407136414e-06, + "loss": 0.7964, + "step": 8808 + }, + { + "epoch": 2.65, + "grad_norm": 125.09164428710938, + "learning_rate": 2.3433897965320236e-06, + "loss": 3.0581, + "step": 8809 + }, + { + "epoch": 2.65, + "grad_norm": 19.070087432861328, + "learning_rate": 2.3413851859276337e-06, + "loss": 0.8685, + "step": 8810 + }, + { + "epoch": 2.65, + "grad_norm": 10.671518325805664, + "learning_rate": 2.339380575323244e-06, + "loss": 1.1835, + "step": 8811 + }, + { + "epoch": 2.65, + "grad_norm": 27.613374710083008, + "learning_rate": 2.3373759647188535e-06, + "loss": 1.4339, + "step": 8812 + }, + { + "epoch": 2.65, + "grad_norm": 18.462194442749023, + "learning_rate": 2.3353713541144636e-06, + "loss": 1.0176, + "step": 8813 + }, + { + "epoch": 2.65, + "grad_norm": 13.578800201416016, + "learning_rate": 2.3333667435100736e-06, + "loss": 2.1367, + "step": 8814 + }, + { + "epoch": 2.65, + "grad_norm": 43.0266227722168, + "learning_rate": 2.3313621329056833e-06, + "loss": 2.1957, + "step": 8815 + }, + { + "epoch": 2.65, + "grad_norm": 40.213008880615234, + "learning_rate": 2.3293575223012934e-06, + "loss": 1.9988, + "step": 8816 + }, + { + "epoch": 2.65, + "grad_norm": 14.245367050170898, + "learning_rate": 2.327352911696903e-06, + "loss": 1.891, + "step": 8817 + }, + { + "epoch": 2.65, + "grad_norm": 13.524105072021484, + "learning_rate": 2.3253483010925127e-06, + "loss": 1.4238, + "step": 8818 + }, + { + "epoch": 2.65, + "grad_norm": 22.680530548095703, + "learning_rate": 2.323343690488123e-06, + "loss": 1.2083, + "step": 8819 + }, + { + "epoch": 2.65, + "grad_norm": 13.493782043457031, + "learning_rate": 2.3213390798837325e-06, + "loss": 1.1022, + "step": 8820 + }, + { + "epoch": 2.65, + "grad_norm": 10.675830841064453, + "learning_rate": 2.3193344692793425e-06, + "loss": 0.5942, + "step": 8821 + }, + { + "epoch": 2.65, + "grad_norm": 12.953835487365723, + "learning_rate": 2.3173298586749526e-06, + "loss": 1.445, + "step": 8822 + }, + { + "epoch": 2.65, + "grad_norm": 10.076544761657715, + "learning_rate": 2.3153252480705623e-06, + "loss": 1.5124, + "step": 8823 + }, + { + "epoch": 2.65, + "grad_norm": 21.92123031616211, + "learning_rate": 2.3133206374661724e-06, + "loss": 1.8663, + "step": 8824 + }, + { + "epoch": 2.65, + "grad_norm": 25.028993606567383, + "learning_rate": 2.3113160268617824e-06, + "loss": 1.0345, + "step": 8825 + }, + { + "epoch": 2.65, + "grad_norm": 8.76278305053711, + "learning_rate": 2.309311416257392e-06, + "loss": 0.9227, + "step": 8826 + }, + { + "epoch": 2.65, + "grad_norm": 7.540186405181885, + "learning_rate": 2.307306805653002e-06, + "loss": 0.9597, + "step": 8827 + }, + { + "epoch": 2.65, + "grad_norm": 21.144914627075195, + "learning_rate": 2.3053021950486123e-06, + "loss": 1.3642, + "step": 8828 + }, + { + "epoch": 2.65, + "grad_norm": 17.2004451751709, + "learning_rate": 2.303297584444222e-06, + "loss": 0.6705, + "step": 8829 + }, + { + "epoch": 2.65, + "grad_norm": 8.921226501464844, + "learning_rate": 2.3012929738398316e-06, + "loss": 1.096, + "step": 8830 + }, + { + "epoch": 2.66, + "grad_norm": 13.363105773925781, + "learning_rate": 2.2992883632354417e-06, + "loss": 0.9989, + "step": 8831 + }, + { + "epoch": 2.66, + "grad_norm": 11.100980758666992, + "learning_rate": 2.2972837526310513e-06, + "loss": 1.4, + "step": 8832 + }, + { + "epoch": 2.66, + "grad_norm": 18.512340545654297, + "learning_rate": 2.2952791420266614e-06, + "loss": 1.3327, + "step": 8833 + }, + { + "epoch": 2.66, + "grad_norm": 52.814693450927734, + "learning_rate": 2.2932745314222715e-06, + "loss": 1.4743, + "step": 8834 + }, + { + "epoch": 2.66, + "grad_norm": 37.01776123046875, + "learning_rate": 2.291269920817881e-06, + "loss": 1.1938, + "step": 8835 + }, + { + "epoch": 2.66, + "grad_norm": 14.084249496459961, + "learning_rate": 2.2892653102134913e-06, + "loss": 1.4542, + "step": 8836 + }, + { + "epoch": 2.66, + "grad_norm": 34.67374038696289, + "learning_rate": 2.2872606996091013e-06, + "loss": 1.9464, + "step": 8837 + }, + { + "epoch": 2.66, + "grad_norm": 16.59427833557129, + "learning_rate": 2.285256089004711e-06, + "loss": 1.5703, + "step": 8838 + }, + { + "epoch": 2.66, + "grad_norm": 20.186216354370117, + "learning_rate": 2.283251478400321e-06, + "loss": 1.527, + "step": 8839 + }, + { + "epoch": 2.66, + "grad_norm": 34.25483703613281, + "learning_rate": 2.281246867795931e-06, + "loss": 2.5718, + "step": 8840 + }, + { + "epoch": 2.66, + "grad_norm": 28.123106002807617, + "learning_rate": 2.279242257191541e-06, + "loss": 1.5148, + "step": 8841 + }, + { + "epoch": 2.66, + "grad_norm": 55.747276306152344, + "learning_rate": 2.2772376465871505e-06, + "loss": 1.7577, + "step": 8842 + }, + { + "epoch": 2.66, + "grad_norm": 21.762004852294922, + "learning_rate": 2.2752330359827606e-06, + "loss": 1.3397, + "step": 8843 + }, + { + "epoch": 2.66, + "grad_norm": 38.36250305175781, + "learning_rate": 2.2732284253783702e-06, + "loss": 1.2055, + "step": 8844 + }, + { + "epoch": 2.66, + "grad_norm": 14.818611145019531, + "learning_rate": 2.2712238147739803e-06, + "loss": 1.3097, + "step": 8845 + }, + { + "epoch": 2.66, + "grad_norm": 7.749112129211426, + "learning_rate": 2.26921920416959e-06, + "loss": 0.4492, + "step": 8846 + }, + { + "epoch": 2.66, + "grad_norm": 10.357666969299316, + "learning_rate": 2.2672145935652e-06, + "loss": 0.9897, + "step": 8847 + }, + { + "epoch": 2.66, + "grad_norm": 15.227676391601562, + "learning_rate": 2.26520998296081e-06, + "loss": 1.3606, + "step": 8848 + }, + { + "epoch": 2.66, + "grad_norm": 16.850852966308594, + "learning_rate": 2.26320537235642e-06, + "loss": 1.4527, + "step": 8849 + }, + { + "epoch": 2.66, + "grad_norm": 16.4047794342041, + "learning_rate": 2.26120076175203e-06, + "loss": 1.6789, + "step": 8850 + }, + { + "epoch": 2.66, + "grad_norm": 7.696395397186279, + "learning_rate": 2.25919615114764e-06, + "loss": 0.9858, + "step": 8851 + }, + { + "epoch": 2.66, + "grad_norm": 21.263986587524414, + "learning_rate": 2.2571915405432496e-06, + "loss": 1.7351, + "step": 8852 + }, + { + "epoch": 2.66, + "grad_norm": 7.717570781707764, + "learning_rate": 2.2551869299388597e-06, + "loss": 1.0206, + "step": 8853 + }, + { + "epoch": 2.66, + "grad_norm": 14.79688549041748, + "learning_rate": 2.2531823193344694e-06, + "loss": 1.7117, + "step": 8854 + }, + { + "epoch": 2.66, + "grad_norm": 8.93305492401123, + "learning_rate": 2.2511777087300795e-06, + "loss": 1.839, + "step": 8855 + }, + { + "epoch": 2.66, + "grad_norm": 13.082503318786621, + "learning_rate": 2.249173098125689e-06, + "loss": 1.252, + "step": 8856 + }, + { + "epoch": 2.66, + "grad_norm": 11.024530410766602, + "learning_rate": 2.2471684875212992e-06, + "loss": 0.5396, + "step": 8857 + }, + { + "epoch": 2.66, + "grad_norm": 30.2106876373291, + "learning_rate": 2.245163876916909e-06, + "loss": 1.8259, + "step": 8858 + }, + { + "epoch": 2.66, + "grad_norm": 12.592206954956055, + "learning_rate": 2.243159266312519e-06, + "loss": 2.2187, + "step": 8859 + }, + { + "epoch": 2.66, + "grad_norm": 29.125699996948242, + "learning_rate": 2.241154655708129e-06, + "loss": 1.1455, + "step": 8860 + }, + { + "epoch": 2.66, + "grad_norm": 16.28235626220703, + "learning_rate": 2.2391500451037387e-06, + "loss": 1.9047, + "step": 8861 + }, + { + "epoch": 2.66, + "grad_norm": 30.546987533569336, + "learning_rate": 2.237145434499349e-06, + "loss": 1.7441, + "step": 8862 + }, + { + "epoch": 2.66, + "grad_norm": 12.745742797851562, + "learning_rate": 2.2351408238949585e-06, + "loss": 0.5604, + "step": 8863 + }, + { + "epoch": 2.67, + "grad_norm": 10.053805351257324, + "learning_rate": 2.2331362132905685e-06, + "loss": 1.467, + "step": 8864 + }, + { + "epoch": 2.67, + "grad_norm": 22.58064842224121, + "learning_rate": 2.2311316026861786e-06, + "loss": 1.3182, + "step": 8865 + }, + { + "epoch": 2.67, + "grad_norm": 10.636580467224121, + "learning_rate": 2.2291269920817883e-06, + "loss": 1.3195, + "step": 8866 + }, + { + "epoch": 2.67, + "grad_norm": 18.260143280029297, + "learning_rate": 2.2271223814773984e-06, + "loss": 1.3604, + "step": 8867 + }, + { + "epoch": 2.67, + "grad_norm": 8.831524848937988, + "learning_rate": 2.225117770873008e-06, + "loss": 1.1861, + "step": 8868 + }, + { + "epoch": 2.67, + "grad_norm": 13.567652702331543, + "learning_rate": 2.2231131602686177e-06, + "loss": 1.0606, + "step": 8869 + }, + { + "epoch": 2.67, + "grad_norm": 31.04169273376465, + "learning_rate": 2.2211085496642278e-06, + "loss": 1.1305, + "step": 8870 + }, + { + "epoch": 2.67, + "grad_norm": 17.62767219543457, + "learning_rate": 2.219103939059838e-06, + "loss": 1.8986, + "step": 8871 + }, + { + "epoch": 2.67, + "grad_norm": 31.392688751220703, + "learning_rate": 2.2170993284554475e-06, + "loss": 2.3315, + "step": 8872 + }, + { + "epoch": 2.67, + "grad_norm": 18.428953170776367, + "learning_rate": 2.2150947178510576e-06, + "loss": 1.559, + "step": 8873 + }, + { + "epoch": 2.67, + "grad_norm": 24.9808292388916, + "learning_rate": 2.2130901072466677e-06, + "loss": 1.6667, + "step": 8874 + }, + { + "epoch": 2.67, + "grad_norm": 23.458446502685547, + "learning_rate": 2.2110854966422773e-06, + "loss": 0.8407, + "step": 8875 + }, + { + "epoch": 2.67, + "grad_norm": 10.7035493850708, + "learning_rate": 2.2090808860378874e-06, + "loss": 1.3342, + "step": 8876 + }, + { + "epoch": 2.67, + "grad_norm": 21.982637405395508, + "learning_rate": 2.2070762754334975e-06, + "loss": 1.8623, + "step": 8877 + }, + { + "epoch": 2.67, + "grad_norm": 10.788835525512695, + "learning_rate": 2.205071664829107e-06, + "loss": 1.14, + "step": 8878 + }, + { + "epoch": 2.67, + "grad_norm": 12.52689266204834, + "learning_rate": 2.2030670542247173e-06, + "loss": 0.8905, + "step": 8879 + }, + { + "epoch": 2.67, + "grad_norm": 12.546585083007812, + "learning_rate": 2.201062443620327e-06, + "loss": 1.4123, + "step": 8880 + }, + { + "epoch": 2.67, + "eval_loss": 0.1634318232536316, + "eval_runtime": 43.2728, + "eval_samples_per_second": 34.179, + "eval_steps_per_second": 34.179, + "step": 8880 + }, + { + "epoch": 2.67, + "grad_norm": 23.065811157226562, + "learning_rate": 2.1990578330159366e-06, + "loss": 1.6073, + "step": 8881 + }, + { + "epoch": 2.67, + "grad_norm": 8.505629539489746, + "learning_rate": 2.1970532224115467e-06, + "loss": 0.6363, + "step": 8882 + }, + { + "epoch": 2.67, + "grad_norm": 19.605792999267578, + "learning_rate": 2.1950486118071563e-06, + "loss": 1.1445, + "step": 8883 + }, + { + "epoch": 2.67, + "grad_norm": 62.97642517089844, + "learning_rate": 2.1930440012027664e-06, + "loss": 3.0308, + "step": 8884 + }, + { + "epoch": 2.67, + "grad_norm": 34.175453186035156, + "learning_rate": 2.1910393905983765e-06, + "loss": 1.2063, + "step": 8885 + }, + { + "epoch": 2.67, + "grad_norm": 41.114051818847656, + "learning_rate": 2.189034779993986e-06, + "loss": 1.2233, + "step": 8886 + }, + { + "epoch": 2.67, + "grad_norm": 22.9063663482666, + "learning_rate": 2.1870301693895962e-06, + "loss": 1.1179, + "step": 8887 + }, + { + "epoch": 2.67, + "grad_norm": 12.28271770477295, + "learning_rate": 2.1850255587852063e-06, + "loss": 1.1284, + "step": 8888 + }, + { + "epoch": 2.67, + "grad_norm": 18.61870765686035, + "learning_rate": 2.183020948180816e-06, + "loss": 1.1018, + "step": 8889 + }, + { + "epoch": 2.67, + "grad_norm": 14.14230728149414, + "learning_rate": 2.181016337576426e-06, + "loss": 2.1612, + "step": 8890 + }, + { + "epoch": 2.67, + "grad_norm": 17.895893096923828, + "learning_rate": 2.179011726972036e-06, + "loss": 2.1376, + "step": 8891 + }, + { + "epoch": 2.67, + "grad_norm": 10.05941390991211, + "learning_rate": 2.177007116367646e-06, + "loss": 1.9443, + "step": 8892 + }, + { + "epoch": 2.67, + "grad_norm": 17.179000854492188, + "learning_rate": 2.1750025057632555e-06, + "loss": 1.3958, + "step": 8893 + }, + { + "epoch": 2.67, + "grad_norm": 18.431400299072266, + "learning_rate": 2.1729978951588656e-06, + "loss": 1.2758, + "step": 8894 + }, + { + "epoch": 2.67, + "grad_norm": 18.084781646728516, + "learning_rate": 2.1709932845544752e-06, + "loss": 1.7674, + "step": 8895 + }, + { + "epoch": 2.67, + "grad_norm": 15.693217277526855, + "learning_rate": 2.1689886739500853e-06, + "loss": 1.0816, + "step": 8896 + }, + { + "epoch": 2.67, + "grad_norm": 18.634521484375, + "learning_rate": 2.1669840633456954e-06, + "loss": 1.1063, + "step": 8897 + }, + { + "epoch": 2.68, + "grad_norm": 11.23396110534668, + "learning_rate": 2.164979452741305e-06, + "loss": 0.6603, + "step": 8898 + }, + { + "epoch": 2.68, + "grad_norm": 8.881089210510254, + "learning_rate": 2.162974842136915e-06, + "loss": 0.7917, + "step": 8899 + }, + { + "epoch": 2.68, + "grad_norm": 6.563168525695801, + "learning_rate": 2.1609702315325252e-06, + "loss": 0.7384, + "step": 8900 + }, + { + "epoch": 2.68, + "grad_norm": 45.68901824951172, + "learning_rate": 2.158965620928135e-06, + "loss": 2.6647, + "step": 8901 + }, + { + "epoch": 2.68, + "grad_norm": 10.925772666931152, + "learning_rate": 2.156961010323745e-06, + "loss": 0.5719, + "step": 8902 + }, + { + "epoch": 2.68, + "grad_norm": 52.76468276977539, + "learning_rate": 2.1549563997193546e-06, + "loss": 1.5088, + "step": 8903 + }, + { + "epoch": 2.68, + "grad_norm": 17.248638153076172, + "learning_rate": 2.1529517891149647e-06, + "loss": 1.2967, + "step": 8904 + }, + { + "epoch": 2.68, + "grad_norm": 51.29323196411133, + "learning_rate": 2.1509471785105744e-06, + "loss": 1.6994, + "step": 8905 + }, + { + "epoch": 2.68, + "grad_norm": 42.033302307128906, + "learning_rate": 2.1489425679061845e-06, + "loss": 1.3593, + "step": 8906 + }, + { + "epoch": 2.68, + "grad_norm": 9.53936767578125, + "learning_rate": 2.146937957301794e-06, + "loss": 0.7497, + "step": 8907 + }, + { + "epoch": 2.68, + "grad_norm": 29.297077178955078, + "learning_rate": 2.144933346697404e-06, + "loss": 1.042, + "step": 8908 + }, + { + "epoch": 2.68, + "grad_norm": 37.91065216064453, + "learning_rate": 2.142928736093014e-06, + "loss": 2.1151, + "step": 8909 + }, + { + "epoch": 2.68, + "grad_norm": 24.55820655822754, + "learning_rate": 2.140924125488624e-06, + "loss": 1.9639, + "step": 8910 + }, + { + "epoch": 2.68, + "grad_norm": 30.16676902770996, + "learning_rate": 2.138919514884234e-06, + "loss": 1.3401, + "step": 8911 + }, + { + "epoch": 2.68, + "grad_norm": 47.085235595703125, + "learning_rate": 2.1369149042798437e-06, + "loss": 2.4905, + "step": 8912 + }, + { + "epoch": 2.68, + "grad_norm": 19.413427352905273, + "learning_rate": 2.1349102936754538e-06, + "loss": 1.4043, + "step": 8913 + }, + { + "epoch": 2.68, + "grad_norm": 31.471092224121094, + "learning_rate": 2.132905683071064e-06, + "loss": 1.9347, + "step": 8914 + }, + { + "epoch": 2.68, + "grad_norm": 26.705747604370117, + "learning_rate": 2.1309010724666735e-06, + "loss": 2.8265, + "step": 8915 + }, + { + "epoch": 2.68, + "grad_norm": 43.82736587524414, + "learning_rate": 2.1288964618622836e-06, + "loss": 1.3732, + "step": 8916 + }, + { + "epoch": 2.68, + "grad_norm": 12.801732063293457, + "learning_rate": 2.1268918512578933e-06, + "loss": 0.7379, + "step": 8917 + }, + { + "epoch": 2.68, + "grad_norm": 14.50540828704834, + "learning_rate": 2.1248872406535033e-06, + "loss": 1.2645, + "step": 8918 + }, + { + "epoch": 2.68, + "grad_norm": 55.6925163269043, + "learning_rate": 2.122882630049113e-06, + "loss": 1.05, + "step": 8919 + }, + { + "epoch": 2.68, + "grad_norm": 8.958430290222168, + "learning_rate": 2.120878019444723e-06, + "loss": 0.9973, + "step": 8920 + }, + { + "epoch": 2.68, + "grad_norm": 12.112589836120605, + "learning_rate": 2.1188734088403328e-06, + "loss": 0.5669, + "step": 8921 + }, + { + "epoch": 2.68, + "grad_norm": 9.585906982421875, + "learning_rate": 2.116868798235943e-06, + "loss": 1.455, + "step": 8922 + }, + { + "epoch": 2.68, + "grad_norm": 10.292905807495117, + "learning_rate": 2.114864187631553e-06, + "loss": 0.822, + "step": 8923 + }, + { + "epoch": 2.68, + "grad_norm": 12.411725997924805, + "learning_rate": 2.1128595770271626e-06, + "loss": 0.5915, + "step": 8924 + }, + { + "epoch": 2.68, + "grad_norm": 95.50955200195312, + "learning_rate": 2.1108549664227727e-06, + "loss": 1.2842, + "step": 8925 + }, + { + "epoch": 2.68, + "grad_norm": 15.629396438598633, + "learning_rate": 2.1088503558183823e-06, + "loss": 1.6067, + "step": 8926 + }, + { + "epoch": 2.68, + "grad_norm": 15.7233304977417, + "learning_rate": 2.1068457452139924e-06, + "loss": 0.9173, + "step": 8927 + }, + { + "epoch": 2.68, + "grad_norm": 30.911314010620117, + "learning_rate": 2.1048411346096025e-06, + "loss": 2.3595, + "step": 8928 + }, + { + "epoch": 2.68, + "grad_norm": 12.196179389953613, + "learning_rate": 2.102836524005212e-06, + "loss": 1.0742, + "step": 8929 + }, + { + "epoch": 2.68, + "grad_norm": 13.497405052185059, + "learning_rate": 2.1008319134008222e-06, + "loss": 1.1664, + "step": 8930 + }, + { + "epoch": 2.69, + "grad_norm": 14.204902648925781, + "learning_rate": 2.098827302796432e-06, + "loss": 1.0476, + "step": 8931 + }, + { + "epoch": 2.69, + "grad_norm": 21.779407501220703, + "learning_rate": 2.0968226921920416e-06, + "loss": 1.0798, + "step": 8932 + }, + { + "epoch": 2.69, + "grad_norm": 30.279451370239258, + "learning_rate": 2.0948180815876516e-06, + "loss": 1.2667, + "step": 8933 + }, + { + "epoch": 2.69, + "grad_norm": 13.741867065429688, + "learning_rate": 2.0928134709832617e-06, + "loss": 0.93, + "step": 8934 + }, + { + "epoch": 2.69, + "grad_norm": 9.646475791931152, + "learning_rate": 2.0908088603788714e-06, + "loss": 1.304, + "step": 8935 + }, + { + "epoch": 2.69, + "grad_norm": 13.321015357971191, + "learning_rate": 2.0888042497744815e-06, + "loss": 1.1108, + "step": 8936 + }, + { + "epoch": 2.69, + "grad_norm": 15.75296688079834, + "learning_rate": 2.0867996391700916e-06, + "loss": 0.9929, + "step": 8937 + }, + { + "epoch": 2.69, + "grad_norm": 24.70446014404297, + "learning_rate": 2.0847950285657012e-06, + "loss": 1.2706, + "step": 8938 + }, + { + "epoch": 2.69, + "grad_norm": 13.372381210327148, + "learning_rate": 2.0827904179613113e-06, + "loss": 0.6046, + "step": 8939 + }, + { + "epoch": 2.69, + "grad_norm": 40.783939361572266, + "learning_rate": 2.0807858073569214e-06, + "loss": 1.5435, + "step": 8940 + }, + { + "epoch": 2.69, + "grad_norm": 8.127727508544922, + "learning_rate": 2.078781196752531e-06, + "loss": 0.8914, + "step": 8941 + }, + { + "epoch": 2.69, + "grad_norm": 16.346284866333008, + "learning_rate": 2.076776586148141e-06, + "loss": 1.139, + "step": 8942 + }, + { + "epoch": 2.69, + "grad_norm": 83.08602142333984, + "learning_rate": 2.074771975543751e-06, + "loss": 2.625, + "step": 8943 + }, + { + "epoch": 2.69, + "grad_norm": 13.884016036987305, + "learning_rate": 2.0727673649393605e-06, + "loss": 1.0258, + "step": 8944 + }, + { + "epoch": 2.69, + "grad_norm": 11.885502815246582, + "learning_rate": 2.0707627543349705e-06, + "loss": 0.8579, + "step": 8945 + }, + { + "epoch": 2.69, + "grad_norm": 9.849443435668945, + "learning_rate": 2.06875814373058e-06, + "loss": 0.9329, + "step": 8946 + }, + { + "epoch": 2.69, + "grad_norm": 9.109235763549805, + "learning_rate": 2.0667535331261903e-06, + "loss": 0.9577, + "step": 8947 + }, + { + "epoch": 2.69, + "grad_norm": 10.755121231079102, + "learning_rate": 2.0647489225218004e-06, + "loss": 0.6667, + "step": 8948 + }, + { + "epoch": 2.69, + "grad_norm": 10.410956382751465, + "learning_rate": 2.06274431191741e-06, + "loss": 0.4745, + "step": 8949 + }, + { + "epoch": 2.69, + "grad_norm": 10.473151206970215, + "learning_rate": 2.06073970131302e-06, + "loss": 1.2898, + "step": 8950 + }, + { + "epoch": 2.69, + "grad_norm": 14.992378234863281, + "learning_rate": 2.05873509070863e-06, + "loss": 1.5699, + "step": 8951 + }, + { + "epoch": 2.69, + "grad_norm": 9.013774871826172, + "learning_rate": 2.05673048010424e-06, + "loss": 1.1274, + "step": 8952 + }, + { + "epoch": 2.69, + "grad_norm": 18.34583854675293, + "learning_rate": 2.05472586949985e-06, + "loss": 1.1243, + "step": 8953 + }, + { + "epoch": 2.69, + "grad_norm": 9.34281063079834, + "learning_rate": 2.0527212588954596e-06, + "loss": 0.977, + "step": 8954 + }, + { + "epoch": 2.69, + "grad_norm": 19.33155632019043, + "learning_rate": 2.0507166482910697e-06, + "loss": 0.7991, + "step": 8955 + }, + { + "epoch": 2.69, + "grad_norm": 62.87168502807617, + "learning_rate": 2.0487120376866794e-06, + "loss": 1.8296, + "step": 8956 + }, + { + "epoch": 2.69, + "grad_norm": 14.451416015625, + "learning_rate": 2.0467074270822894e-06, + "loss": 0.9752, + "step": 8957 + }, + { + "epoch": 2.69, + "grad_norm": 13.052489280700684, + "learning_rate": 2.044702816477899e-06, + "loss": 1.0093, + "step": 8958 + }, + { + "epoch": 2.69, + "grad_norm": 10.156307220458984, + "learning_rate": 2.042698205873509e-06, + "loss": 1.142, + "step": 8959 + }, + { + "epoch": 2.69, + "grad_norm": 32.259788513183594, + "learning_rate": 2.0406935952691193e-06, + "loss": 2.6409, + "step": 8960 + }, + { + "epoch": 2.69, + "grad_norm": 35.24033737182617, + "learning_rate": 2.038688984664729e-06, + "loss": 1.7325, + "step": 8961 + }, + { + "epoch": 2.69, + "grad_norm": 15.073776245117188, + "learning_rate": 2.036684374060339e-06, + "loss": 0.7844, + "step": 8962 + }, + { + "epoch": 2.69, + "grad_norm": 26.254253387451172, + "learning_rate": 2.034679763455949e-06, + "loss": 0.9144, + "step": 8963 + }, + { + "epoch": 2.7, + "grad_norm": 10.655972480773926, + "learning_rate": 2.0326751528515588e-06, + "loss": 1.5722, + "step": 8964 + }, + { + "epoch": 2.7, + "grad_norm": 15.8784761428833, + "learning_rate": 2.030670542247169e-06, + "loss": 1.5385, + "step": 8965 + }, + { + "epoch": 2.7, + "grad_norm": 75.25648498535156, + "learning_rate": 2.0286659316427785e-06, + "loss": 1.2864, + "step": 8966 + }, + { + "epoch": 2.7, + "grad_norm": 12.602245330810547, + "learning_rate": 2.0266613210383886e-06, + "loss": 1.6121, + "step": 8967 + }, + { + "epoch": 2.7, + "grad_norm": 26.045513153076172, + "learning_rate": 2.0246567104339982e-06, + "loss": 1.1818, + "step": 8968 + }, + { + "epoch": 2.7, + "grad_norm": 15.361552238464355, + "learning_rate": 2.0226520998296083e-06, + "loss": 0.8867, + "step": 8969 + }, + { + "epoch": 2.7, + "grad_norm": 52.4283561706543, + "learning_rate": 2.020647489225218e-06, + "loss": 2.3197, + "step": 8970 + }, + { + "epoch": 2.7, + "grad_norm": 10.277100563049316, + "learning_rate": 2.018642878620828e-06, + "loss": 1.2036, + "step": 8971 + }, + { + "epoch": 2.7, + "grad_norm": 56.391624450683594, + "learning_rate": 2.0166382680164377e-06, + "loss": 1.6374, + "step": 8972 + }, + { + "epoch": 2.7, + "grad_norm": 16.56217384338379, + "learning_rate": 2.014633657412048e-06, + "loss": 1.2358, + "step": 8973 + }, + { + "epoch": 2.7, + "grad_norm": 30.0115966796875, + "learning_rate": 2.012629046807658e-06, + "loss": 1.2294, + "step": 8974 + }, + { + "epoch": 2.7, + "grad_norm": 10.70960807800293, + "learning_rate": 2.0106244362032676e-06, + "loss": 1.2672, + "step": 8975 + }, + { + "epoch": 2.7, + "grad_norm": 12.158578872680664, + "learning_rate": 2.0086198255988776e-06, + "loss": 0.794, + "step": 8976 + }, + { + "epoch": 2.7, + "grad_norm": 6.594705104827881, + "learning_rate": 2.0066152149944877e-06, + "loss": 0.964, + "step": 8977 + }, + { + "epoch": 2.7, + "grad_norm": 18.69150161743164, + "learning_rate": 2.0046106043900974e-06, + "loss": 1.1616, + "step": 8978 + }, + { + "epoch": 2.7, + "grad_norm": 19.576065063476562, + "learning_rate": 2.0026059937857075e-06, + "loss": 0.9053, + "step": 8979 + }, + { + "epoch": 2.7, + "grad_norm": 30.62327003479004, + "learning_rate": 2.000601383181317e-06, + "loss": 1.0141, + "step": 8980 + }, + { + "epoch": 2.7, + "grad_norm": 16.1602840423584, + "learning_rate": 1.9985967725769272e-06, + "loss": 1.0071, + "step": 8981 + }, + { + "epoch": 2.7, + "grad_norm": 16.035024642944336, + "learning_rate": 1.996592161972537e-06, + "loss": 1.9886, + "step": 8982 + }, + { + "epoch": 2.7, + "grad_norm": 32.672157287597656, + "learning_rate": 1.994587551368147e-06, + "loss": 1.9142, + "step": 8983 + }, + { + "epoch": 2.7, + "grad_norm": 34.7219352722168, + "learning_rate": 1.9925829407637566e-06, + "loss": 1.6279, + "step": 8984 + }, + { + "epoch": 2.7, + "grad_norm": 38.268131256103516, + "learning_rate": 1.9905783301593667e-06, + "loss": 1.3426, + "step": 8985 + }, + { + "epoch": 2.7, + "grad_norm": 10.52354621887207, + "learning_rate": 1.988573719554977e-06, + "loss": 1.3539, + "step": 8986 + }, + { + "epoch": 2.7, + "grad_norm": 19.19072914123535, + "learning_rate": 1.9865691089505865e-06, + "loss": 1.7263, + "step": 8987 + }, + { + "epoch": 2.7, + "grad_norm": 35.17476272583008, + "learning_rate": 1.9845644983461965e-06, + "loss": 1.2377, + "step": 8988 + }, + { + "epoch": 2.7, + "grad_norm": 23.662670135498047, + "learning_rate": 1.982559887741806e-06, + "loss": 0.9037, + "step": 8989 + }, + { + "epoch": 2.7, + "grad_norm": 25.2396297454834, + "learning_rate": 1.9805552771374163e-06, + "loss": 1.8102, + "step": 8990 + }, + { + "epoch": 2.7, + "grad_norm": 8.776323318481445, + "learning_rate": 1.9785506665330264e-06, + "loss": 0.7533, + "step": 8991 + }, + { + "epoch": 2.7, + "grad_norm": 16.68621253967285, + "learning_rate": 1.976546055928636e-06, + "loss": 1.3865, + "step": 8992 + }, + { + "epoch": 2.7, + "grad_norm": 17.21820640563965, + "learning_rate": 1.974541445324246e-06, + "loss": 1.0538, + "step": 8993 + }, + { + "epoch": 2.7, + "grad_norm": 13.057286262512207, + "learning_rate": 1.9725368347198558e-06, + "loss": 1.1173, + "step": 8994 + }, + { + "epoch": 2.7, + "grad_norm": 16.948637008666992, + "learning_rate": 1.9705322241154654e-06, + "loss": 0.5743, + "step": 8995 + }, + { + "epoch": 2.7, + "grad_norm": 11.57693099975586, + "learning_rate": 1.9685276135110755e-06, + "loss": 1.2016, + "step": 8996 + }, + { + "epoch": 2.71, + "grad_norm": 25.845434188842773, + "learning_rate": 1.9665230029066856e-06, + "loss": 1.6141, + "step": 8997 + }, + { + "epoch": 2.71, + "grad_norm": 11.109925270080566, + "learning_rate": 1.9645183923022953e-06, + "loss": 1.2974, + "step": 8998 + }, + { + "epoch": 2.71, + "grad_norm": 17.424144744873047, + "learning_rate": 1.9625137816979054e-06, + "loss": 1.0136, + "step": 8999 + }, + { + "epoch": 2.71, + "grad_norm": 13.590910911560059, + "learning_rate": 1.9605091710935154e-06, + "loss": 2.0274, + "step": 9000 + }, + { + "epoch": 2.71, + "eval_loss": 0.16217102110385895, + "eval_runtime": 43.386, + "eval_samples_per_second": 34.089, + "eval_steps_per_second": 34.089, + "step": 9000 + }, + { + "epoch": 2.71, + "grad_norm": 16.488697052001953, + "learning_rate": 1.958504560489125e-06, + "loss": 1.3625, + "step": 9001 + }, + { + "epoch": 2.71, + "grad_norm": 9.223793983459473, + "learning_rate": 1.956499949884735e-06, + "loss": 1.2811, + "step": 9002 + }, + { + "epoch": 2.71, + "grad_norm": 13.224336624145508, + "learning_rate": 1.9544953392803453e-06, + "loss": 1.5226, + "step": 9003 + }, + { + "epoch": 2.71, + "grad_norm": 39.96490478515625, + "learning_rate": 1.952490728675955e-06, + "loss": 0.9551, + "step": 9004 + }, + { + "epoch": 2.71, + "grad_norm": 8.030670166015625, + "learning_rate": 1.9504861180715646e-06, + "loss": 0.5237, + "step": 9005 + }, + { + "epoch": 2.71, + "grad_norm": 45.17795944213867, + "learning_rate": 1.9484815074671747e-06, + "loss": 1.1533, + "step": 9006 + }, + { + "epoch": 2.71, + "grad_norm": 13.118334770202637, + "learning_rate": 1.9464768968627843e-06, + "loss": 1.3035, + "step": 9007 + }, + { + "epoch": 2.71, + "grad_norm": 13.178396224975586, + "learning_rate": 1.9444722862583944e-06, + "loss": 1.2787, + "step": 9008 + }, + { + "epoch": 2.71, + "grad_norm": 12.746395111083984, + "learning_rate": 1.942467675654004e-06, + "loss": 1.3914, + "step": 9009 + }, + { + "epoch": 2.71, + "grad_norm": 25.33348274230957, + "learning_rate": 1.940463065049614e-06, + "loss": 1.2344, + "step": 9010 + }, + { + "epoch": 2.71, + "grad_norm": 18.61649513244629, + "learning_rate": 1.9384584544452242e-06, + "loss": 0.9396, + "step": 9011 + }, + { + "epoch": 2.71, + "grad_norm": 98.35498809814453, + "learning_rate": 1.936453843840834e-06, + "loss": 2.0006, + "step": 9012 + }, + { + "epoch": 2.71, + "grad_norm": 12.018935203552246, + "learning_rate": 1.934449233236444e-06, + "loss": 1.1039, + "step": 9013 + }, + { + "epoch": 2.71, + "grad_norm": 18.714567184448242, + "learning_rate": 1.932444622632054e-06, + "loss": 1.2751, + "step": 9014 + }, + { + "epoch": 2.71, + "grad_norm": 41.53782653808594, + "learning_rate": 1.9304400120276637e-06, + "loss": 1.7534, + "step": 9015 + }, + { + "epoch": 2.71, + "grad_norm": 37.50891876220703, + "learning_rate": 1.928435401423274e-06, + "loss": 1.1007, + "step": 9016 + }, + { + "epoch": 2.71, + "grad_norm": 14.983277320861816, + "learning_rate": 1.9264307908188835e-06, + "loss": 1.6123, + "step": 9017 + }, + { + "epoch": 2.71, + "grad_norm": 13.673733711242676, + "learning_rate": 1.9244261802144936e-06, + "loss": 1.0661, + "step": 9018 + }, + { + "epoch": 2.71, + "grad_norm": 9.97967529296875, + "learning_rate": 1.9224215696101032e-06, + "loss": 1.5572, + "step": 9019 + }, + { + "epoch": 2.71, + "grad_norm": 12.354124069213867, + "learning_rate": 1.9204169590057133e-06, + "loss": 1.2915, + "step": 9020 + }, + { + "epoch": 2.71, + "grad_norm": 11.94011116027832, + "learning_rate": 1.918412348401323e-06, + "loss": 1.3119, + "step": 9021 + }, + { + "epoch": 2.71, + "grad_norm": 11.489762306213379, + "learning_rate": 1.916407737796933e-06, + "loss": 1.1241, + "step": 9022 + }, + { + "epoch": 2.71, + "grad_norm": 26.69053840637207, + "learning_rate": 1.914403127192543e-06, + "loss": 1.3275, + "step": 9023 + }, + { + "epoch": 2.71, + "grad_norm": 15.625397682189941, + "learning_rate": 1.912398516588153e-06, + "loss": 1.2162, + "step": 9024 + }, + { + "epoch": 2.71, + "grad_norm": 23.008207321166992, + "learning_rate": 1.910393905983763e-06, + "loss": 1.4855, + "step": 9025 + }, + { + "epoch": 2.71, + "grad_norm": 122.37059783935547, + "learning_rate": 1.908389295379373e-06, + "loss": 1.5003, + "step": 9026 + }, + { + "epoch": 2.71, + "grad_norm": 47.32773208618164, + "learning_rate": 1.9063846847749826e-06, + "loss": 2.3885, + "step": 9027 + }, + { + "epoch": 2.71, + "grad_norm": 10.500248908996582, + "learning_rate": 1.9043800741705925e-06, + "loss": 1.3469, + "step": 9028 + }, + { + "epoch": 2.71, + "grad_norm": 12.691232681274414, + "learning_rate": 1.9023754635662024e-06, + "loss": 1.5982, + "step": 9029 + }, + { + "epoch": 2.71, + "grad_norm": 15.978590965270996, + "learning_rate": 1.9003708529618122e-06, + "loss": 0.9491, + "step": 9030 + }, + { + "epoch": 2.72, + "grad_norm": 22.821592330932617, + "learning_rate": 1.8983662423574223e-06, + "loss": 1.3128, + "step": 9031 + }, + { + "epoch": 2.72, + "grad_norm": 33.413455963134766, + "learning_rate": 1.896361631753032e-06, + "loss": 2.5185, + "step": 9032 + }, + { + "epoch": 2.72, + "grad_norm": 14.448841094970703, + "learning_rate": 1.894357021148642e-06, + "loss": 1.7291, + "step": 9033 + }, + { + "epoch": 2.72, + "grad_norm": 17.098773956298828, + "learning_rate": 1.892352410544252e-06, + "loss": 0.7803, + "step": 9034 + }, + { + "epoch": 2.72, + "grad_norm": 22.17733383178711, + "learning_rate": 1.8903477999398618e-06, + "loss": 2.054, + "step": 9035 + }, + { + "epoch": 2.72, + "grad_norm": 18.0053768157959, + "learning_rate": 1.8883431893354717e-06, + "loss": 0.9891, + "step": 9036 + }, + { + "epoch": 2.72, + "grad_norm": 7.076298713684082, + "learning_rate": 1.8863385787310818e-06, + "loss": 0.6043, + "step": 9037 + }, + { + "epoch": 2.72, + "grad_norm": 10.31680965423584, + "learning_rate": 1.8843339681266914e-06, + "loss": 0.6896, + "step": 9038 + }, + { + "epoch": 2.72, + "grad_norm": 34.10591125488281, + "learning_rate": 1.8823293575223015e-06, + "loss": 1.6691, + "step": 9039 + }, + { + "epoch": 2.72, + "grad_norm": 19.84099006652832, + "learning_rate": 1.8803247469179114e-06, + "loss": 1.7946, + "step": 9040 + }, + { + "epoch": 2.72, + "grad_norm": 16.761245727539062, + "learning_rate": 1.8783201363135213e-06, + "loss": 1.7972, + "step": 9041 + }, + { + "epoch": 2.72, + "grad_norm": 19.77780532836914, + "learning_rate": 1.8763155257091311e-06, + "loss": 1.0711, + "step": 9042 + }, + { + "epoch": 2.72, + "grad_norm": 12.905445098876953, + "learning_rate": 1.8743109151047412e-06, + "loss": 0.79, + "step": 9043 + }, + { + "epoch": 2.72, + "grad_norm": 34.32182312011719, + "learning_rate": 1.8723063045003509e-06, + "loss": 1.5714, + "step": 9044 + }, + { + "epoch": 2.72, + "grad_norm": 15.384803771972656, + "learning_rate": 1.870301693895961e-06, + "loss": 0.9761, + "step": 9045 + }, + { + "epoch": 2.72, + "grad_norm": 10.162015914916992, + "learning_rate": 1.8682970832915708e-06, + "loss": 1.0334, + "step": 9046 + }, + { + "epoch": 2.72, + "grad_norm": 21.561569213867188, + "learning_rate": 1.8662924726871807e-06, + "loss": 0.7688, + "step": 9047 + }, + { + "epoch": 2.72, + "grad_norm": 8.59749698638916, + "learning_rate": 1.8642878620827906e-06, + "loss": 0.533, + "step": 9048 + }, + { + "epoch": 2.72, + "grad_norm": 17.311405181884766, + "learning_rate": 1.8622832514784007e-06, + "loss": 0.9795, + "step": 9049 + }, + { + "epoch": 2.72, + "grad_norm": 67.3037109375, + "learning_rate": 1.8602786408740103e-06, + "loss": 2.6467, + "step": 9050 + }, + { + "epoch": 2.72, + "grad_norm": 12.96740436553955, + "learning_rate": 1.8582740302696204e-06, + "loss": 1.1722, + "step": 9051 + }, + { + "epoch": 2.72, + "grad_norm": 16.440349578857422, + "learning_rate": 1.85626941966523e-06, + "loss": 0.6379, + "step": 9052 + }, + { + "epoch": 2.72, + "grad_norm": 35.48670196533203, + "learning_rate": 1.85426480906084e-06, + "loss": 1.4359, + "step": 9053 + }, + { + "epoch": 2.72, + "grad_norm": 55.38397216796875, + "learning_rate": 1.85226019845645e-06, + "loss": 2.2976, + "step": 9054 + }, + { + "epoch": 2.72, + "grad_norm": 20.49527931213379, + "learning_rate": 1.8502555878520597e-06, + "loss": 1.1748, + "step": 9055 + }, + { + "epoch": 2.72, + "grad_norm": 11.808248519897461, + "learning_rate": 1.8482509772476698e-06, + "loss": 1.6608, + "step": 9056 + }, + { + "epoch": 2.72, + "grad_norm": 25.69436264038086, + "learning_rate": 1.8462463666432799e-06, + "loss": 1.0124, + "step": 9057 + }, + { + "epoch": 2.72, + "grad_norm": 27.099803924560547, + "learning_rate": 1.8442417560388895e-06, + "loss": 1.4334, + "step": 9058 + }, + { + "epoch": 2.72, + "grad_norm": 16.551921844482422, + "learning_rate": 1.8422371454344994e-06, + "loss": 1.4909, + "step": 9059 + }, + { + "epoch": 2.72, + "grad_norm": 14.500848770141602, + "learning_rate": 1.8402325348301095e-06, + "loss": 1.1442, + "step": 9060 + }, + { + "epoch": 2.72, + "grad_norm": 14.154791831970215, + "learning_rate": 1.8382279242257191e-06, + "loss": 1.8214, + "step": 9061 + }, + { + "epoch": 2.72, + "grad_norm": 28.4224796295166, + "learning_rate": 1.8362233136213292e-06, + "loss": 1.2986, + "step": 9062 + }, + { + "epoch": 2.72, + "grad_norm": 14.508337020874023, + "learning_rate": 1.8342187030169393e-06, + "loss": 1.1846, + "step": 9063 + }, + { + "epoch": 2.73, + "grad_norm": 17.251667022705078, + "learning_rate": 1.832214092412549e-06, + "loss": 1.2499, + "step": 9064 + }, + { + "epoch": 2.73, + "grad_norm": 37.404449462890625, + "learning_rate": 1.8302094818081588e-06, + "loss": 0.9916, + "step": 9065 + }, + { + "epoch": 2.73, + "grad_norm": 13.81652545928955, + "learning_rate": 1.828204871203769e-06, + "loss": 1.2755, + "step": 9066 + }, + { + "epoch": 2.73, + "grad_norm": 21.725704193115234, + "learning_rate": 1.8262002605993786e-06, + "loss": 1.1399, + "step": 9067 + }, + { + "epoch": 2.73, + "grad_norm": 10.298942565917969, + "learning_rate": 1.8241956499949887e-06, + "loss": 0.7941, + "step": 9068 + }, + { + "epoch": 2.73, + "grad_norm": 25.29086685180664, + "learning_rate": 1.8221910393905988e-06, + "loss": 1.3804, + "step": 9069 + }, + { + "epoch": 2.73, + "grad_norm": 18.70489501953125, + "learning_rate": 1.8201864287862084e-06, + "loss": 1.1317, + "step": 9070 + }, + { + "epoch": 2.73, + "grad_norm": 11.406014442443848, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.4713, + "step": 9071 + }, + { + "epoch": 2.73, + "grad_norm": 22.935609817504883, + "learning_rate": 1.8161772075774282e-06, + "loss": 1.6272, + "step": 9072 + }, + { + "epoch": 2.73, + "grad_norm": 33.32746124267578, + "learning_rate": 1.814172596973038e-06, + "loss": 1.3237, + "step": 9073 + }, + { + "epoch": 2.73, + "grad_norm": 8.96728801727295, + "learning_rate": 1.8121679863686481e-06, + "loss": 0.556, + "step": 9074 + }, + { + "epoch": 2.73, + "grad_norm": 23.04669761657715, + "learning_rate": 1.8101633757642578e-06, + "loss": 1.5039, + "step": 9075 + }, + { + "epoch": 2.73, + "grad_norm": 10.999143600463867, + "learning_rate": 1.8081587651598679e-06, + "loss": 1.7855, + "step": 9076 + }, + { + "epoch": 2.73, + "grad_norm": 21.91204261779785, + "learning_rate": 1.8061541545554777e-06, + "loss": 1.4705, + "step": 9077 + }, + { + "epoch": 2.73, + "grad_norm": 31.69646644592285, + "learning_rate": 1.8041495439510876e-06, + "loss": 1.5004, + "step": 9078 + }, + { + "epoch": 2.73, + "grad_norm": 8.526796340942383, + "learning_rate": 1.8021449333466975e-06, + "loss": 0.8363, + "step": 9079 + }, + { + "epoch": 2.73, + "grad_norm": 91.82820129394531, + "learning_rate": 1.8001403227423076e-06, + "loss": 2.5966, + "step": 9080 + }, + { + "epoch": 2.73, + "grad_norm": 62.776432037353516, + "learning_rate": 1.7981357121379172e-06, + "loss": 2.0551, + "step": 9081 + }, + { + "epoch": 2.73, + "grad_norm": 7.502096652984619, + "learning_rate": 1.7961311015335273e-06, + "loss": 0.9674, + "step": 9082 + }, + { + "epoch": 2.73, + "grad_norm": 13.519929885864258, + "learning_rate": 1.7941264909291372e-06, + "loss": 0.9669, + "step": 9083 + }, + { + "epoch": 2.73, + "grad_norm": 18.463733673095703, + "learning_rate": 1.792121880324747e-06, + "loss": 1.557, + "step": 9084 + }, + { + "epoch": 2.73, + "grad_norm": 18.185571670532227, + "learning_rate": 1.790117269720357e-06, + "loss": 1.9059, + "step": 9085 + }, + { + "epoch": 2.73, + "grad_norm": 11.766922950744629, + "learning_rate": 1.788112659115967e-06, + "loss": 1.3468, + "step": 9086 + }, + { + "epoch": 2.73, + "grad_norm": 22.52054786682129, + "learning_rate": 1.7861080485115767e-06, + "loss": 0.9551, + "step": 9087 + }, + { + "epoch": 2.73, + "grad_norm": 8.950528144836426, + "learning_rate": 1.7841034379071868e-06, + "loss": 0.6039, + "step": 9088 + }, + { + "epoch": 2.73, + "grad_norm": 19.103931427001953, + "learning_rate": 1.7820988273027966e-06, + "loss": 1.0202, + "step": 9089 + }, + { + "epoch": 2.73, + "grad_norm": 14.955410957336426, + "learning_rate": 1.7800942166984065e-06, + "loss": 1.4588, + "step": 9090 + }, + { + "epoch": 2.73, + "grad_norm": 49.647361755371094, + "learning_rate": 1.7780896060940164e-06, + "loss": 3.6037, + "step": 9091 + }, + { + "epoch": 2.73, + "grad_norm": 111.04601287841797, + "learning_rate": 1.7760849954896263e-06, + "loss": 1.5603, + "step": 9092 + }, + { + "epoch": 2.73, + "grad_norm": 49.5386962890625, + "learning_rate": 1.7740803848852361e-06, + "loss": 2.5773, + "step": 9093 + }, + { + "epoch": 2.73, + "grad_norm": 12.374490737915039, + "learning_rate": 1.7720757742808462e-06, + "loss": 1.1015, + "step": 9094 + }, + { + "epoch": 2.73, + "grad_norm": 9.451532363891602, + "learning_rate": 1.7700711636764559e-06, + "loss": 0.4628, + "step": 9095 + }, + { + "epoch": 2.73, + "grad_norm": 13.187180519104004, + "learning_rate": 1.768066553072066e-06, + "loss": 0.8442, + "step": 9096 + }, + { + "epoch": 2.74, + "grad_norm": 12.320247650146484, + "learning_rate": 1.7660619424676758e-06, + "loss": 0.88, + "step": 9097 + }, + { + "epoch": 2.74, + "grad_norm": 8.388055801391602, + "learning_rate": 1.7640573318632857e-06, + "loss": 0.7241, + "step": 9098 + }, + { + "epoch": 2.74, + "grad_norm": 11.43036937713623, + "learning_rate": 1.7620527212588956e-06, + "loss": 0.8333, + "step": 9099 + }, + { + "epoch": 2.74, + "grad_norm": 8.495409965515137, + "learning_rate": 1.7600481106545057e-06, + "loss": 1.1459, + "step": 9100 + }, + { + "epoch": 2.74, + "grad_norm": 11.421164512634277, + "learning_rate": 1.7580435000501153e-06, + "loss": 1.0815, + "step": 9101 + }, + { + "epoch": 2.74, + "grad_norm": 17.927734375, + "learning_rate": 1.7560388894457254e-06, + "loss": 1.1274, + "step": 9102 + }, + { + "epoch": 2.74, + "grad_norm": 22.090795516967773, + "learning_rate": 1.7540342788413353e-06, + "loss": 0.8337, + "step": 9103 + }, + { + "epoch": 2.74, + "grad_norm": 11.036092758178711, + "learning_rate": 1.752029668236945e-06, + "loss": 1.3825, + "step": 9104 + }, + { + "epoch": 2.74, + "grad_norm": 10.002640724182129, + "learning_rate": 1.750025057632555e-06, + "loss": 1.174, + "step": 9105 + }, + { + "epoch": 2.74, + "grad_norm": 13.688326835632324, + "learning_rate": 1.748020447028165e-06, + "loss": 1.0106, + "step": 9106 + }, + { + "epoch": 2.74, + "grad_norm": 18.911972045898438, + "learning_rate": 1.7460158364237748e-06, + "loss": 1.3007, + "step": 9107 + }, + { + "epoch": 2.74, + "grad_norm": 28.944271087646484, + "learning_rate": 1.7440112258193848e-06, + "loss": 1.5335, + "step": 9108 + }, + { + "epoch": 2.74, + "grad_norm": 17.66625213623047, + "learning_rate": 1.7420066152149947e-06, + "loss": 0.7189, + "step": 9109 + }, + { + "epoch": 2.74, + "grad_norm": 28.595500946044922, + "learning_rate": 1.7400020046106044e-06, + "loss": 1.4251, + "step": 9110 + }, + { + "epoch": 2.74, + "grad_norm": 11.355489730834961, + "learning_rate": 1.7379973940062145e-06, + "loss": 0.8031, + "step": 9111 + }, + { + "epoch": 2.74, + "grad_norm": 20.97258949279785, + "learning_rate": 1.7359927834018241e-06, + "loss": 0.8911, + "step": 9112 + }, + { + "epoch": 2.74, + "grad_norm": 21.422672271728516, + "learning_rate": 1.7339881727974342e-06, + "loss": 1.5909, + "step": 9113 + }, + { + "epoch": 2.74, + "grad_norm": 16.778446197509766, + "learning_rate": 1.7319835621930443e-06, + "loss": 0.5871, + "step": 9114 + }, + { + "epoch": 2.74, + "grad_norm": 14.981059074401855, + "learning_rate": 1.729978951588654e-06, + "loss": 0.938, + "step": 9115 + }, + { + "epoch": 2.74, + "grad_norm": 17.02187156677246, + "learning_rate": 1.7279743409842638e-06, + "loss": 1.4252, + "step": 9116 + }, + { + "epoch": 2.74, + "grad_norm": 20.162086486816406, + "learning_rate": 1.725969730379874e-06, + "loss": 0.9187, + "step": 9117 + }, + { + "epoch": 2.74, + "grad_norm": 22.232948303222656, + "learning_rate": 1.7239651197754836e-06, + "loss": 1.1912, + "step": 9118 + }, + { + "epoch": 2.74, + "grad_norm": 21.19387435913086, + "learning_rate": 1.7219605091710937e-06, + "loss": 0.7006, + "step": 9119 + }, + { + "epoch": 2.74, + "grad_norm": 11.553994178771973, + "learning_rate": 1.7199558985667037e-06, + "loss": 0.7409, + "step": 9120 + }, + { + "epoch": 2.74, + "eval_loss": 0.16245336830615997, + "eval_runtime": 43.6231, + "eval_samples_per_second": 33.904, + "eval_steps_per_second": 33.904, + "step": 9120 + } + ], + "logging_steps": 1, + "max_steps": 9978, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 120, + "total_flos": 435510745758720.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}