diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,82097 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.999827482101268, + "eval_steps": 100, + "global_step": 11592, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017251789873199344, + "grad_norm": 43.25, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.642, + "step": 1 + }, + { + "epoch": 0.0003450357974639869, + "grad_norm": 406.0, + "learning_rate": 8.000000000000001e-07, + "loss": 2.9249, + "step": 2 + }, + { + "epoch": 0.0005175536961959803, + "grad_norm": 65.0, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.0206, + "step": 3 + }, + { + "epoch": 0.0006900715949279737, + "grad_norm": 55.75, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.7799, + "step": 4 + }, + { + "epoch": 0.0008625894936599673, + "grad_norm": 41.5, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.7781, + "step": 5 + }, + { + "epoch": 0.0010351073923919607, + "grad_norm": 47.0, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.8404, + "step": 6 + }, + { + "epoch": 0.001207625291123954, + "grad_norm": 49.5, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.9029, + "step": 7 + }, + { + "epoch": 0.0013801431898559475, + "grad_norm": 47.25, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.8079, + "step": 8 + }, + { + "epoch": 0.001552661088587941, + "grad_norm": 32.0, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.794, + "step": 9 + }, + { + "epoch": 0.0017251789873199345, + "grad_norm": 36.5, + "learning_rate": 4.000000000000001e-06, + "loss": 2.7301, + "step": 10 + }, + { + "epoch": 0.001897696886051928, + "grad_norm": 38.75, + "learning_rate": 4.4e-06, + "loss": 2.5871, + "step": 11 + }, + { + "epoch": 0.0020702147847839214, + "grad_norm": 44.75, + "learning_rate": 4.800000000000001e-06, + "loss": 2.7294, + "step": 12 + }, + { + "epoch": 0.0022427326835159148, + "grad_norm": 41.5, + "learning_rate": 5.2e-06, + "loss": 2.4985, + "step": 13 + }, + { + "epoch": 0.002415250582247908, + "grad_norm": 25.125, + "learning_rate": 5.600000000000001e-06, + "loss": 2.5737, + "step": 14 + }, + { + "epoch": 0.0025877684809799016, + "grad_norm": 29.25, + "learning_rate": 6e-06, + "loss": 2.6294, + "step": 15 + }, + { + "epoch": 0.002760286379711895, + "grad_norm": 22.375, + "learning_rate": 6.4000000000000006e-06, + "loss": 2.4837, + "step": 16 + }, + { + "epoch": 0.0029328042784438884, + "grad_norm": 37.25, + "learning_rate": 6.800000000000001e-06, + "loss": 2.5322, + "step": 17 + }, + { + "epoch": 0.003105322177175882, + "grad_norm": 28.125, + "learning_rate": 7.2000000000000005e-06, + "loss": 2.3859, + "step": 18 + }, + { + "epoch": 0.0032778400759078757, + "grad_norm": 16.25, + "learning_rate": 7.600000000000001e-06, + "loss": 2.3999, + "step": 19 + }, + { + "epoch": 0.003450357974639869, + "grad_norm": 19.25, + "learning_rate": 8.000000000000001e-06, + "loss": 2.418, + "step": 20 + }, + { + "epoch": 0.0036228758733718625, + "grad_norm": 15.25, + "learning_rate": 8.400000000000001e-06, + "loss": 2.3585, + "step": 21 + }, + { + "epoch": 0.003795393772103856, + "grad_norm": 12.8125, + "learning_rate": 8.8e-06, + "loss": 2.2626, + "step": 22 + }, + { + "epoch": 0.003967911670835849, + "grad_norm": 9.0, + "learning_rate": 9.200000000000002e-06, + "loss": 2.3482, + "step": 23 + }, + { + "epoch": 0.004140429569567843, + "grad_norm": 7.125, + "learning_rate": 9.600000000000001e-06, + "loss": 2.1597, + "step": 24 + }, + { + "epoch": 0.004312947468299836, + "grad_norm": 27.25, + "learning_rate": 1e-05, + "loss": 2.1392, + "step": 25 + }, + { + "epoch": 0.0044854653670318295, + "grad_norm": 12.5625, + "learning_rate": 1.04e-05, + "loss": 2.218, + "step": 26 + }, + { + "epoch": 0.004657983265763823, + "grad_norm": 32.75, + "learning_rate": 1.0800000000000002e-05, + "loss": 2.2254, + "step": 27 + }, + { + "epoch": 0.004830501164495816, + "grad_norm": 4.59375, + "learning_rate": 1.1200000000000001e-05, + "loss": 2.1076, + "step": 28 + }, + { + "epoch": 0.00500301906322781, + "grad_norm": 24.125, + "learning_rate": 1.16e-05, + "loss": 2.0355, + "step": 29 + }, + { + "epoch": 0.005175536961959803, + "grad_norm": 7.8125, + "learning_rate": 1.2e-05, + "loss": 2.1556, + "step": 30 + }, + { + "epoch": 0.005348054860691797, + "grad_norm": 8.75, + "learning_rate": 1.2400000000000002e-05, + "loss": 2.2392, + "step": 31 + }, + { + "epoch": 0.00552057275942379, + "grad_norm": 11.1875, + "learning_rate": 1.2800000000000001e-05, + "loss": 2.1823, + "step": 32 + }, + { + "epoch": 0.005693090658155784, + "grad_norm": 3.875, + "learning_rate": 1.3200000000000002e-05, + "loss": 2.0795, + "step": 33 + }, + { + "epoch": 0.005865608556887777, + "grad_norm": 20.375, + "learning_rate": 1.3600000000000002e-05, + "loss": 2.0121, + "step": 34 + }, + { + "epoch": 0.006038126455619771, + "grad_norm": 16.125, + "learning_rate": 1.4e-05, + "loss": 2.0659, + "step": 35 + }, + { + "epoch": 0.006210644354351764, + "grad_norm": 9.625, + "learning_rate": 1.4400000000000001e-05, + "loss": 2.1046, + "step": 36 + }, + { + "epoch": 0.0063831622530837575, + "grad_norm": 3.546875, + "learning_rate": 1.48e-05, + "loss": 1.968, + "step": 37 + }, + { + "epoch": 0.006555680151815751, + "grad_norm": 44.75, + "learning_rate": 1.5200000000000002e-05, + "loss": 2.0737, + "step": 38 + }, + { + "epoch": 0.006728198050547744, + "grad_norm": 5.6875, + "learning_rate": 1.5600000000000003e-05, + "loss": 2.0571, + "step": 39 + }, + { + "epoch": 0.006900715949279738, + "grad_norm": 5.40625, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0511, + "step": 40 + }, + { + "epoch": 0.007073233848011731, + "grad_norm": 2.5625, + "learning_rate": 1.64e-05, + "loss": 1.996, + "step": 41 + }, + { + "epoch": 0.007245751746743725, + "grad_norm": 5.21875, + "learning_rate": 1.6800000000000002e-05, + "loss": 2.0255, + "step": 42 + }, + { + "epoch": 0.007418269645475718, + "grad_norm": 3.453125, + "learning_rate": 1.72e-05, + "loss": 1.9345, + "step": 43 + }, + { + "epoch": 0.007590787544207712, + "grad_norm": 4.25, + "learning_rate": 1.76e-05, + "loss": 2.069, + "step": 44 + }, + { + "epoch": 0.007763305442939705, + "grad_norm": 2.5625, + "learning_rate": 1.8e-05, + "loss": 2.0591, + "step": 45 + }, + { + "epoch": 0.007935823341671698, + "grad_norm": 3.34375, + "learning_rate": 1.8400000000000003e-05, + "loss": 2.0095, + "step": 46 + }, + { + "epoch": 0.008108341240403692, + "grad_norm": 2.9375, + "learning_rate": 1.88e-05, + "loss": 1.9536, + "step": 47 + }, + { + "epoch": 0.008280859139135685, + "grad_norm": 2.140625, + "learning_rate": 1.9200000000000003e-05, + "loss": 2.0756, + "step": 48 + }, + { + "epoch": 0.00845337703786768, + "grad_norm": 2.625, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.878, + "step": 49 + }, + { + "epoch": 0.008625894936599671, + "grad_norm": 6.1875, + "learning_rate": 2e-05, + "loss": 1.8865, + "step": 50 + }, + { + "epoch": 0.008798412835331665, + "grad_norm": 1.8046875, + "learning_rate": 1.999999962956876e-05, + "loss": 1.8895, + "step": 51 + }, + { + "epoch": 0.008970930734063659, + "grad_norm": 1.8984375, + "learning_rate": 1.9999998518275062e-05, + "loss": 1.97, + "step": 52 + }, + { + "epoch": 0.009143448632795653, + "grad_norm": 1.9609375, + "learning_rate": 1.9999996666118996e-05, + "loss": 1.9397, + "step": 53 + }, + { + "epoch": 0.009315966531527647, + "grad_norm": 2.109375, + "learning_rate": 1.9999994073100687e-05, + "loss": 1.8746, + "step": 54 + }, + { + "epoch": 0.009488484430259639, + "grad_norm": 1.59375, + "learning_rate": 1.9999990739220338e-05, + "loss": 1.8565, + "step": 55 + }, + { + "epoch": 0.009661002328991633, + "grad_norm": 2.203125, + "learning_rate": 1.999998666447819e-05, + "loss": 1.8719, + "step": 56 + }, + { + "epoch": 0.009833520227723627, + "grad_norm": 1.90625, + "learning_rate": 1.999998184887455e-05, + "loss": 1.8919, + "step": 57 + }, + { + "epoch": 0.01000603812645562, + "grad_norm": 2.390625, + "learning_rate": 1.999997629240977e-05, + "loss": 1.8949, + "step": 58 + }, + { + "epoch": 0.010178556025187612, + "grad_norm": 1.546875, + "learning_rate": 1.9999969995084264e-05, + "loss": 1.8742, + "step": 59 + }, + { + "epoch": 0.010351073923919606, + "grad_norm": 1.40625, + "learning_rate": 1.99999629568985e-05, + "loss": 1.7554, + "step": 60 + }, + { + "epoch": 0.0105235918226516, + "grad_norm": 2.875, + "learning_rate": 1.9999955177852994e-05, + "loss": 1.8714, + "step": 61 + }, + { + "epoch": 0.010696109721383594, + "grad_norm": 8.6875, + "learning_rate": 1.999994665794833e-05, + "loss": 1.776, + "step": 62 + }, + { + "epoch": 0.010868627620115588, + "grad_norm": 1.328125, + "learning_rate": 1.999993739718513e-05, + "loss": 1.8771, + "step": 63 + }, + { + "epoch": 0.01104114551884758, + "grad_norm": 1.5, + "learning_rate": 1.9999927395564087e-05, + "loss": 1.7822, + "step": 64 + }, + { + "epoch": 0.011213663417579574, + "grad_norm": 1.796875, + "learning_rate": 1.999991665308594e-05, + "loss": 1.9068, + "step": 65 + }, + { + "epoch": 0.011386181316311568, + "grad_norm": 1.1953125, + "learning_rate": 1.9999905169751486e-05, + "loss": 1.999, + "step": 66 + }, + { + "epoch": 0.011558699215043561, + "grad_norm": 1.5, + "learning_rate": 1.9999892945561578e-05, + "loss": 1.8624, + "step": 67 + }, + { + "epoch": 0.011731217113775554, + "grad_norm": 2.5625, + "learning_rate": 1.999987998051711e-05, + "loss": 1.9439, + "step": 68 + }, + { + "epoch": 0.011903735012507547, + "grad_norm": 1.3359375, + "learning_rate": 1.9999866274619057e-05, + "loss": 1.8826, + "step": 69 + }, + { + "epoch": 0.012076252911239541, + "grad_norm": 1.328125, + "learning_rate": 1.9999851827868428e-05, + "loss": 1.7159, + "step": 70 + }, + { + "epoch": 0.012248770809971535, + "grad_norm": 1.3828125, + "learning_rate": 1.9999836640266292e-05, + "loss": 1.8327, + "step": 71 + }, + { + "epoch": 0.012421288708703527, + "grad_norm": 1.265625, + "learning_rate": 1.9999820711813776e-05, + "loss": 1.9363, + "step": 72 + }, + { + "epoch": 0.012593806607435521, + "grad_norm": 1.28125, + "learning_rate": 1.999980404251206e-05, + "loss": 1.8603, + "step": 73 + }, + { + "epoch": 0.012766324506167515, + "grad_norm": 1.03125, + "learning_rate": 1.999978663236238e-05, + "loss": 1.8476, + "step": 74 + }, + { + "epoch": 0.012938842404899509, + "grad_norm": 1.4453125, + "learning_rate": 1.9999768481366026e-05, + "loss": 1.786, + "step": 75 + }, + { + "epoch": 0.013111360303631503, + "grad_norm": 2.15625, + "learning_rate": 1.9999749589524338e-05, + "loss": 1.8373, + "step": 76 + }, + { + "epoch": 0.013283878202363495, + "grad_norm": 1.328125, + "learning_rate": 1.999972995683872e-05, + "loss": 1.7419, + "step": 77 + }, + { + "epoch": 0.013456396101095489, + "grad_norm": 1.7109375, + "learning_rate": 1.9999709583310624e-05, + "loss": 1.8733, + "step": 78 + }, + { + "epoch": 0.013628913999827482, + "grad_norm": 3.578125, + "learning_rate": 1.9999688468941565e-05, + "loss": 1.7432, + "step": 79 + }, + { + "epoch": 0.013801431898559476, + "grad_norm": 1.4453125, + "learning_rate": 1.9999666613733102e-05, + "loss": 1.8525, + "step": 80 + }, + { + "epoch": 0.013973949797291468, + "grad_norm": 1.1484375, + "learning_rate": 1.9999644017686855e-05, + "loss": 1.8241, + "step": 81 + }, + { + "epoch": 0.014146467696023462, + "grad_norm": 1.390625, + "learning_rate": 1.9999620680804495e-05, + "loss": 1.768, + "step": 82 + }, + { + "epoch": 0.014318985594755456, + "grad_norm": 1.3515625, + "learning_rate": 1.999959660308776e-05, + "loss": 1.8484, + "step": 83 + }, + { + "epoch": 0.01449150349348745, + "grad_norm": 0.90234375, + "learning_rate": 1.9999571784538428e-05, + "loss": 1.8453, + "step": 84 + }, + { + "epoch": 0.014664021392219442, + "grad_norm": 2.171875, + "learning_rate": 1.9999546225158335e-05, + "loss": 1.8202, + "step": 85 + }, + { + "epoch": 0.014836539290951436, + "grad_norm": 1.5078125, + "learning_rate": 1.9999519924949376e-05, + "loss": 1.8652, + "step": 86 + }, + { + "epoch": 0.01500905718968343, + "grad_norm": 0.8828125, + "learning_rate": 1.9999492883913506e-05, + "loss": 1.7482, + "step": 87 + }, + { + "epoch": 0.015181575088415424, + "grad_norm": 1.03125, + "learning_rate": 1.999946510205272e-05, + "loss": 1.7646, + "step": 88 + }, + { + "epoch": 0.015354092987147417, + "grad_norm": 1.21875, + "learning_rate": 1.999943657936908e-05, + "loss": 1.8786, + "step": 89 + }, + { + "epoch": 0.01552661088587941, + "grad_norm": 0.90625, + "learning_rate": 1.99994073158647e-05, + "loss": 1.7141, + "step": 90 + }, + { + "epoch": 0.015699128784611403, + "grad_norm": 0.97265625, + "learning_rate": 1.9999377311541748e-05, + "loss": 1.8988, + "step": 91 + }, + { + "epoch": 0.015871646683343395, + "grad_norm": 1.046875, + "learning_rate": 1.9999346566402444e-05, + "loss": 1.8017, + "step": 92 + }, + { + "epoch": 0.01604416458207539, + "grad_norm": 0.91796875, + "learning_rate": 1.999931508044907e-05, + "loss": 1.8711, + "step": 93 + }, + { + "epoch": 0.016216682480807383, + "grad_norm": 0.98046875, + "learning_rate": 1.999928285368395e-05, + "loss": 1.7997, + "step": 94 + }, + { + "epoch": 0.01638920037953938, + "grad_norm": 1.515625, + "learning_rate": 1.9999249886109485e-05, + "loss": 1.7473, + "step": 95 + }, + { + "epoch": 0.01656171827827137, + "grad_norm": 1.0703125, + "learning_rate": 1.9999216177728106e-05, + "loss": 1.9087, + "step": 96 + }, + { + "epoch": 0.016734236177003363, + "grad_norm": 1.15625, + "learning_rate": 1.9999181728542316e-05, + "loss": 1.8097, + "step": 97 + }, + { + "epoch": 0.01690675407573536, + "grad_norm": 1.0546875, + "learning_rate": 1.9999146538554663e-05, + "loss": 1.7901, + "step": 98 + }, + { + "epoch": 0.01707927197446735, + "grad_norm": 1.4609375, + "learning_rate": 1.9999110607767763e-05, + "loss": 1.8119, + "step": 99 + }, + { + "epoch": 0.017251789873199343, + "grad_norm": 0.9609375, + "learning_rate": 1.999907393618427e-05, + "loss": 1.7627, + "step": 100 + }, + { + "epoch": 0.017251789873199343, + "eval_loss": 1.7699980735778809, + "eval_runtime": 10.881, + "eval_samples_per_second": 94.109, + "eval_steps_per_second": 23.527, + "step": 100 + }, + { + "epoch": 0.01742430777193134, + "grad_norm": 0.96484375, + "learning_rate": 1.9999036523806897e-05, + "loss": 1.8264, + "step": 101 + }, + { + "epoch": 0.01759682567066333, + "grad_norm": 1.0, + "learning_rate": 1.9998998370638427e-05, + "loss": 1.9073, + "step": 102 + }, + { + "epoch": 0.017769343569395326, + "grad_norm": 0.921875, + "learning_rate": 1.9998959476681676e-05, + "loss": 1.7857, + "step": 103 + }, + { + "epoch": 0.017941861468127318, + "grad_norm": 1.1796875, + "learning_rate": 1.9998919841939536e-05, + "loss": 1.8095, + "step": 104 + }, + { + "epoch": 0.01811437936685931, + "grad_norm": 1.046875, + "learning_rate": 1.9998879466414937e-05, + "loss": 1.7011, + "step": 105 + }, + { + "epoch": 0.018286897265591306, + "grad_norm": 0.98046875, + "learning_rate": 1.9998838350110867e-05, + "loss": 1.7574, + "step": 106 + }, + { + "epoch": 0.018459415164323298, + "grad_norm": 1.375, + "learning_rate": 1.9998796493030382e-05, + "loss": 1.7882, + "step": 107 + }, + { + "epoch": 0.018631933063055293, + "grad_norm": 6.75, + "learning_rate": 1.9998753895176576e-05, + "loss": 1.7849, + "step": 108 + }, + { + "epoch": 0.018804450961787286, + "grad_norm": 0.921875, + "learning_rate": 1.9998710556552603e-05, + "loss": 1.7914, + "step": 109 + }, + { + "epoch": 0.018976968860519278, + "grad_norm": 0.85546875, + "learning_rate": 1.9998666477161678e-05, + "loss": 1.7382, + "step": 110 + }, + { + "epoch": 0.019149486759251273, + "grad_norm": 0.74609375, + "learning_rate": 1.9998621657007068e-05, + "loss": 1.7785, + "step": 111 + }, + { + "epoch": 0.019322004657983265, + "grad_norm": 0.7734375, + "learning_rate": 1.9998576096092093e-05, + "loss": 1.7812, + "step": 112 + }, + { + "epoch": 0.01949452255671526, + "grad_norm": 0.9140625, + "learning_rate": 1.9998529794420124e-05, + "loss": 1.7871, + "step": 113 + }, + { + "epoch": 0.019667040455447253, + "grad_norm": 1.125, + "learning_rate": 1.9998482751994596e-05, + "loss": 1.6526, + "step": 114 + }, + { + "epoch": 0.019839558354179245, + "grad_norm": 1.0234375, + "learning_rate": 1.999843496881899e-05, + "loss": 1.7308, + "step": 115 + }, + { + "epoch": 0.02001207625291124, + "grad_norm": 0.8359375, + "learning_rate": 1.999838644489685e-05, + "loss": 1.7452, + "step": 116 + }, + { + "epoch": 0.020184594151643233, + "grad_norm": 1.140625, + "learning_rate": 1.9998337180231768e-05, + "loss": 1.7223, + "step": 117 + }, + { + "epoch": 0.020357112050375225, + "grad_norm": 1.21875, + "learning_rate": 1.9998287174827396e-05, + "loss": 1.7875, + "step": 118 + }, + { + "epoch": 0.02052962994910722, + "grad_norm": 0.8359375, + "learning_rate": 1.999823642868744e-05, + "loss": 1.7478, + "step": 119 + }, + { + "epoch": 0.020702147847839213, + "grad_norm": 1.1015625, + "learning_rate": 1.9998184941815653e-05, + "loss": 1.7594, + "step": 120 + }, + { + "epoch": 0.020874665746571208, + "grad_norm": 0.81640625, + "learning_rate": 1.9998132714215855e-05, + "loss": 1.8166, + "step": 121 + }, + { + "epoch": 0.0210471836453032, + "grad_norm": 1.078125, + "learning_rate": 1.9998079745891918e-05, + "loss": 1.7075, + "step": 122 + }, + { + "epoch": 0.021219701544035192, + "grad_norm": 1.109375, + "learning_rate": 1.999802603684776e-05, + "loss": 1.8377, + "step": 123 + }, + { + "epoch": 0.021392219442767188, + "grad_norm": 1.1171875, + "learning_rate": 1.999797158708736e-05, + "loss": 1.7289, + "step": 124 + }, + { + "epoch": 0.02156473734149918, + "grad_norm": 0.89453125, + "learning_rate": 1.999791639661476e-05, + "loss": 1.8432, + "step": 125 + }, + { + "epoch": 0.021737255240231176, + "grad_norm": 1.0234375, + "learning_rate": 1.999786046543404e-05, + "loss": 1.7785, + "step": 126 + }, + { + "epoch": 0.021909773138963168, + "grad_norm": 0.79296875, + "learning_rate": 1.9997803793549347e-05, + "loss": 1.7906, + "step": 127 + }, + { + "epoch": 0.02208229103769516, + "grad_norm": 0.82421875, + "learning_rate": 1.999774638096488e-05, + "loss": 1.837, + "step": 128 + }, + { + "epoch": 0.022254808936427155, + "grad_norm": 1.109375, + "learning_rate": 1.9997688227684896e-05, + "loss": 1.754, + "step": 129 + }, + { + "epoch": 0.022427326835159148, + "grad_norm": 0.89453125, + "learning_rate": 1.9997629333713697e-05, + "loss": 1.8045, + "step": 130 + }, + { + "epoch": 0.02259984473389114, + "grad_norm": 0.7578125, + "learning_rate": 1.999756969905565e-05, + "loss": 1.7617, + "step": 131 + }, + { + "epoch": 0.022772362632623135, + "grad_norm": 0.84375, + "learning_rate": 1.9997509323715166e-05, + "loss": 1.8347, + "step": 132 + }, + { + "epoch": 0.022944880531355127, + "grad_norm": 1.171875, + "learning_rate": 1.999744820769673e-05, + "loss": 1.7765, + "step": 133 + }, + { + "epoch": 0.023117398430087123, + "grad_norm": 0.76171875, + "learning_rate": 1.9997386351004864e-05, + "loss": 1.8108, + "step": 134 + }, + { + "epoch": 0.023289916328819115, + "grad_norm": 0.84765625, + "learning_rate": 1.9997323753644148e-05, + "loss": 1.7178, + "step": 135 + }, + { + "epoch": 0.023462434227551107, + "grad_norm": 0.73046875, + "learning_rate": 1.9997260415619223e-05, + "loss": 1.7446, + "step": 136 + }, + { + "epoch": 0.023634952126283103, + "grad_norm": 1.2421875, + "learning_rate": 1.999719633693478e-05, + "loss": 1.8163, + "step": 137 + }, + { + "epoch": 0.023807470025015095, + "grad_norm": 0.8984375, + "learning_rate": 1.999713151759557e-05, + "loss": 1.7499, + "step": 138 + }, + { + "epoch": 0.02397998792374709, + "grad_norm": 1.1796875, + "learning_rate": 1.999706595760639e-05, + "loss": 1.8029, + "step": 139 + }, + { + "epoch": 0.024152505822479083, + "grad_norm": 0.95703125, + "learning_rate": 1.99969996569721e-05, + "loss": 1.6801, + "step": 140 + }, + { + "epoch": 0.024325023721211075, + "grad_norm": 0.84765625, + "learning_rate": 1.999693261569761e-05, + "loss": 1.7381, + "step": 141 + }, + { + "epoch": 0.02449754161994307, + "grad_norm": 1.046875, + "learning_rate": 1.999686483378789e-05, + "loss": 1.828, + "step": 142 + }, + { + "epoch": 0.024670059518675062, + "grad_norm": 1.34375, + "learning_rate": 1.9996796311247956e-05, + "loss": 1.7018, + "step": 143 + }, + { + "epoch": 0.024842577417407054, + "grad_norm": 0.71875, + "learning_rate": 1.999672704808289e-05, + "loss": 1.8166, + "step": 144 + }, + { + "epoch": 0.02501509531613905, + "grad_norm": 1.0078125, + "learning_rate": 1.9996657044297824e-05, + "loss": 1.7758, + "step": 145 + }, + { + "epoch": 0.025187613214871042, + "grad_norm": 1.0625, + "learning_rate": 1.9996586299897944e-05, + "loss": 1.8275, + "step": 146 + }, + { + "epoch": 0.025360131113603038, + "grad_norm": 0.765625, + "learning_rate": 1.9996514814888483e-05, + "loss": 1.7491, + "step": 147 + }, + { + "epoch": 0.02553264901233503, + "grad_norm": 1.125, + "learning_rate": 1.999644258927475e-05, + "loss": 1.8093, + "step": 148 + }, + { + "epoch": 0.025705166911067022, + "grad_norm": 0.8671875, + "learning_rate": 1.9996369623062083e-05, + "loss": 1.7722, + "step": 149 + }, + { + "epoch": 0.025877684809799018, + "grad_norm": 0.80859375, + "learning_rate": 1.9996295916255898e-05, + "loss": 1.6843, + "step": 150 + }, + { + "epoch": 0.02605020270853101, + "grad_norm": 0.890625, + "learning_rate": 1.999622146886165e-05, + "loss": 1.6833, + "step": 151 + }, + { + "epoch": 0.026222720607263005, + "grad_norm": 0.84375, + "learning_rate": 1.999614628088486e-05, + "loss": 1.779, + "step": 152 + }, + { + "epoch": 0.026395238505994997, + "grad_norm": 0.7734375, + "learning_rate": 1.999607035233109e-05, + "loss": 1.7611, + "step": 153 + }, + { + "epoch": 0.02656775640472699, + "grad_norm": 0.78515625, + "learning_rate": 1.999599368320597e-05, + "loss": 1.6703, + "step": 154 + }, + { + "epoch": 0.026740274303458985, + "grad_norm": 0.734375, + "learning_rate": 1.9995916273515185e-05, + "loss": 1.731, + "step": 155 + }, + { + "epoch": 0.026912792202190977, + "grad_norm": 0.7734375, + "learning_rate": 1.9995838123264463e-05, + "loss": 1.746, + "step": 156 + }, + { + "epoch": 0.02708531010092297, + "grad_norm": 0.875, + "learning_rate": 1.9995759232459594e-05, + "loss": 1.6949, + "step": 157 + }, + { + "epoch": 0.027257827999654965, + "grad_norm": 0.81640625, + "learning_rate": 1.9995679601106426e-05, + "loss": 1.7415, + "step": 158 + }, + { + "epoch": 0.027430345898386957, + "grad_norm": 1.1953125, + "learning_rate": 1.9995599229210855e-05, + "loss": 1.6621, + "step": 159 + }, + { + "epoch": 0.027602863797118952, + "grad_norm": 1.203125, + "learning_rate": 1.999551811677884e-05, + "loss": 1.6831, + "step": 160 + }, + { + "epoch": 0.027775381695850945, + "grad_norm": 2.03125, + "learning_rate": 1.9995436263816387e-05, + "loss": 1.7927, + "step": 161 + }, + { + "epoch": 0.027947899594582937, + "grad_norm": 1.0859375, + "learning_rate": 1.9995353670329565e-05, + "loss": 1.8223, + "step": 162 + }, + { + "epoch": 0.028120417493314932, + "grad_norm": 0.890625, + "learning_rate": 1.9995270336324487e-05, + "loss": 1.7593, + "step": 163 + }, + { + "epoch": 0.028292935392046924, + "grad_norm": 0.984375, + "learning_rate": 1.9995186261807326e-05, + "loss": 1.7412, + "step": 164 + }, + { + "epoch": 0.02846545329077892, + "grad_norm": 1.1171875, + "learning_rate": 1.9995101446784317e-05, + "loss": 1.7231, + "step": 165 + }, + { + "epoch": 0.028637971189510912, + "grad_norm": 1.1015625, + "learning_rate": 1.999501589126174e-05, + "loss": 1.6749, + "step": 166 + }, + { + "epoch": 0.028810489088242904, + "grad_norm": 0.90625, + "learning_rate": 1.9994929595245932e-05, + "loss": 1.7105, + "step": 167 + }, + { + "epoch": 0.0289830069869749, + "grad_norm": 1.140625, + "learning_rate": 1.9994842558743293e-05, + "loss": 1.7118, + "step": 168 + }, + { + "epoch": 0.029155524885706892, + "grad_norm": 1.359375, + "learning_rate": 1.9994754781760264e-05, + "loss": 1.6403, + "step": 169 + }, + { + "epoch": 0.029328042784438884, + "grad_norm": 1.0234375, + "learning_rate": 1.9994666264303347e-05, + "loss": 1.7117, + "step": 170 + }, + { + "epoch": 0.02950056068317088, + "grad_norm": 1.09375, + "learning_rate": 1.999457700637911e-05, + "loss": 1.6726, + "step": 171 + }, + { + "epoch": 0.02967307858190287, + "grad_norm": 0.703125, + "learning_rate": 1.9994487007994156e-05, + "loss": 1.7028, + "step": 172 + }, + { + "epoch": 0.029845596480634867, + "grad_norm": 0.90625, + "learning_rate": 1.9994396269155153e-05, + "loss": 1.7862, + "step": 173 + }, + { + "epoch": 0.03001811437936686, + "grad_norm": 1.0625, + "learning_rate": 1.9994304789868832e-05, + "loss": 1.7097, + "step": 174 + }, + { + "epoch": 0.03019063227809885, + "grad_norm": 1.6953125, + "learning_rate": 1.999421257014196e-05, + "loss": 1.7168, + "step": 175 + }, + { + "epoch": 0.030363150176830847, + "grad_norm": 0.8359375, + "learning_rate": 1.999411960998138e-05, + "loss": 1.8136, + "step": 176 + }, + { + "epoch": 0.03053566807556284, + "grad_norm": 0.87109375, + "learning_rate": 1.999402590939397e-05, + "loss": 1.6356, + "step": 177 + }, + { + "epoch": 0.030708185974294835, + "grad_norm": 0.8046875, + "learning_rate": 1.9993931468386675e-05, + "loss": 1.7556, + "step": 178 + }, + { + "epoch": 0.030880703873026827, + "grad_norm": 0.984375, + "learning_rate": 1.9993836286966492e-05, + "loss": 1.7139, + "step": 179 + }, + { + "epoch": 0.03105322177175882, + "grad_norm": 0.8984375, + "learning_rate": 1.999374036514047e-05, + "loss": 1.7707, + "step": 180 + }, + { + "epoch": 0.031225739670490815, + "grad_norm": 0.78125, + "learning_rate": 1.999364370291572e-05, + "loss": 1.6573, + "step": 181 + }, + { + "epoch": 0.03139825756922281, + "grad_norm": 0.82421875, + "learning_rate": 1.9993546300299404e-05, + "loss": 1.8058, + "step": 182 + }, + { + "epoch": 0.0315707754679548, + "grad_norm": 0.734375, + "learning_rate": 1.9993448157298733e-05, + "loss": 1.7508, + "step": 183 + }, + { + "epoch": 0.03174329336668679, + "grad_norm": 0.7578125, + "learning_rate": 1.999334927392098e-05, + "loss": 1.7117, + "step": 184 + }, + { + "epoch": 0.031915811265418786, + "grad_norm": 3.78125, + "learning_rate": 1.9993249650173475e-05, + "loss": 1.6337, + "step": 185 + }, + { + "epoch": 0.03208832916415078, + "grad_norm": 0.85546875, + "learning_rate": 1.999314928606359e-05, + "loss": 1.7379, + "step": 186 + }, + { + "epoch": 0.03226084706288277, + "grad_norm": 0.734375, + "learning_rate": 1.999304818159877e-05, + "loss": 1.7688, + "step": 187 + }, + { + "epoch": 0.032433364961614766, + "grad_norm": 0.73828125, + "learning_rate": 1.9992946336786502e-05, + "loss": 1.6814, + "step": 188 + }, + { + "epoch": 0.03260588286034676, + "grad_norm": 0.75, + "learning_rate": 1.9992843751634327e-05, + "loss": 1.6128, + "step": 189 + }, + { + "epoch": 0.03277840075907876, + "grad_norm": 0.70703125, + "learning_rate": 1.9992740426149847e-05, + "loss": 1.7347, + "step": 190 + }, + { + "epoch": 0.032950918657810746, + "grad_norm": 0.703125, + "learning_rate": 1.9992636360340722e-05, + "loss": 1.8182, + "step": 191 + }, + { + "epoch": 0.03312343655654274, + "grad_norm": 0.73046875, + "learning_rate": 1.999253155421466e-05, + "loss": 1.6953, + "step": 192 + }, + { + "epoch": 0.03329595445527474, + "grad_norm": 0.7265625, + "learning_rate": 1.999242600777942e-05, + "loss": 1.7346, + "step": 193 + }, + { + "epoch": 0.033468472354006726, + "grad_norm": 0.6875, + "learning_rate": 1.999231972104283e-05, + "loss": 1.7494, + "step": 194 + }, + { + "epoch": 0.03364099025273872, + "grad_norm": 0.73828125, + "learning_rate": 1.9992212694012757e-05, + "loss": 1.7586, + "step": 195 + }, + { + "epoch": 0.03381350815147072, + "grad_norm": 1.1015625, + "learning_rate": 1.9992104926697137e-05, + "loss": 1.7297, + "step": 196 + }, + { + "epoch": 0.033986026050202706, + "grad_norm": 0.7421875, + "learning_rate": 1.9991996419103947e-05, + "loss": 1.6872, + "step": 197 + }, + { + "epoch": 0.0341585439489347, + "grad_norm": 0.7265625, + "learning_rate": 1.999188717124123e-05, + "loss": 1.7465, + "step": 198 + }, + { + "epoch": 0.0343310618476667, + "grad_norm": 0.765625, + "learning_rate": 1.999177718311708e-05, + "loss": 1.7722, + "step": 199 + }, + { + "epoch": 0.034503579746398685, + "grad_norm": 0.78515625, + "learning_rate": 1.9991666454739644e-05, + "loss": 1.696, + "step": 200 + }, + { + "epoch": 0.034503579746398685, + "eval_loss": 1.701870322227478, + "eval_runtime": 11.0727, + "eval_samples_per_second": 92.48, + "eval_steps_per_second": 23.12, + "step": 200 + }, + { + "epoch": 0.03467609764513068, + "grad_norm": 0.671875, + "learning_rate": 1.9991554986117127e-05, + "loss": 1.6589, + "step": 201 + }, + { + "epoch": 0.03484861554386268, + "grad_norm": 1.0234375, + "learning_rate": 1.9991442777257786e-05, + "loss": 1.7258, + "step": 202 + }, + { + "epoch": 0.03502113344259467, + "grad_norm": 0.83984375, + "learning_rate": 1.9991329828169936e-05, + "loss": 1.75, + "step": 203 + }, + { + "epoch": 0.03519365134132666, + "grad_norm": 0.72265625, + "learning_rate": 1.999121613886194e-05, + "loss": 1.5888, + "step": 204 + }, + { + "epoch": 0.035366169240058656, + "grad_norm": 0.8984375, + "learning_rate": 1.9991101709342228e-05, + "loss": 1.6331, + "step": 205 + }, + { + "epoch": 0.03553868713879065, + "grad_norm": 0.83203125, + "learning_rate": 1.9990986539619274e-05, + "loss": 1.7844, + "step": 206 + }, + { + "epoch": 0.03571120503752264, + "grad_norm": 0.98046875, + "learning_rate": 1.999087062970161e-05, + "loss": 1.68, + "step": 207 + }, + { + "epoch": 0.035883722936254636, + "grad_norm": 0.8984375, + "learning_rate": 1.999075397959782e-05, + "loss": 1.7034, + "step": 208 + }, + { + "epoch": 0.03605624083498663, + "grad_norm": 0.703125, + "learning_rate": 1.9990636589316556e-05, + "loss": 1.7639, + "step": 209 + }, + { + "epoch": 0.03622875873371862, + "grad_norm": 1.0703125, + "learning_rate": 1.9990518458866506e-05, + "loss": 1.687, + "step": 210 + }, + { + "epoch": 0.036401276632450616, + "grad_norm": 1.0546875, + "learning_rate": 1.9990399588256425e-05, + "loss": 1.6867, + "step": 211 + }, + { + "epoch": 0.03657379453118261, + "grad_norm": 1.1328125, + "learning_rate": 1.999027997749512e-05, + "loss": 1.7076, + "step": 212 + }, + { + "epoch": 0.0367463124299146, + "grad_norm": 0.8671875, + "learning_rate": 1.9990159626591455e-05, + "loss": 1.6697, + "step": 213 + }, + { + "epoch": 0.036918830328646596, + "grad_norm": 1.109375, + "learning_rate": 1.999003853555434e-05, + "loss": 1.7503, + "step": 214 + }, + { + "epoch": 0.03709134822737859, + "grad_norm": 0.73046875, + "learning_rate": 1.9989916704392748e-05, + "loss": 1.7105, + "step": 215 + }, + { + "epoch": 0.03726386612611059, + "grad_norm": 1.09375, + "learning_rate": 1.998979413311571e-05, + "loss": 1.7114, + "step": 216 + }, + { + "epoch": 0.037436384024842576, + "grad_norm": 0.8359375, + "learning_rate": 1.99896708217323e-05, + "loss": 1.6787, + "step": 217 + }, + { + "epoch": 0.03760890192357457, + "grad_norm": 0.97265625, + "learning_rate": 1.9989546770251662e-05, + "loss": 1.714, + "step": 218 + }, + { + "epoch": 0.03778141982230657, + "grad_norm": 0.8671875, + "learning_rate": 1.9989421978682978e-05, + "loss": 1.7059, + "step": 219 + }, + { + "epoch": 0.037953937721038555, + "grad_norm": 0.7109375, + "learning_rate": 1.99892964470355e-05, + "loss": 1.702, + "step": 220 + }, + { + "epoch": 0.03812645561977055, + "grad_norm": 0.73046875, + "learning_rate": 1.9989170175318523e-05, + "loss": 1.6866, + "step": 221 + }, + { + "epoch": 0.038298973518502546, + "grad_norm": 0.7890625, + "learning_rate": 1.9989043163541403e-05, + "loss": 1.7692, + "step": 222 + }, + { + "epoch": 0.038471491417234535, + "grad_norm": 0.72265625, + "learning_rate": 1.9988915411713553e-05, + "loss": 1.7088, + "step": 223 + }, + { + "epoch": 0.03864400931596653, + "grad_norm": 0.75390625, + "learning_rate": 1.9988786919844437e-05, + "loss": 1.6951, + "step": 224 + }, + { + "epoch": 0.038816527214698526, + "grad_norm": 0.890625, + "learning_rate": 1.998865768794357e-05, + "loss": 1.6612, + "step": 225 + }, + { + "epoch": 0.03898904511343052, + "grad_norm": 0.84765625, + "learning_rate": 1.9988527716020532e-05, + "loss": 1.6171, + "step": 226 + }, + { + "epoch": 0.03916156301216251, + "grad_norm": 0.7890625, + "learning_rate": 1.998839700408495e-05, + "loss": 1.7984, + "step": 227 + }, + { + "epoch": 0.039334080910894506, + "grad_norm": 1.7890625, + "learning_rate": 1.9988265552146508e-05, + "loss": 1.6983, + "step": 228 + }, + { + "epoch": 0.0395065988096265, + "grad_norm": 1.59375, + "learning_rate": 1.998813336021494e-05, + "loss": 1.6833, + "step": 229 + }, + { + "epoch": 0.03967911670835849, + "grad_norm": 1.3828125, + "learning_rate": 1.9988000428300047e-05, + "loss": 1.6599, + "step": 230 + }, + { + "epoch": 0.039851634607090486, + "grad_norm": 0.69921875, + "learning_rate": 1.9987866756411676e-05, + "loss": 1.7222, + "step": 231 + }, + { + "epoch": 0.04002415250582248, + "grad_norm": 0.80078125, + "learning_rate": 1.9987732344559724e-05, + "loss": 1.7105, + "step": 232 + }, + { + "epoch": 0.04019667040455447, + "grad_norm": 0.93359375, + "learning_rate": 1.9987597192754155e-05, + "loss": 1.671, + "step": 233 + }, + { + "epoch": 0.040369188303286466, + "grad_norm": 0.80859375, + "learning_rate": 1.9987461301004984e-05, + "loss": 1.6531, + "step": 234 + }, + { + "epoch": 0.04054170620201846, + "grad_norm": 1.015625, + "learning_rate": 1.9987324669322274e-05, + "loss": 1.7921, + "step": 235 + }, + { + "epoch": 0.04071422410075045, + "grad_norm": 0.91796875, + "learning_rate": 1.9987187297716145e-05, + "loss": 1.8698, + "step": 236 + }, + { + "epoch": 0.040886741999482445, + "grad_norm": 0.7890625, + "learning_rate": 1.9987049186196782e-05, + "loss": 1.7407, + "step": 237 + }, + { + "epoch": 0.04105925989821444, + "grad_norm": 0.80859375, + "learning_rate": 1.9986910334774415e-05, + "loss": 1.6458, + "step": 238 + }, + { + "epoch": 0.04123177779694644, + "grad_norm": 0.7734375, + "learning_rate": 1.9986770743459325e-05, + "loss": 1.6818, + "step": 239 + }, + { + "epoch": 0.041404295695678425, + "grad_norm": 1.0703125, + "learning_rate": 1.9986630412261857e-05, + "loss": 1.6233, + "step": 240 + }, + { + "epoch": 0.04157681359441042, + "grad_norm": 0.75390625, + "learning_rate": 1.9986489341192416e-05, + "loss": 1.649, + "step": 241 + }, + { + "epoch": 0.041749331493142416, + "grad_norm": 0.76953125, + "learning_rate": 1.998634753026144e-05, + "loss": 1.7436, + "step": 242 + }, + { + "epoch": 0.041921849391874405, + "grad_norm": 0.74609375, + "learning_rate": 1.9986204979479443e-05, + "loss": 1.6854, + "step": 243 + }, + { + "epoch": 0.0420943672906064, + "grad_norm": 0.8359375, + "learning_rate": 1.9986061688856983e-05, + "loss": 1.6537, + "step": 244 + }, + { + "epoch": 0.042266885189338396, + "grad_norm": 0.71484375, + "learning_rate": 1.9985917658404677e-05, + "loss": 1.767, + "step": 245 + }, + { + "epoch": 0.042439403088070385, + "grad_norm": 1.0390625, + "learning_rate": 1.99857728881332e-05, + "loss": 1.68, + "step": 246 + }, + { + "epoch": 0.04261192098680238, + "grad_norm": 0.73046875, + "learning_rate": 1.998562737805327e-05, + "loss": 1.6439, + "step": 247 + }, + { + "epoch": 0.042784438885534376, + "grad_norm": 0.828125, + "learning_rate": 1.9985481128175673e-05, + "loss": 1.8946, + "step": 248 + }, + { + "epoch": 0.042956956784266365, + "grad_norm": 0.96875, + "learning_rate": 1.998533413851124e-05, + "loss": 1.6761, + "step": 249 + }, + { + "epoch": 0.04312947468299836, + "grad_norm": 0.734375, + "learning_rate": 1.9985186409070863e-05, + "loss": 1.6496, + "step": 250 + }, + { + "epoch": 0.043301992581730356, + "grad_norm": 0.86328125, + "learning_rate": 1.998503793986549e-05, + "loss": 1.7303, + "step": 251 + }, + { + "epoch": 0.04347451048046235, + "grad_norm": 0.796875, + "learning_rate": 1.9984888730906112e-05, + "loss": 1.7383, + "step": 252 + }, + { + "epoch": 0.04364702837919434, + "grad_norm": 0.71484375, + "learning_rate": 1.998473878220379e-05, + "loss": 1.5984, + "step": 253 + }, + { + "epoch": 0.043819546277926336, + "grad_norm": 1.2890625, + "learning_rate": 1.9984588093769633e-05, + "loss": 1.6765, + "step": 254 + }, + { + "epoch": 0.04399206417665833, + "grad_norm": 1.0078125, + "learning_rate": 1.99844366656148e-05, + "loss": 1.7208, + "step": 255 + }, + { + "epoch": 0.04416458207539032, + "grad_norm": 0.91015625, + "learning_rate": 1.9984284497750516e-05, + "loss": 1.7428, + "step": 256 + }, + { + "epoch": 0.044337099974122315, + "grad_norm": 0.77734375, + "learning_rate": 1.998413159018805e-05, + "loss": 1.7136, + "step": 257 + }, + { + "epoch": 0.04450961787285431, + "grad_norm": 0.82421875, + "learning_rate": 1.9983977942938735e-05, + "loss": 1.7301, + "step": 258 + }, + { + "epoch": 0.0446821357715863, + "grad_norm": 0.703125, + "learning_rate": 1.998382355601395e-05, + "loss": 1.5718, + "step": 259 + }, + { + "epoch": 0.044854653670318295, + "grad_norm": 0.796875, + "learning_rate": 1.998366842942513e-05, + "loss": 1.6996, + "step": 260 + }, + { + "epoch": 0.04502717156905029, + "grad_norm": 0.76171875, + "learning_rate": 1.998351256318378e-05, + "loss": 1.6608, + "step": 261 + }, + { + "epoch": 0.04519968946778228, + "grad_norm": 0.6640625, + "learning_rate": 1.998335595730143e-05, + "loss": 1.6553, + "step": 262 + }, + { + "epoch": 0.045372207366514275, + "grad_norm": 0.84375, + "learning_rate": 1.99831986117897e-05, + "loss": 1.6318, + "step": 263 + }, + { + "epoch": 0.04554472526524627, + "grad_norm": 0.7421875, + "learning_rate": 1.9983040526660236e-05, + "loss": 1.6407, + "step": 264 + }, + { + "epoch": 0.045717243163978266, + "grad_norm": 0.7734375, + "learning_rate": 1.998288170192475e-05, + "loss": 1.6473, + "step": 265 + }, + { + "epoch": 0.045889761062710255, + "grad_norm": 1.1875, + "learning_rate": 1.9982722137595015e-05, + "loss": 1.7061, + "step": 266 + }, + { + "epoch": 0.04606227896144225, + "grad_norm": 0.66796875, + "learning_rate": 1.9982561833682848e-05, + "loss": 1.6742, + "step": 267 + }, + { + "epoch": 0.046234796860174246, + "grad_norm": 0.69921875, + "learning_rate": 1.9982400790200127e-05, + "loss": 1.6009, + "step": 268 + }, + { + "epoch": 0.046407314758906235, + "grad_norm": 0.86328125, + "learning_rate": 1.9982239007158784e-05, + "loss": 1.6577, + "step": 269 + }, + { + "epoch": 0.04657983265763823, + "grad_norm": 1.03125, + "learning_rate": 1.99820764845708e-05, + "loss": 1.7364, + "step": 270 + }, + { + "epoch": 0.046752350556370226, + "grad_norm": 0.87109375, + "learning_rate": 1.998191322244822e-05, + "loss": 1.6277, + "step": 271 + }, + { + "epoch": 0.046924868455102214, + "grad_norm": 0.85546875, + "learning_rate": 1.9981749220803142e-05, + "loss": 1.6939, + "step": 272 + }, + { + "epoch": 0.04709738635383421, + "grad_norm": 0.703125, + "learning_rate": 1.998158447964771e-05, + "loss": 1.7526, + "step": 273 + }, + { + "epoch": 0.047269904252566206, + "grad_norm": 0.7265625, + "learning_rate": 1.998141899899413e-05, + "loss": 1.6889, + "step": 274 + }, + { + "epoch": 0.047442422151298194, + "grad_norm": 0.73046875, + "learning_rate": 1.998125277885467e-05, + "loss": 1.6421, + "step": 275 + }, + { + "epoch": 0.04761494005003019, + "grad_norm": 1.1796875, + "learning_rate": 1.998108581924163e-05, + "loss": 1.6843, + "step": 276 + }, + { + "epoch": 0.047787457948762185, + "grad_norm": 0.8671875, + "learning_rate": 1.9980918120167395e-05, + "loss": 1.6682, + "step": 277 + }, + { + "epoch": 0.04795997584749418, + "grad_norm": 0.8671875, + "learning_rate": 1.9980749681644378e-05, + "loss": 1.7816, + "step": 278 + }, + { + "epoch": 0.04813249374622617, + "grad_norm": 0.8125, + "learning_rate": 1.9980580503685064e-05, + "loss": 1.6331, + "step": 279 + }, + { + "epoch": 0.048305011644958165, + "grad_norm": 0.71875, + "learning_rate": 1.9980410586301983e-05, + "loss": 1.7004, + "step": 280 + }, + { + "epoch": 0.04847752954369016, + "grad_norm": 0.9609375, + "learning_rate": 1.9980239929507724e-05, + "loss": 1.7132, + "step": 281 + }, + { + "epoch": 0.04865004744242215, + "grad_norm": 3.21875, + "learning_rate": 1.9980068533314937e-05, + "loss": 1.6185, + "step": 282 + }, + { + "epoch": 0.048822565341154145, + "grad_norm": 0.82421875, + "learning_rate": 1.9979896397736308e-05, + "loss": 1.6983, + "step": 283 + }, + { + "epoch": 0.04899508323988614, + "grad_norm": 0.78515625, + "learning_rate": 1.9979723522784602e-05, + "loss": 1.6439, + "step": 284 + }, + { + "epoch": 0.04916760113861813, + "grad_norm": 0.9453125, + "learning_rate": 1.997954990847262e-05, + "loss": 1.66, + "step": 285 + }, + { + "epoch": 0.049340119037350125, + "grad_norm": 0.72265625, + "learning_rate": 1.9979375554813223e-05, + "loss": 1.704, + "step": 286 + }, + { + "epoch": 0.04951263693608212, + "grad_norm": 0.765625, + "learning_rate": 1.9979200461819334e-05, + "loss": 1.7866, + "step": 287 + }, + { + "epoch": 0.04968515483481411, + "grad_norm": 0.92578125, + "learning_rate": 1.997902462950392e-05, + "loss": 1.6967, + "step": 288 + }, + { + "epoch": 0.049857672733546105, + "grad_norm": 0.71484375, + "learning_rate": 1.9978848057880008e-05, + "loss": 1.6726, + "step": 289 + }, + { + "epoch": 0.0500301906322781, + "grad_norm": 0.9296875, + "learning_rate": 1.9978670746960687e-05, + "loss": 1.7372, + "step": 290 + }, + { + "epoch": 0.050202708531010096, + "grad_norm": 0.921875, + "learning_rate": 1.9978492696759084e-05, + "loss": 1.714, + "step": 291 + }, + { + "epoch": 0.050375226429742084, + "grad_norm": 0.98046875, + "learning_rate": 1.9978313907288395e-05, + "loss": 1.693, + "step": 292 + }, + { + "epoch": 0.05054774432847408, + "grad_norm": 0.80859375, + "learning_rate": 1.997813437856186e-05, + "loss": 1.6223, + "step": 293 + }, + { + "epoch": 0.050720262227206075, + "grad_norm": 0.8046875, + "learning_rate": 1.9977954110592787e-05, + "loss": 1.7357, + "step": 294 + }, + { + "epoch": 0.050892780125938064, + "grad_norm": 0.890625, + "learning_rate": 1.9977773103394527e-05, + "loss": 1.6369, + "step": 295 + }, + { + "epoch": 0.05106529802467006, + "grad_norm": 0.7109375, + "learning_rate": 1.9977591356980493e-05, + "loss": 1.7504, + "step": 296 + }, + { + "epoch": 0.051237815923402055, + "grad_norm": 0.74609375, + "learning_rate": 1.997740887136415e-05, + "loss": 1.7014, + "step": 297 + }, + { + "epoch": 0.051410333822134044, + "grad_norm": 0.71875, + "learning_rate": 1.9977225646559013e-05, + "loss": 1.6612, + "step": 298 + }, + { + "epoch": 0.05158285172086604, + "grad_norm": 0.75, + "learning_rate": 1.9977041682578662e-05, + "loss": 1.773, + "step": 299 + }, + { + "epoch": 0.051755369619598035, + "grad_norm": 0.8984375, + "learning_rate": 1.997685697943672e-05, + "loss": 1.6947, + "step": 300 + }, + { + "epoch": 0.051755369619598035, + "eval_loss": 1.6562981605529785, + "eval_runtime": 10.8538, + "eval_samples_per_second": 94.345, + "eval_steps_per_second": 23.586, + "step": 300 + }, + { + "epoch": 0.051927887518330024, + "grad_norm": 0.71875, + "learning_rate": 1.9976671537146877e-05, + "loss": 1.7933, + "step": 301 + }, + { + "epoch": 0.05210040541706202, + "grad_norm": 0.74609375, + "learning_rate": 1.997648535572287e-05, + "loss": 1.6316, + "step": 302 + }, + { + "epoch": 0.052272923315794015, + "grad_norm": 0.671875, + "learning_rate": 1.9976298435178493e-05, + "loss": 1.6905, + "step": 303 + }, + { + "epoch": 0.05244544121452601, + "grad_norm": 1.125, + "learning_rate": 1.9976110775527592e-05, + "loss": 1.6914, + "step": 304 + }, + { + "epoch": 0.052617959113258, + "grad_norm": 0.80859375, + "learning_rate": 1.997592237678407e-05, + "loss": 1.5772, + "step": 305 + }, + { + "epoch": 0.052790477011989995, + "grad_norm": 0.75, + "learning_rate": 1.9975733238961885e-05, + "loss": 1.6322, + "step": 306 + }, + { + "epoch": 0.05296299491072199, + "grad_norm": 1.3984375, + "learning_rate": 1.997554336207505e-05, + "loss": 1.6677, + "step": 307 + }, + { + "epoch": 0.05313551280945398, + "grad_norm": 0.9609375, + "learning_rate": 1.9975352746137636e-05, + "loss": 1.658, + "step": 308 + }, + { + "epoch": 0.053308030708185974, + "grad_norm": 0.76171875, + "learning_rate": 1.997516139116376e-05, + "loss": 1.6323, + "step": 309 + }, + { + "epoch": 0.05348054860691797, + "grad_norm": 1.1328125, + "learning_rate": 1.99749692971676e-05, + "loss": 1.7478, + "step": 310 + }, + { + "epoch": 0.05365306650564996, + "grad_norm": 0.79296875, + "learning_rate": 1.9974776464163387e-05, + "loss": 1.6917, + "step": 311 + }, + { + "epoch": 0.053825584404381954, + "grad_norm": 0.7890625, + "learning_rate": 1.997458289216541e-05, + "loss": 1.7139, + "step": 312 + }, + { + "epoch": 0.05399810230311395, + "grad_norm": 0.82421875, + "learning_rate": 1.9974388581188008e-05, + "loss": 1.6154, + "step": 313 + }, + { + "epoch": 0.05417062020184594, + "grad_norm": 0.80078125, + "learning_rate": 1.997419353124558e-05, + "loss": 1.5962, + "step": 314 + }, + { + "epoch": 0.054343138100577934, + "grad_norm": 0.84765625, + "learning_rate": 1.997399774235257e-05, + "loss": 1.642, + "step": 315 + }, + { + "epoch": 0.05451565599930993, + "grad_norm": 0.72265625, + "learning_rate": 1.997380121452349e-05, + "loss": 1.6652, + "step": 316 + }, + { + "epoch": 0.054688173898041925, + "grad_norm": 0.76171875, + "learning_rate": 1.9973603947772893e-05, + "loss": 1.6442, + "step": 317 + }, + { + "epoch": 0.054860691796773914, + "grad_norm": 1.671875, + "learning_rate": 1.99734059421154e-05, + "loss": 1.6428, + "step": 318 + }, + { + "epoch": 0.05503320969550591, + "grad_norm": 0.9921875, + "learning_rate": 1.9973207197565678e-05, + "loss": 1.6888, + "step": 319 + }, + { + "epoch": 0.055205727594237905, + "grad_norm": 0.69921875, + "learning_rate": 1.9973007714138447e-05, + "loss": 1.6007, + "step": 320 + }, + { + "epoch": 0.055378245492969894, + "grad_norm": 0.8515625, + "learning_rate": 1.9972807491848494e-05, + "loss": 1.6779, + "step": 321 + }, + { + "epoch": 0.05555076339170189, + "grad_norm": 0.953125, + "learning_rate": 1.997260653071065e-05, + "loss": 1.6187, + "step": 322 + }, + { + "epoch": 0.055723281290433885, + "grad_norm": 0.82421875, + "learning_rate": 1.99724048307398e-05, + "loss": 1.6346, + "step": 323 + }, + { + "epoch": 0.05589579918916587, + "grad_norm": 1.1015625, + "learning_rate": 1.9972202391950893e-05, + "loss": 1.782, + "step": 324 + }, + { + "epoch": 0.05606831708789787, + "grad_norm": 0.6875, + "learning_rate": 1.9971999214358918e-05, + "loss": 1.6016, + "step": 325 + }, + { + "epoch": 0.056240834986629865, + "grad_norm": 0.83984375, + "learning_rate": 1.9971795297978937e-05, + "loss": 1.6269, + "step": 326 + }, + { + "epoch": 0.05641335288536185, + "grad_norm": 0.83984375, + "learning_rate": 1.9971590642826056e-05, + "loss": 1.6898, + "step": 327 + }, + { + "epoch": 0.05658587078409385, + "grad_norm": 0.74609375, + "learning_rate": 1.997138524891543e-05, + "loss": 1.6505, + "step": 328 + }, + { + "epoch": 0.056758388682825844, + "grad_norm": 0.8125, + "learning_rate": 1.9971179116262284e-05, + "loss": 1.6379, + "step": 329 + }, + { + "epoch": 0.05693090658155784, + "grad_norm": 0.87109375, + "learning_rate": 1.9970972244881886e-05, + "loss": 1.6406, + "step": 330 + }, + { + "epoch": 0.05710342448028983, + "grad_norm": 0.6796875, + "learning_rate": 1.997076463478956e-05, + "loss": 1.6551, + "step": 331 + }, + { + "epoch": 0.057275942379021824, + "grad_norm": 0.7265625, + "learning_rate": 1.9970556286000693e-05, + "loss": 1.647, + "step": 332 + }, + { + "epoch": 0.05744846027775382, + "grad_norm": 3.6875, + "learning_rate": 1.997034719853072e-05, + "loss": 1.686, + "step": 333 + }, + { + "epoch": 0.05762097817648581, + "grad_norm": 0.84375, + "learning_rate": 1.9970137372395123e-05, + "loss": 1.6334, + "step": 334 + }, + { + "epoch": 0.057793496075217804, + "grad_norm": 0.96484375, + "learning_rate": 1.9969926807609453e-05, + "loss": 1.5861, + "step": 335 + }, + { + "epoch": 0.0579660139739498, + "grad_norm": 0.9296875, + "learning_rate": 1.9969715504189312e-05, + "loss": 1.7133, + "step": 336 + }, + { + "epoch": 0.05813853187268179, + "grad_norm": 0.67578125, + "learning_rate": 1.9969503462150352e-05, + "loss": 1.6659, + "step": 337 + }, + { + "epoch": 0.058311049771413784, + "grad_norm": 0.96484375, + "learning_rate": 1.9969290681508284e-05, + "loss": 1.7634, + "step": 338 + }, + { + "epoch": 0.05848356767014578, + "grad_norm": 0.8671875, + "learning_rate": 1.996907716227887e-05, + "loss": 1.7041, + "step": 339 + }, + { + "epoch": 0.05865608556887777, + "grad_norm": 0.7421875, + "learning_rate": 1.9968862904477936e-05, + "loss": 1.6879, + "step": 340 + }, + { + "epoch": 0.058828603467609764, + "grad_norm": 0.88671875, + "learning_rate": 1.9968647908121342e-05, + "loss": 1.6305, + "step": 341 + }, + { + "epoch": 0.05900112136634176, + "grad_norm": 1.5078125, + "learning_rate": 1.996843217322503e-05, + "loss": 1.6204, + "step": 342 + }, + { + "epoch": 0.059173639265073755, + "grad_norm": 0.75390625, + "learning_rate": 1.996821569980497e-05, + "loss": 1.6207, + "step": 343 + }, + { + "epoch": 0.05934615716380574, + "grad_norm": 0.83203125, + "learning_rate": 1.9967998487877212e-05, + "loss": 1.7491, + "step": 344 + }, + { + "epoch": 0.05951867506253774, + "grad_norm": 0.84765625, + "learning_rate": 1.9967780537457842e-05, + "loss": 1.6341, + "step": 345 + }, + { + "epoch": 0.059691192961269735, + "grad_norm": 0.9609375, + "learning_rate": 1.996756184856301e-05, + "loss": 1.6297, + "step": 346 + }, + { + "epoch": 0.05986371086000172, + "grad_norm": 0.84765625, + "learning_rate": 1.996734242120891e-05, + "loss": 1.6929, + "step": 347 + }, + { + "epoch": 0.06003622875873372, + "grad_norm": 0.83984375, + "learning_rate": 1.9967122255411812e-05, + "loss": 1.7607, + "step": 348 + }, + { + "epoch": 0.060208746657465714, + "grad_norm": 1.546875, + "learning_rate": 1.9966901351188018e-05, + "loss": 1.7494, + "step": 349 + }, + { + "epoch": 0.0603812645561977, + "grad_norm": 0.87890625, + "learning_rate": 1.996667970855389e-05, + "loss": 1.6567, + "step": 350 + }, + { + "epoch": 0.0605537824549297, + "grad_norm": 0.73046875, + "learning_rate": 1.9966457327525864e-05, + "loss": 1.666, + "step": 351 + }, + { + "epoch": 0.060726300353661694, + "grad_norm": 0.6796875, + "learning_rate": 1.9966234208120398e-05, + "loss": 1.6929, + "step": 352 + }, + { + "epoch": 0.06089881825239368, + "grad_norm": 0.859375, + "learning_rate": 1.9966010350354032e-05, + "loss": 1.7106, + "step": 353 + }, + { + "epoch": 0.06107133615112568, + "grad_norm": 0.7421875, + "learning_rate": 1.996578575424335e-05, + "loss": 1.6795, + "step": 354 + }, + { + "epoch": 0.061243854049857674, + "grad_norm": 0.7578125, + "learning_rate": 1.996556041980499e-05, + "loss": 1.6676, + "step": 355 + }, + { + "epoch": 0.06141637194858967, + "grad_norm": 0.703125, + "learning_rate": 1.9965334347055646e-05, + "loss": 1.6441, + "step": 356 + }, + { + "epoch": 0.06158888984732166, + "grad_norm": 0.80078125, + "learning_rate": 1.9965107536012067e-05, + "loss": 1.7209, + "step": 357 + }, + { + "epoch": 0.061761407746053654, + "grad_norm": 0.8203125, + "learning_rate": 1.996487998669106e-05, + "loss": 1.7478, + "step": 358 + }, + { + "epoch": 0.06193392564478565, + "grad_norm": 0.9375, + "learning_rate": 1.9964651699109476e-05, + "loss": 1.6541, + "step": 359 + }, + { + "epoch": 0.06210644354351764, + "grad_norm": 1.515625, + "learning_rate": 1.996442267328423e-05, + "loss": 1.5692, + "step": 360 + }, + { + "epoch": 0.062278961442249633, + "grad_norm": 0.96484375, + "learning_rate": 1.9964192909232297e-05, + "loss": 1.6048, + "step": 361 + }, + { + "epoch": 0.06245147934098163, + "grad_norm": 0.7109375, + "learning_rate": 1.9963962406970695e-05, + "loss": 1.7004, + "step": 362 + }, + { + "epoch": 0.06262399723971362, + "grad_norm": 0.859375, + "learning_rate": 1.9963731166516494e-05, + "loss": 1.61, + "step": 363 + }, + { + "epoch": 0.06279651513844561, + "grad_norm": 0.73828125, + "learning_rate": 1.996349918788684e-05, + "loss": 1.641, + "step": 364 + }, + { + "epoch": 0.0629690330371776, + "grad_norm": 0.7421875, + "learning_rate": 1.9963266471098905e-05, + "loss": 1.6621, + "step": 365 + }, + { + "epoch": 0.0631415509359096, + "grad_norm": 0.71875, + "learning_rate": 1.996303301616994e-05, + "loss": 1.5699, + "step": 366 + }, + { + "epoch": 0.0633140688346416, + "grad_norm": 0.703125, + "learning_rate": 1.9962798823117232e-05, + "loss": 1.7367, + "step": 367 + }, + { + "epoch": 0.06348658673337358, + "grad_norm": 0.75390625, + "learning_rate": 1.996256389195814e-05, + "loss": 1.6866, + "step": 368 + }, + { + "epoch": 0.06365910463210558, + "grad_norm": 0.81640625, + "learning_rate": 1.996232822271007e-05, + "loss": 1.7105, + "step": 369 + }, + { + "epoch": 0.06383162253083757, + "grad_norm": 0.671875, + "learning_rate": 1.9962091815390475e-05, + "loss": 1.5961, + "step": 370 + }, + { + "epoch": 0.06400414042956956, + "grad_norm": 0.8671875, + "learning_rate": 1.9961854670016868e-05, + "loss": 1.69, + "step": 371 + }, + { + "epoch": 0.06417665832830156, + "grad_norm": 0.73828125, + "learning_rate": 1.996161678660683e-05, + "loss": 1.689, + "step": 372 + }, + { + "epoch": 0.06434917622703355, + "grad_norm": 0.74609375, + "learning_rate": 1.9961378165177972e-05, + "loss": 1.6235, + "step": 373 + }, + { + "epoch": 0.06452169412576554, + "grad_norm": 1.0390625, + "learning_rate": 1.9961138805747977e-05, + "loss": 1.653, + "step": 374 + }, + { + "epoch": 0.06469421202449754, + "grad_norm": 0.7265625, + "learning_rate": 1.996089870833458e-05, + "loss": 1.5524, + "step": 375 + }, + { + "epoch": 0.06486672992322953, + "grad_norm": 0.796875, + "learning_rate": 1.996065787295557e-05, + "loss": 1.6183, + "step": 376 + }, + { + "epoch": 0.06503924782196153, + "grad_norm": 3.734375, + "learning_rate": 1.9960416299628788e-05, + "loss": 1.5917, + "step": 377 + }, + { + "epoch": 0.06521176572069352, + "grad_norm": 0.67578125, + "learning_rate": 1.996017398837213e-05, + "loss": 1.7018, + "step": 378 + }, + { + "epoch": 0.06538428361942551, + "grad_norm": 0.80078125, + "learning_rate": 1.995993093920355e-05, + "loss": 1.7019, + "step": 379 + }, + { + "epoch": 0.06555680151815751, + "grad_norm": 0.75390625, + "learning_rate": 1.9959687152141052e-05, + "loss": 1.7515, + "step": 380 + }, + { + "epoch": 0.0657293194168895, + "grad_norm": 1.3203125, + "learning_rate": 1.99594426272027e-05, + "loss": 1.6587, + "step": 381 + }, + { + "epoch": 0.06590183731562149, + "grad_norm": 0.75390625, + "learning_rate": 1.9959197364406607e-05, + "loss": 1.6349, + "step": 382 + }, + { + "epoch": 0.0660743552143535, + "grad_norm": 0.7578125, + "learning_rate": 1.9958951363770946e-05, + "loss": 1.6287, + "step": 383 + }, + { + "epoch": 0.06624687311308548, + "grad_norm": 0.6796875, + "learning_rate": 1.9958704625313942e-05, + "loss": 1.6268, + "step": 384 + }, + { + "epoch": 0.06641939101181747, + "grad_norm": 0.7265625, + "learning_rate": 1.9958457149053876e-05, + "loss": 1.5422, + "step": 385 + }, + { + "epoch": 0.06659190891054947, + "grad_norm": 0.8203125, + "learning_rate": 1.995820893500908e-05, + "loss": 1.5975, + "step": 386 + }, + { + "epoch": 0.06676442680928146, + "grad_norm": 0.85546875, + "learning_rate": 1.9957959983197944e-05, + "loss": 1.5995, + "step": 387 + }, + { + "epoch": 0.06693694470801345, + "grad_norm": 1.0234375, + "learning_rate": 1.995771029363891e-05, + "loss": 1.6374, + "step": 388 + }, + { + "epoch": 0.06710946260674545, + "grad_norm": 0.67578125, + "learning_rate": 1.995745986635048e-05, + "loss": 1.5406, + "step": 389 + }, + { + "epoch": 0.06728198050547744, + "grad_norm": 0.796875, + "learning_rate": 1.995720870135121e-05, + "loss": 1.6726, + "step": 390 + }, + { + "epoch": 0.06745449840420943, + "grad_norm": 0.70703125, + "learning_rate": 1.99569567986597e-05, + "loss": 1.6981, + "step": 391 + }, + { + "epoch": 0.06762701630294143, + "grad_norm": 0.75390625, + "learning_rate": 1.9956704158294614e-05, + "loss": 1.7608, + "step": 392 + }, + { + "epoch": 0.06779953420167342, + "grad_norm": 0.70703125, + "learning_rate": 1.9956450780274677e-05, + "loss": 1.7732, + "step": 393 + }, + { + "epoch": 0.06797205210040541, + "grad_norm": 0.74609375, + "learning_rate": 1.9956196664618652e-05, + "loss": 1.6464, + "step": 394 + }, + { + "epoch": 0.06814456999913741, + "grad_norm": 0.70703125, + "learning_rate": 1.995594181134537e-05, + "loss": 1.7028, + "step": 395 + }, + { + "epoch": 0.0683170878978694, + "grad_norm": 1.0703125, + "learning_rate": 1.9955686220473712e-05, + "loss": 1.6572, + "step": 396 + }, + { + "epoch": 0.06848960579660139, + "grad_norm": 0.7265625, + "learning_rate": 1.9955429892022612e-05, + "loss": 1.612, + "step": 397 + }, + { + "epoch": 0.0686621236953334, + "grad_norm": 0.76953125, + "learning_rate": 1.995517282601106e-05, + "loss": 1.6071, + "step": 398 + }, + { + "epoch": 0.06883464159406538, + "grad_norm": 0.84765625, + "learning_rate": 1.9954915022458105e-05, + "loss": 1.6205, + "step": 399 + }, + { + "epoch": 0.06900715949279737, + "grad_norm": 0.71484375, + "learning_rate": 1.995465648138284e-05, + "loss": 1.5928, + "step": 400 + }, + { + "epoch": 0.06900715949279737, + "eval_loss": 1.6248373985290527, + "eval_runtime": 10.8965, + "eval_samples_per_second": 93.975, + "eval_steps_per_second": 23.494, + "step": 400 + }, + { + "epoch": 0.06917967739152937, + "grad_norm": 0.91796875, + "learning_rate": 1.9954397202804426e-05, + "loss": 1.6632, + "step": 401 + }, + { + "epoch": 0.06935219529026136, + "grad_norm": 0.71875, + "learning_rate": 1.995413718674207e-05, + "loss": 1.663, + "step": 402 + }, + { + "epoch": 0.06952471318899336, + "grad_norm": 0.77734375, + "learning_rate": 1.9953876433215035e-05, + "loss": 1.6414, + "step": 403 + }, + { + "epoch": 0.06969723108772535, + "grad_norm": 0.75390625, + "learning_rate": 1.9953614942242635e-05, + "loss": 1.6355, + "step": 404 + }, + { + "epoch": 0.06986974898645734, + "grad_norm": 0.68359375, + "learning_rate": 1.9953352713844253e-05, + "loss": 1.517, + "step": 405 + }, + { + "epoch": 0.07004226688518934, + "grad_norm": 0.63671875, + "learning_rate": 1.9953089748039306e-05, + "loss": 1.657, + "step": 406 + }, + { + "epoch": 0.07021478478392133, + "grad_norm": 1.0078125, + "learning_rate": 1.9952826044847282e-05, + "loss": 1.5441, + "step": 407 + }, + { + "epoch": 0.07038730268265332, + "grad_norm": 0.61328125, + "learning_rate": 1.9952561604287717e-05, + "loss": 1.6777, + "step": 408 + }, + { + "epoch": 0.07055982058138532, + "grad_norm": 2.984375, + "learning_rate": 1.9952296426380198e-05, + "loss": 1.6177, + "step": 409 + }, + { + "epoch": 0.07073233848011731, + "grad_norm": 0.71875, + "learning_rate": 1.9952030511144384e-05, + "loss": 1.6362, + "step": 410 + }, + { + "epoch": 0.0709048563788493, + "grad_norm": 0.90625, + "learning_rate": 1.995176385859996e-05, + "loss": 1.5383, + "step": 411 + }, + { + "epoch": 0.0710773742775813, + "grad_norm": 0.65234375, + "learning_rate": 1.9951496468766687e-05, + "loss": 1.7264, + "step": 412 + }, + { + "epoch": 0.07124989217631329, + "grad_norm": 0.75, + "learning_rate": 1.9951228341664376e-05, + "loss": 1.7352, + "step": 413 + }, + { + "epoch": 0.07142241007504528, + "grad_norm": 0.66796875, + "learning_rate": 1.9950959477312895e-05, + "loss": 1.6589, + "step": 414 + }, + { + "epoch": 0.07159492797377728, + "grad_norm": 0.6796875, + "learning_rate": 1.9950689875732157e-05, + "loss": 1.6992, + "step": 415 + }, + { + "epoch": 0.07176744587250927, + "grad_norm": 0.83984375, + "learning_rate": 1.9950419536942137e-05, + "loss": 1.6052, + "step": 416 + }, + { + "epoch": 0.07193996377124126, + "grad_norm": 0.69140625, + "learning_rate": 1.9950148460962867e-05, + "loss": 1.561, + "step": 417 + }, + { + "epoch": 0.07211248166997326, + "grad_norm": 0.9296875, + "learning_rate": 1.9949876647814428e-05, + "loss": 1.6689, + "step": 418 + }, + { + "epoch": 0.07228499956870525, + "grad_norm": 0.7578125, + "learning_rate": 1.9949604097516956e-05, + "loss": 1.5868, + "step": 419 + }, + { + "epoch": 0.07245751746743724, + "grad_norm": 0.734375, + "learning_rate": 1.9949330810090643e-05, + "loss": 1.5644, + "step": 420 + }, + { + "epoch": 0.07263003536616924, + "grad_norm": 0.7890625, + "learning_rate": 1.9949056785555738e-05, + "loss": 1.5376, + "step": 421 + }, + { + "epoch": 0.07280255326490123, + "grad_norm": 1.0234375, + "learning_rate": 1.9948782023932545e-05, + "loss": 1.6105, + "step": 422 + }, + { + "epoch": 0.07297507116363322, + "grad_norm": 0.828125, + "learning_rate": 1.9948506525241414e-05, + "loss": 1.6057, + "step": 423 + }, + { + "epoch": 0.07314758906236522, + "grad_norm": 0.6875, + "learning_rate": 1.9948230289502758e-05, + "loss": 1.7268, + "step": 424 + }, + { + "epoch": 0.07332010696109721, + "grad_norm": 1.046875, + "learning_rate": 1.9947953316737045e-05, + "loss": 1.6582, + "step": 425 + }, + { + "epoch": 0.0734926248598292, + "grad_norm": 0.67578125, + "learning_rate": 1.9947675606964793e-05, + "loss": 1.6357, + "step": 426 + }, + { + "epoch": 0.0736651427585612, + "grad_norm": 0.6953125, + "learning_rate": 1.994739716020657e-05, + "loss": 1.5585, + "step": 427 + }, + { + "epoch": 0.07383766065729319, + "grad_norm": 0.703125, + "learning_rate": 1.9947117976483018e-05, + "loss": 1.6731, + "step": 428 + }, + { + "epoch": 0.0740101785560252, + "grad_norm": 0.765625, + "learning_rate": 1.994683805581481e-05, + "loss": 1.6636, + "step": 429 + }, + { + "epoch": 0.07418269645475718, + "grad_norm": 0.71484375, + "learning_rate": 1.9946557398222686e-05, + "loss": 1.7249, + "step": 430 + }, + { + "epoch": 0.07435521435348917, + "grad_norm": 0.62890625, + "learning_rate": 1.9946276003727447e-05, + "loss": 1.5798, + "step": 431 + }, + { + "epoch": 0.07452773225222117, + "grad_norm": 0.671875, + "learning_rate": 1.994599387234993e-05, + "loss": 1.6511, + "step": 432 + }, + { + "epoch": 0.07470025015095316, + "grad_norm": 0.77734375, + "learning_rate": 1.9945711004111045e-05, + "loss": 1.6225, + "step": 433 + }, + { + "epoch": 0.07487276804968515, + "grad_norm": 0.69140625, + "learning_rate": 1.9945427399031744e-05, + "loss": 1.5686, + "step": 434 + }, + { + "epoch": 0.07504528594841715, + "grad_norm": 0.82421875, + "learning_rate": 1.9945143057133037e-05, + "loss": 1.6841, + "step": 435 + }, + { + "epoch": 0.07521780384714914, + "grad_norm": 0.87109375, + "learning_rate": 1.9944857978435996e-05, + "loss": 1.6935, + "step": 436 + }, + { + "epoch": 0.07539032174588113, + "grad_norm": 0.83984375, + "learning_rate": 1.9944572162961735e-05, + "loss": 1.5725, + "step": 437 + }, + { + "epoch": 0.07556283964461313, + "grad_norm": 0.8203125, + "learning_rate": 1.9944285610731433e-05, + "loss": 1.675, + "step": 438 + }, + { + "epoch": 0.07573535754334512, + "grad_norm": 0.9453125, + "learning_rate": 1.9943998321766318e-05, + "loss": 1.7037, + "step": 439 + }, + { + "epoch": 0.07590787544207711, + "grad_norm": 0.94921875, + "learning_rate": 1.9943710296087672e-05, + "loss": 1.6607, + "step": 440 + }, + { + "epoch": 0.07608039334080911, + "grad_norm": 0.78515625, + "learning_rate": 1.994342153371684e-05, + "loss": 1.6181, + "step": 441 + }, + { + "epoch": 0.0762529112395411, + "grad_norm": 1.046875, + "learning_rate": 1.9943132034675208e-05, + "loss": 1.6794, + "step": 442 + }, + { + "epoch": 0.07642542913827309, + "grad_norm": 0.78515625, + "learning_rate": 1.9942841798984228e-05, + "loss": 1.6142, + "step": 443 + }, + { + "epoch": 0.07659794703700509, + "grad_norm": 0.8203125, + "learning_rate": 1.9942550826665404e-05, + "loss": 1.7234, + "step": 444 + }, + { + "epoch": 0.07677046493573708, + "grad_norm": 1.0390625, + "learning_rate": 1.994225911774029e-05, + "loss": 1.6427, + "step": 445 + }, + { + "epoch": 0.07694298283446907, + "grad_norm": 0.81640625, + "learning_rate": 1.9941966672230494e-05, + "loss": 1.7516, + "step": 446 + }, + { + "epoch": 0.07711550073320107, + "grad_norm": 1.3828125, + "learning_rate": 1.9941673490157694e-05, + "loss": 1.6401, + "step": 447 + }, + { + "epoch": 0.07728801863193306, + "grad_norm": 0.90625, + "learning_rate": 1.9941379571543597e-05, + "loss": 1.6463, + "step": 448 + }, + { + "epoch": 0.07746053653066505, + "grad_norm": 0.6875, + "learning_rate": 1.9941084916409988e-05, + "loss": 1.5744, + "step": 449 + }, + { + "epoch": 0.07763305442939705, + "grad_norm": 1.3203125, + "learning_rate": 1.994078952477869e-05, + "loss": 1.6328, + "step": 450 + }, + { + "epoch": 0.07780557232812904, + "grad_norm": 0.8984375, + "learning_rate": 1.9940493396671598e-05, + "loss": 1.6568, + "step": 451 + }, + { + "epoch": 0.07797809022686104, + "grad_norm": 0.890625, + "learning_rate": 1.994019653211064e-05, + "loss": 1.5954, + "step": 452 + }, + { + "epoch": 0.07815060812559303, + "grad_norm": 1.125, + "learning_rate": 1.9939898931117813e-05, + "loss": 1.6227, + "step": 453 + }, + { + "epoch": 0.07832312602432502, + "grad_norm": 0.96875, + "learning_rate": 1.9939600593715166e-05, + "loss": 1.5917, + "step": 454 + }, + { + "epoch": 0.07849564392305702, + "grad_norm": 0.77734375, + "learning_rate": 1.99393015199248e-05, + "loss": 1.6043, + "step": 455 + }, + { + "epoch": 0.07866816182178901, + "grad_norm": 0.80078125, + "learning_rate": 1.993900170976888e-05, + "loss": 1.66, + "step": 456 + }, + { + "epoch": 0.078840679720521, + "grad_norm": 0.8828125, + "learning_rate": 1.9938701163269607e-05, + "loss": 1.5998, + "step": 457 + }, + { + "epoch": 0.079013197619253, + "grad_norm": 0.875, + "learning_rate": 1.9938399880449254e-05, + "loss": 1.6249, + "step": 458 + }, + { + "epoch": 0.07918571551798499, + "grad_norm": 0.84765625, + "learning_rate": 1.9938097861330138e-05, + "loss": 1.623, + "step": 459 + }, + { + "epoch": 0.07935823341671698, + "grad_norm": 0.79296875, + "learning_rate": 1.9937795105934637e-05, + "loss": 1.5886, + "step": 460 + }, + { + "epoch": 0.07953075131544898, + "grad_norm": 0.68359375, + "learning_rate": 1.9937491614285182e-05, + "loss": 1.6607, + "step": 461 + }, + { + "epoch": 0.07970326921418097, + "grad_norm": 0.79296875, + "learning_rate": 1.993718738640425e-05, + "loss": 1.6488, + "step": 462 + }, + { + "epoch": 0.07987578711291296, + "grad_norm": 0.7109375, + "learning_rate": 1.993688242231439e-05, + "loss": 1.5408, + "step": 463 + }, + { + "epoch": 0.08004830501164496, + "grad_norm": 0.625, + "learning_rate": 1.9936576722038192e-05, + "loss": 1.6059, + "step": 464 + }, + { + "epoch": 0.08022082291037695, + "grad_norm": 0.66796875, + "learning_rate": 1.9936270285598306e-05, + "loss": 1.5628, + "step": 465 + }, + { + "epoch": 0.08039334080910894, + "grad_norm": 0.67578125, + "learning_rate": 1.993596311301743e-05, + "loss": 1.6705, + "step": 466 + }, + { + "epoch": 0.08056585870784094, + "grad_norm": 0.68359375, + "learning_rate": 1.9935655204318323e-05, + "loss": 1.6953, + "step": 467 + }, + { + "epoch": 0.08073837660657293, + "grad_norm": 0.71484375, + "learning_rate": 1.99353465595238e-05, + "loss": 1.6976, + "step": 468 + }, + { + "epoch": 0.08091089450530492, + "grad_norm": 0.6875, + "learning_rate": 1.993503717865672e-05, + "loss": 1.7231, + "step": 469 + }, + { + "epoch": 0.08108341240403692, + "grad_norm": 1.7265625, + "learning_rate": 1.9934727061740013e-05, + "loss": 1.6958, + "step": 470 + }, + { + "epoch": 0.08125593030276891, + "grad_norm": 0.6875, + "learning_rate": 1.993441620879665e-05, + "loss": 1.5575, + "step": 471 + }, + { + "epoch": 0.0814284482015009, + "grad_norm": 0.73046875, + "learning_rate": 1.993410461984966e-05, + "loss": 1.6419, + "step": 472 + }, + { + "epoch": 0.0816009661002329, + "grad_norm": 0.68359375, + "learning_rate": 1.993379229492213e-05, + "loss": 1.5573, + "step": 473 + }, + { + "epoch": 0.08177348399896489, + "grad_norm": 0.7265625, + "learning_rate": 1.99334792340372e-05, + "loss": 1.5639, + "step": 474 + }, + { + "epoch": 0.08194600189769688, + "grad_norm": 0.78125, + "learning_rate": 1.9933165437218057e-05, + "loss": 1.6752, + "step": 475 + }, + { + "epoch": 0.08211851979642888, + "grad_norm": 0.9765625, + "learning_rate": 1.993285090448795e-05, + "loss": 1.6365, + "step": 476 + }, + { + "epoch": 0.08229103769516087, + "grad_norm": 0.67578125, + "learning_rate": 1.993253563587019e-05, + "loss": 1.6038, + "step": 477 + }, + { + "epoch": 0.08246355559389287, + "grad_norm": 0.68359375, + "learning_rate": 1.993221963138813e-05, + "loss": 1.6083, + "step": 478 + }, + { + "epoch": 0.08263607349262486, + "grad_norm": 0.6875, + "learning_rate": 1.993190289106518e-05, + "loss": 1.6, + "step": 479 + }, + { + "epoch": 0.08280859139135685, + "grad_norm": 0.9765625, + "learning_rate": 1.99315854149248e-05, + "loss": 1.5528, + "step": 480 + }, + { + "epoch": 0.08298110929008885, + "grad_norm": 0.68359375, + "learning_rate": 1.9931267202990524e-05, + "loss": 1.6537, + "step": 481 + }, + { + "epoch": 0.08315362718882084, + "grad_norm": 0.7890625, + "learning_rate": 1.9930948255285915e-05, + "loss": 1.5673, + "step": 482 + }, + { + "epoch": 0.08332614508755283, + "grad_norm": 0.7421875, + "learning_rate": 1.9930628571834608e-05, + "loss": 1.6501, + "step": 483 + }, + { + "epoch": 0.08349866298628483, + "grad_norm": 0.7265625, + "learning_rate": 1.993030815266029e-05, + "loss": 1.6432, + "step": 484 + }, + { + "epoch": 0.08367118088501682, + "grad_norm": 0.6484375, + "learning_rate": 1.9929986997786698e-05, + "loss": 1.5725, + "step": 485 + }, + { + "epoch": 0.08384369878374881, + "grad_norm": 0.77734375, + "learning_rate": 1.992966510723762e-05, + "loss": 1.6375, + "step": 486 + }, + { + "epoch": 0.08401621668248081, + "grad_norm": 0.6328125, + "learning_rate": 1.992934248103691e-05, + "loss": 1.6855, + "step": 487 + }, + { + "epoch": 0.0841887345812128, + "grad_norm": 0.75, + "learning_rate": 1.992901911920847e-05, + "loss": 1.6522, + "step": 488 + }, + { + "epoch": 0.08436125247994479, + "grad_norm": 0.62109375, + "learning_rate": 1.992869502177625e-05, + "loss": 1.6277, + "step": 489 + }, + { + "epoch": 0.08453377037867679, + "grad_norm": 0.6328125, + "learning_rate": 1.9928370188764265e-05, + "loss": 1.6811, + "step": 490 + }, + { + "epoch": 0.08470628827740878, + "grad_norm": 0.78125, + "learning_rate": 1.9928044620196582e-05, + "loss": 1.7146, + "step": 491 + }, + { + "epoch": 0.08487880617614077, + "grad_norm": 0.70703125, + "learning_rate": 1.9927718316097322e-05, + "loss": 1.6449, + "step": 492 + }, + { + "epoch": 0.08505132407487277, + "grad_norm": 0.7265625, + "learning_rate": 1.9927391276490657e-05, + "loss": 1.5758, + "step": 493 + }, + { + "epoch": 0.08522384197360476, + "grad_norm": 0.6171875, + "learning_rate": 1.9927063501400817e-05, + "loss": 1.5599, + "step": 494 + }, + { + "epoch": 0.08539635987233675, + "grad_norm": 0.67578125, + "learning_rate": 1.9926734990852084e-05, + "loss": 1.6281, + "step": 495 + }, + { + "epoch": 0.08556887777106875, + "grad_norm": 0.6171875, + "learning_rate": 1.99264057448688e-05, + "loss": 1.7188, + "step": 496 + }, + { + "epoch": 0.08574139566980074, + "grad_norm": 0.6328125, + "learning_rate": 1.9926075763475353e-05, + "loss": 1.606, + "step": 497 + }, + { + "epoch": 0.08591391356853273, + "grad_norm": 0.66015625, + "learning_rate": 1.9925745046696196e-05, + "loss": 1.5543, + "step": 498 + }, + { + "epoch": 0.08608643146726473, + "grad_norm": 0.6953125, + "learning_rate": 1.9925413594555824e-05, + "loss": 1.6025, + "step": 499 + }, + { + "epoch": 0.08625894936599672, + "grad_norm": 1.9296875, + "learning_rate": 1.9925081407078798e-05, + "loss": 1.69, + "step": 500 + }, + { + "epoch": 0.08625894936599672, + "eval_loss": 1.5997848510742188, + "eval_runtime": 10.7856, + "eval_samples_per_second": 94.941, + "eval_steps_per_second": 23.735, + "step": 500 + }, + { + "epoch": 0.08643146726472871, + "grad_norm": 0.6640625, + "learning_rate": 1.9924748484289723e-05, + "loss": 1.6629, + "step": 501 + }, + { + "epoch": 0.08660398516346071, + "grad_norm": 18.75, + "learning_rate": 1.992441482621327e-05, + "loss": 1.6557, + "step": 502 + }, + { + "epoch": 0.0867765030621927, + "grad_norm": 0.671875, + "learning_rate": 1.9924080432874158e-05, + "loss": 1.5788, + "step": 503 + }, + { + "epoch": 0.0869490209609247, + "grad_norm": 1.015625, + "learning_rate": 1.992374530429716e-05, + "loss": 1.5682, + "step": 504 + }, + { + "epoch": 0.08712153885965669, + "grad_norm": 0.8046875, + "learning_rate": 1.99234094405071e-05, + "loss": 1.5251, + "step": 505 + }, + { + "epoch": 0.08729405675838868, + "grad_norm": 0.6796875, + "learning_rate": 1.9923072841528862e-05, + "loss": 1.5827, + "step": 506 + }, + { + "epoch": 0.08746657465712068, + "grad_norm": 0.73828125, + "learning_rate": 1.9922735507387393e-05, + "loss": 1.6767, + "step": 507 + }, + { + "epoch": 0.08763909255585267, + "grad_norm": 0.61328125, + "learning_rate": 1.992239743810767e-05, + "loss": 1.4635, + "step": 508 + }, + { + "epoch": 0.08781161045458466, + "grad_norm": 0.63671875, + "learning_rate": 1.9922058633714752e-05, + "loss": 1.6189, + "step": 509 + }, + { + "epoch": 0.08798412835331666, + "grad_norm": 0.69140625, + "learning_rate": 1.992171909423373e-05, + "loss": 1.5658, + "step": 510 + }, + { + "epoch": 0.08815664625204865, + "grad_norm": 0.62890625, + "learning_rate": 1.9921378819689767e-05, + "loss": 1.6472, + "step": 511 + }, + { + "epoch": 0.08832916415078064, + "grad_norm": 0.7265625, + "learning_rate": 1.992103781010807e-05, + "loss": 1.6193, + "step": 512 + }, + { + "epoch": 0.08850168204951264, + "grad_norm": 0.76171875, + "learning_rate": 1.9920696065513902e-05, + "loss": 1.5667, + "step": 513 + }, + { + "epoch": 0.08867419994824463, + "grad_norm": 0.6796875, + "learning_rate": 1.992035358593258e-05, + "loss": 1.5772, + "step": 514 + }, + { + "epoch": 0.08884671784697662, + "grad_norm": 1.015625, + "learning_rate": 1.992001037138948e-05, + "loss": 1.6495, + "step": 515 + }, + { + "epoch": 0.08901923574570862, + "grad_norm": 0.6953125, + "learning_rate": 1.991966642191003e-05, + "loss": 1.6067, + "step": 516 + }, + { + "epoch": 0.08919175364444061, + "grad_norm": 1.078125, + "learning_rate": 1.991932173751971e-05, + "loss": 1.5596, + "step": 517 + }, + { + "epoch": 0.0893642715431726, + "grad_norm": 0.72265625, + "learning_rate": 1.9918976318244056e-05, + "loss": 1.5892, + "step": 518 + }, + { + "epoch": 0.0895367894419046, + "grad_norm": 0.83984375, + "learning_rate": 1.991863016410866e-05, + "loss": 1.5691, + "step": 519 + }, + { + "epoch": 0.08970930734063659, + "grad_norm": 0.6953125, + "learning_rate": 1.9918283275139167e-05, + "loss": 1.5445, + "step": 520 + }, + { + "epoch": 0.08988182523936858, + "grad_norm": 0.6640625, + "learning_rate": 1.991793565136128e-05, + "loss": 1.7354, + "step": 521 + }, + { + "epoch": 0.09005434313810058, + "grad_norm": 0.69140625, + "learning_rate": 1.991758729280074e-05, + "loss": 1.5286, + "step": 522 + }, + { + "epoch": 0.09022686103683257, + "grad_norm": 0.9296875, + "learning_rate": 1.9917238199483374e-05, + "loss": 1.6222, + "step": 523 + }, + { + "epoch": 0.09039937893556456, + "grad_norm": 0.73828125, + "learning_rate": 1.9916888371435036e-05, + "loss": 1.6243, + "step": 524 + }, + { + "epoch": 0.09057189683429656, + "grad_norm": 0.9140625, + "learning_rate": 1.9916537808681643e-05, + "loss": 1.5857, + "step": 525 + }, + { + "epoch": 0.09074441473302855, + "grad_norm": 0.8515625, + "learning_rate": 1.9916186511249167e-05, + "loss": 1.5818, + "step": 526 + }, + { + "epoch": 0.09091693263176054, + "grad_norm": 0.7421875, + "learning_rate": 1.9915834479163634e-05, + "loss": 1.5379, + "step": 527 + }, + { + "epoch": 0.09108945053049254, + "grad_norm": 0.8984375, + "learning_rate": 1.991548171245113e-05, + "loss": 1.6106, + "step": 528 + }, + { + "epoch": 0.09126196842922453, + "grad_norm": 0.765625, + "learning_rate": 1.991512821113778e-05, + "loss": 1.6018, + "step": 529 + }, + { + "epoch": 0.09143448632795653, + "grad_norm": 0.75390625, + "learning_rate": 1.9914773975249782e-05, + "loss": 1.5822, + "step": 530 + }, + { + "epoch": 0.09160700422668852, + "grad_norm": 0.82421875, + "learning_rate": 1.991441900481338e-05, + "loss": 1.627, + "step": 531 + }, + { + "epoch": 0.09177952212542051, + "grad_norm": 0.75390625, + "learning_rate": 1.9914063299854866e-05, + "loss": 1.6365, + "step": 532 + }, + { + "epoch": 0.09195204002415251, + "grad_norm": 0.859375, + "learning_rate": 1.99137068604006e-05, + "loss": 1.6972, + "step": 533 + }, + { + "epoch": 0.0921245579228845, + "grad_norm": 1.078125, + "learning_rate": 1.9913349686476983e-05, + "loss": 1.6787, + "step": 534 + }, + { + "epoch": 0.09229707582161649, + "grad_norm": 0.73046875, + "learning_rate": 1.9912991778110485e-05, + "loss": 1.583, + "step": 535 + }, + { + "epoch": 0.09246959372034849, + "grad_norm": 1.1640625, + "learning_rate": 1.991263313532761e-05, + "loss": 1.6112, + "step": 536 + }, + { + "epoch": 0.09264211161908048, + "grad_norm": 0.921875, + "learning_rate": 1.9912273758154937e-05, + "loss": 1.5964, + "step": 537 + }, + { + "epoch": 0.09281462951781247, + "grad_norm": 0.90625, + "learning_rate": 1.9911913646619094e-05, + "loss": 1.6758, + "step": 538 + }, + { + "epoch": 0.09298714741654447, + "grad_norm": 0.890625, + "learning_rate": 1.991155280074675e-05, + "loss": 1.6041, + "step": 539 + }, + { + "epoch": 0.09315966531527646, + "grad_norm": 0.80859375, + "learning_rate": 1.991119122056465e-05, + "loss": 1.7256, + "step": 540 + }, + { + "epoch": 0.09333218321400845, + "grad_norm": 0.7890625, + "learning_rate": 1.9910828906099573e-05, + "loss": 1.6746, + "step": 541 + }, + { + "epoch": 0.09350470111274045, + "grad_norm": 1.0859375, + "learning_rate": 1.9910465857378367e-05, + "loss": 1.5857, + "step": 542 + }, + { + "epoch": 0.09367721901147244, + "grad_norm": 0.8671875, + "learning_rate": 1.991010207442792e-05, + "loss": 1.6412, + "step": 543 + }, + { + "epoch": 0.09384973691020443, + "grad_norm": 0.8828125, + "learning_rate": 1.99097375572752e-05, + "loss": 1.5818, + "step": 544 + }, + { + "epoch": 0.09402225480893643, + "grad_norm": 1.171875, + "learning_rate": 1.9909372305947196e-05, + "loss": 1.6015, + "step": 545 + }, + { + "epoch": 0.09419477270766842, + "grad_norm": 0.9375, + "learning_rate": 1.9909006320470977e-05, + "loss": 1.7097, + "step": 546 + }, + { + "epoch": 0.09436729060640041, + "grad_norm": 0.94921875, + "learning_rate": 1.9908639600873654e-05, + "loss": 1.6574, + "step": 547 + }, + { + "epoch": 0.09453980850513241, + "grad_norm": 1.0234375, + "learning_rate": 1.99082721471824e-05, + "loss": 1.6898, + "step": 548 + }, + { + "epoch": 0.0947123264038644, + "grad_norm": 0.97265625, + "learning_rate": 1.9907903959424436e-05, + "loss": 1.566, + "step": 549 + }, + { + "epoch": 0.09488484430259639, + "grad_norm": 0.6796875, + "learning_rate": 1.9907535037627034e-05, + "loss": 1.601, + "step": 550 + }, + { + "epoch": 0.09505736220132839, + "grad_norm": 0.9921875, + "learning_rate": 1.990716538181754e-05, + "loss": 1.6122, + "step": 551 + }, + { + "epoch": 0.09522988010006038, + "grad_norm": 0.8125, + "learning_rate": 1.9906794992023325e-05, + "loss": 1.6707, + "step": 552 + }, + { + "epoch": 0.09540239799879237, + "grad_norm": 0.61328125, + "learning_rate": 1.9906423868271837e-05, + "loss": 1.6544, + "step": 553 + }, + { + "epoch": 0.09557491589752437, + "grad_norm": 0.75390625, + "learning_rate": 1.990605201059057e-05, + "loss": 1.5394, + "step": 554 + }, + { + "epoch": 0.09574743379625636, + "grad_norm": 0.875, + "learning_rate": 1.9905679419007078e-05, + "loss": 1.5378, + "step": 555 + }, + { + "epoch": 0.09591995169498836, + "grad_norm": 0.625, + "learning_rate": 1.990530609354896e-05, + "loss": 1.5238, + "step": 556 + }, + { + "epoch": 0.09609246959372035, + "grad_norm": 0.77734375, + "learning_rate": 1.9904932034243872e-05, + "loss": 1.6568, + "step": 557 + }, + { + "epoch": 0.09626498749245234, + "grad_norm": 0.58203125, + "learning_rate": 1.9904557241119534e-05, + "loss": 1.6512, + "step": 558 + }, + { + "epoch": 0.09643750539118434, + "grad_norm": 0.6484375, + "learning_rate": 1.9904181714203707e-05, + "loss": 1.6527, + "step": 559 + }, + { + "epoch": 0.09661002328991633, + "grad_norm": 0.7578125, + "learning_rate": 1.9903805453524213e-05, + "loss": 1.6576, + "step": 560 + }, + { + "epoch": 0.09678254118864832, + "grad_norm": 0.77734375, + "learning_rate": 1.990342845910893e-05, + "loss": 1.6159, + "step": 561 + }, + { + "epoch": 0.09695505908738032, + "grad_norm": 0.88671875, + "learning_rate": 1.990305073098579e-05, + "loss": 1.5827, + "step": 562 + }, + { + "epoch": 0.09712757698611231, + "grad_norm": 0.80078125, + "learning_rate": 1.9902672269182773e-05, + "loss": 1.6479, + "step": 563 + }, + { + "epoch": 0.0973000948848443, + "grad_norm": 0.6484375, + "learning_rate": 1.990229307372792e-05, + "loss": 1.5307, + "step": 564 + }, + { + "epoch": 0.0974726127835763, + "grad_norm": 0.6953125, + "learning_rate": 1.9901913144649323e-05, + "loss": 1.6036, + "step": 565 + }, + { + "epoch": 0.09764513068230829, + "grad_norm": 0.95703125, + "learning_rate": 1.9901532481975133e-05, + "loss": 1.5556, + "step": 566 + }, + { + "epoch": 0.09781764858104028, + "grad_norm": 0.7265625, + "learning_rate": 1.9901151085733542e-05, + "loss": 1.6605, + "step": 567 + }, + { + "epoch": 0.09799016647977228, + "grad_norm": 0.70703125, + "learning_rate": 1.990076895595282e-05, + "loss": 1.6237, + "step": 568 + }, + { + "epoch": 0.09816268437850427, + "grad_norm": 0.734375, + "learning_rate": 1.990038609266127e-05, + "loss": 1.5957, + "step": 569 + }, + { + "epoch": 0.09833520227723626, + "grad_norm": 0.66796875, + "learning_rate": 1.9900002495887255e-05, + "loss": 1.651, + "step": 570 + }, + { + "epoch": 0.09850772017596826, + "grad_norm": 0.73046875, + "learning_rate": 1.98996181656592e-05, + "loss": 1.6785, + "step": 571 + }, + { + "epoch": 0.09868023807470025, + "grad_norm": 0.74609375, + "learning_rate": 1.9899233102005573e-05, + "loss": 1.5852, + "step": 572 + }, + { + "epoch": 0.09885275597343224, + "grad_norm": 0.6875, + "learning_rate": 1.9898847304954906e-05, + "loss": 1.5645, + "step": 573 + }, + { + "epoch": 0.09902527387216424, + "grad_norm": 0.7421875, + "learning_rate": 1.989846077453578e-05, + "loss": 1.711, + "step": 574 + }, + { + "epoch": 0.09919779177089623, + "grad_norm": 0.63671875, + "learning_rate": 1.989807351077683e-05, + "loss": 1.5826, + "step": 575 + }, + { + "epoch": 0.09937030966962822, + "grad_norm": 0.71875, + "learning_rate": 1.989768551370675e-05, + "loss": 1.5717, + "step": 576 + }, + { + "epoch": 0.09954282756836022, + "grad_norm": 0.6875, + "learning_rate": 1.9897296783354283e-05, + "loss": 1.6394, + "step": 577 + }, + { + "epoch": 0.09971534546709221, + "grad_norm": 0.66796875, + "learning_rate": 1.9896907319748227e-05, + "loss": 1.7012, + "step": 578 + }, + { + "epoch": 0.0998878633658242, + "grad_norm": 0.73046875, + "learning_rate": 1.9896517122917442e-05, + "loss": 1.6539, + "step": 579 + }, + { + "epoch": 0.1000603812645562, + "grad_norm": 0.640625, + "learning_rate": 1.989612619289083e-05, + "loss": 1.582, + "step": 580 + }, + { + "epoch": 0.10023289916328819, + "grad_norm": 0.984375, + "learning_rate": 1.9895734529697356e-05, + "loss": 1.5482, + "step": 581 + }, + { + "epoch": 0.10040541706202019, + "grad_norm": 0.6640625, + "learning_rate": 1.9895342133366036e-05, + "loss": 1.7008, + "step": 582 + }, + { + "epoch": 0.10057793496075218, + "grad_norm": 0.64453125, + "learning_rate": 1.9894949003925942e-05, + "loss": 1.5771, + "step": 583 + }, + { + "epoch": 0.10075045285948417, + "grad_norm": 0.81640625, + "learning_rate": 1.98945551414062e-05, + "loss": 1.5932, + "step": 584 + }, + { + "epoch": 0.10092297075821617, + "grad_norm": 0.8046875, + "learning_rate": 1.989416054583599e-05, + "loss": 1.6266, + "step": 585 + }, + { + "epoch": 0.10109548865694816, + "grad_norm": 0.640625, + "learning_rate": 1.9893765217244544e-05, + "loss": 1.588, + "step": 586 + }, + { + "epoch": 0.10126800655568015, + "grad_norm": 0.78515625, + "learning_rate": 1.9893369155661152e-05, + "loss": 1.6614, + "step": 587 + }, + { + "epoch": 0.10144052445441215, + "grad_norm": 0.70703125, + "learning_rate": 1.9892972361115154e-05, + "loss": 1.5868, + "step": 588 + }, + { + "epoch": 0.10161304235314414, + "grad_norm": 1.53125, + "learning_rate": 1.989257483363595e-05, + "loss": 1.5864, + "step": 589 + }, + { + "epoch": 0.10178556025187613, + "grad_norm": 0.7421875, + "learning_rate": 1.9892176573252993e-05, + "loss": 1.5276, + "step": 590 + }, + { + "epoch": 0.10195807815060813, + "grad_norm": 0.71484375, + "learning_rate": 1.9891777579995787e-05, + "loss": 1.616, + "step": 591 + }, + { + "epoch": 0.10213059604934012, + "grad_norm": 0.64453125, + "learning_rate": 1.9891377853893888e-05, + "loss": 1.621, + "step": 592 + }, + { + "epoch": 0.10230311394807211, + "grad_norm": 0.61328125, + "learning_rate": 1.9890977394976915e-05, + "loss": 1.5442, + "step": 593 + }, + { + "epoch": 0.10247563184680411, + "grad_norm": 0.58203125, + "learning_rate": 1.9890576203274534e-05, + "loss": 1.5709, + "step": 594 + }, + { + "epoch": 0.1026481497455361, + "grad_norm": 0.875, + "learning_rate": 1.989017427881647e-05, + "loss": 1.5972, + "step": 595 + }, + { + "epoch": 0.10282066764426809, + "grad_norm": 0.76953125, + "learning_rate": 1.9889771621632497e-05, + "loss": 1.636, + "step": 596 + }, + { + "epoch": 0.10299318554300009, + "grad_norm": 0.640625, + "learning_rate": 1.9889368231752452e-05, + "loss": 1.6101, + "step": 597 + }, + { + "epoch": 0.10316570344173208, + "grad_norm": 0.640625, + "learning_rate": 1.9888964109206213e-05, + "loss": 1.6646, + "step": 598 + }, + { + "epoch": 0.10333822134046407, + "grad_norm": 0.61328125, + "learning_rate": 1.9888559254023723e-05, + "loss": 1.5002, + "step": 599 + }, + { + "epoch": 0.10351073923919607, + "grad_norm": 0.74609375, + "learning_rate": 1.988815366623498e-05, + "loss": 1.5775, + "step": 600 + }, + { + "epoch": 0.10351073923919607, + "eval_loss": 1.5823333263397217, + "eval_runtime": 10.7916, + "eval_samples_per_second": 94.889, + "eval_steps_per_second": 23.722, + "step": 600 + }, + { + "epoch": 0.10368325713792806, + "grad_norm": 0.78515625, + "learning_rate": 1.988774734587003e-05, + "loss": 1.6637, + "step": 601 + }, + { + "epoch": 0.10385577503666005, + "grad_norm": 0.875, + "learning_rate": 1.988734029295897e-05, + "loss": 1.556, + "step": 602 + }, + { + "epoch": 0.10402829293539205, + "grad_norm": 0.73046875, + "learning_rate": 1.9886932507531966e-05, + "loss": 1.5965, + "step": 603 + }, + { + "epoch": 0.10420081083412404, + "grad_norm": 0.640625, + "learning_rate": 1.9886523989619224e-05, + "loss": 1.6596, + "step": 604 + }, + { + "epoch": 0.10437332873285603, + "grad_norm": 0.83203125, + "learning_rate": 1.988611473925101e-05, + "loss": 1.6136, + "step": 605 + }, + { + "epoch": 0.10454584663158803, + "grad_norm": 0.68359375, + "learning_rate": 1.988570475645765e-05, + "loss": 1.5358, + "step": 606 + }, + { + "epoch": 0.10471836453032002, + "grad_norm": 0.85546875, + "learning_rate": 1.988529404126951e-05, + "loss": 1.609, + "step": 607 + }, + { + "epoch": 0.10489088242905202, + "grad_norm": 0.8203125, + "learning_rate": 1.9884882593717018e-05, + "loss": 1.6632, + "step": 608 + }, + { + "epoch": 0.10506340032778401, + "grad_norm": 0.8125, + "learning_rate": 1.9884470413830662e-05, + "loss": 1.6104, + "step": 609 + }, + { + "epoch": 0.105235918226516, + "grad_norm": 0.68359375, + "learning_rate": 1.988405750164098e-05, + "loss": 1.5754, + "step": 610 + }, + { + "epoch": 0.105408436125248, + "grad_norm": 0.61328125, + "learning_rate": 1.9883643857178554e-05, + "loss": 1.614, + "step": 611 + }, + { + "epoch": 0.10558095402397999, + "grad_norm": 0.8125, + "learning_rate": 1.988322948047404e-05, + "loss": 1.6469, + "step": 612 + }, + { + "epoch": 0.10575347192271198, + "grad_norm": 0.6484375, + "learning_rate": 1.988281437155813e-05, + "loss": 1.6428, + "step": 613 + }, + { + "epoch": 0.10592598982144398, + "grad_norm": 0.6484375, + "learning_rate": 1.9882398530461582e-05, + "loss": 1.6195, + "step": 614 + }, + { + "epoch": 0.10609850772017597, + "grad_norm": 0.6953125, + "learning_rate": 1.98819819572152e-05, + "loss": 1.6814, + "step": 615 + }, + { + "epoch": 0.10627102561890796, + "grad_norm": 0.6484375, + "learning_rate": 1.9881564651849855e-05, + "loss": 1.5624, + "step": 616 + }, + { + "epoch": 0.10644354351763996, + "grad_norm": 0.875, + "learning_rate": 1.9881146614396454e-05, + "loss": 1.6194, + "step": 617 + }, + { + "epoch": 0.10661606141637195, + "grad_norm": 0.80859375, + "learning_rate": 1.988072784488597e-05, + "loss": 1.5799, + "step": 618 + }, + { + "epoch": 0.10678857931510394, + "grad_norm": 0.5546875, + "learning_rate": 1.9880308343349432e-05, + "loss": 1.5319, + "step": 619 + }, + { + "epoch": 0.10696109721383594, + "grad_norm": 0.953125, + "learning_rate": 1.9879888109817916e-05, + "loss": 1.5623, + "step": 620 + }, + { + "epoch": 0.10713361511256793, + "grad_norm": 0.79296875, + "learning_rate": 1.9879467144322557e-05, + "loss": 1.6315, + "step": 621 + }, + { + "epoch": 0.10730613301129992, + "grad_norm": 0.63671875, + "learning_rate": 1.9879045446894546e-05, + "loss": 1.5334, + "step": 622 + }, + { + "epoch": 0.10747865091003192, + "grad_norm": 0.69921875, + "learning_rate": 1.9878623017565115e-05, + "loss": 1.5988, + "step": 623 + }, + { + "epoch": 0.10765116880876391, + "grad_norm": 0.625, + "learning_rate": 1.987819985636557e-05, + "loss": 1.537, + "step": 624 + }, + { + "epoch": 0.1078236867074959, + "grad_norm": 0.7578125, + "learning_rate": 1.9877775963327254e-05, + "loss": 1.5755, + "step": 625 + }, + { + "epoch": 0.1079962046062279, + "grad_norm": 0.62890625, + "learning_rate": 1.987735133848158e-05, + "loss": 1.5418, + "step": 626 + }, + { + "epoch": 0.10816872250495989, + "grad_norm": 0.6328125, + "learning_rate": 1.987692598186e-05, + "loss": 1.595, + "step": 627 + }, + { + "epoch": 0.10834124040369188, + "grad_norm": 0.69921875, + "learning_rate": 1.9876499893494028e-05, + "loss": 1.4856, + "step": 628 + }, + { + "epoch": 0.10851375830242388, + "grad_norm": 0.6796875, + "learning_rate": 1.9876073073415233e-05, + "loss": 1.5476, + "step": 629 + }, + { + "epoch": 0.10868627620115587, + "grad_norm": 0.7734375, + "learning_rate": 1.987564552165524e-05, + "loss": 1.5228, + "step": 630 + }, + { + "epoch": 0.10885879409988786, + "grad_norm": 0.71484375, + "learning_rate": 1.9875217238245715e-05, + "loss": 1.5727, + "step": 631 + }, + { + "epoch": 0.10903131199861986, + "grad_norm": 0.6875, + "learning_rate": 1.9874788223218398e-05, + "loss": 1.6439, + "step": 632 + }, + { + "epoch": 0.10920382989735185, + "grad_norm": 0.6953125, + "learning_rate": 1.987435847660507e-05, + "loss": 1.4864, + "step": 633 + }, + { + "epoch": 0.10937634779608385, + "grad_norm": 0.7265625, + "learning_rate": 1.9873927998437566e-05, + "loss": 1.5274, + "step": 634 + }, + { + "epoch": 0.10954886569481584, + "grad_norm": 0.7578125, + "learning_rate": 1.9873496788747778e-05, + "loss": 1.6405, + "step": 635 + }, + { + "epoch": 0.10972138359354783, + "grad_norm": 0.64453125, + "learning_rate": 1.987306484756766e-05, + "loss": 1.6233, + "step": 636 + }, + { + "epoch": 0.10989390149227983, + "grad_norm": 0.6328125, + "learning_rate": 1.9872632174929208e-05, + "loss": 1.585, + "step": 637 + }, + { + "epoch": 0.11006641939101182, + "grad_norm": 0.75, + "learning_rate": 1.9872198770864476e-05, + "loss": 1.6029, + "step": 638 + }, + { + "epoch": 0.11023893728974381, + "grad_norm": 0.66796875, + "learning_rate": 1.9871764635405574e-05, + "loss": 1.6057, + "step": 639 + }, + { + "epoch": 0.11041145518847581, + "grad_norm": 0.765625, + "learning_rate": 1.9871329768584666e-05, + "loss": 1.5482, + "step": 640 + }, + { + "epoch": 0.1105839730872078, + "grad_norm": 0.75390625, + "learning_rate": 1.987089417043397e-05, + "loss": 1.6146, + "step": 641 + }, + { + "epoch": 0.11075649098593979, + "grad_norm": 0.6875, + "learning_rate": 1.987045784098576e-05, + "loss": 1.6118, + "step": 642 + }, + { + "epoch": 0.11092900888467179, + "grad_norm": 0.90625, + "learning_rate": 1.9870020780272357e-05, + "loss": 1.5789, + "step": 643 + }, + { + "epoch": 0.11110152678340378, + "grad_norm": 0.86328125, + "learning_rate": 1.9869582988326145e-05, + "loss": 1.5449, + "step": 644 + }, + { + "epoch": 0.11127404468213577, + "grad_norm": 0.703125, + "learning_rate": 1.9869144465179557e-05, + "loss": 1.5871, + "step": 645 + }, + { + "epoch": 0.11144656258086777, + "grad_norm": 0.59375, + "learning_rate": 1.9868705210865083e-05, + "loss": 1.5773, + "step": 646 + }, + { + "epoch": 0.11161908047959976, + "grad_norm": 0.796875, + "learning_rate": 1.9868265225415263e-05, + "loss": 1.6108, + "step": 647 + }, + { + "epoch": 0.11179159837833175, + "grad_norm": 0.8671875, + "learning_rate": 1.9867824508862696e-05, + "loss": 1.6162, + "step": 648 + }, + { + "epoch": 0.11196411627706375, + "grad_norm": 0.8125, + "learning_rate": 1.986738306124003e-05, + "loss": 1.5618, + "step": 649 + }, + { + "epoch": 0.11213663417579574, + "grad_norm": 0.65234375, + "learning_rate": 1.9866940882579976e-05, + "loss": 1.6397, + "step": 650 + }, + { + "epoch": 0.11230915207452773, + "grad_norm": 0.6640625, + "learning_rate": 1.986649797291529e-05, + "loss": 1.632, + "step": 651 + }, + { + "epoch": 0.11248166997325973, + "grad_norm": 1.1640625, + "learning_rate": 1.9866054332278784e-05, + "loss": 1.6181, + "step": 652 + }, + { + "epoch": 0.11265418787199172, + "grad_norm": 0.625, + "learning_rate": 1.9865609960703325e-05, + "loss": 1.5809, + "step": 653 + }, + { + "epoch": 0.1128267057707237, + "grad_norm": 0.71484375, + "learning_rate": 1.9865164858221838e-05, + "loss": 1.5046, + "step": 654 + }, + { + "epoch": 0.11299922366945571, + "grad_norm": 0.66796875, + "learning_rate": 1.9864719024867303e-05, + "loss": 1.6255, + "step": 655 + }, + { + "epoch": 0.1131717415681877, + "grad_norm": 0.609375, + "learning_rate": 1.9864272460672738e-05, + "loss": 1.613, + "step": 656 + }, + { + "epoch": 0.11334425946691969, + "grad_norm": 0.7734375, + "learning_rate": 1.9863825165671238e-05, + "loss": 1.6889, + "step": 657 + }, + { + "epoch": 0.11351677736565169, + "grad_norm": 1.2734375, + "learning_rate": 1.9863377139895935e-05, + "loss": 1.6039, + "step": 658 + }, + { + "epoch": 0.11368929526438368, + "grad_norm": 0.640625, + "learning_rate": 1.986292838338003e-05, + "loss": 1.6027, + "step": 659 + }, + { + "epoch": 0.11386181316311568, + "grad_norm": 0.64453125, + "learning_rate": 1.986247889615676e-05, + "loss": 1.5537, + "step": 660 + }, + { + "epoch": 0.11403433106184767, + "grad_norm": 0.65234375, + "learning_rate": 1.9862028678259427e-05, + "loss": 1.5859, + "step": 661 + }, + { + "epoch": 0.11420684896057966, + "grad_norm": 0.75390625, + "learning_rate": 1.986157772972139e-05, + "loss": 1.5783, + "step": 662 + }, + { + "epoch": 0.11437936685931166, + "grad_norm": 0.82421875, + "learning_rate": 1.9861126050576063e-05, + "loss": 1.5792, + "step": 663 + }, + { + "epoch": 0.11455188475804365, + "grad_norm": 0.65625, + "learning_rate": 1.9860673640856895e-05, + "loss": 1.5931, + "step": 664 + }, + { + "epoch": 0.11472440265677564, + "grad_norm": 0.6484375, + "learning_rate": 1.9860220500597415e-05, + "loss": 1.6084, + "step": 665 + }, + { + "epoch": 0.11489692055550764, + "grad_norm": 0.72265625, + "learning_rate": 1.9859766629831192e-05, + "loss": 1.6543, + "step": 666 + }, + { + "epoch": 0.11506943845423963, + "grad_norm": 0.61328125, + "learning_rate": 1.9859312028591852e-05, + "loss": 1.61, + "step": 667 + }, + { + "epoch": 0.11524195635297162, + "grad_norm": 0.640625, + "learning_rate": 1.9858856696913068e-05, + "loss": 1.5262, + "step": 668 + }, + { + "epoch": 0.11541447425170362, + "grad_norm": 0.67578125, + "learning_rate": 1.985840063482858e-05, + "loss": 1.5316, + "step": 669 + }, + { + "epoch": 0.11558699215043561, + "grad_norm": 0.62890625, + "learning_rate": 1.9857943842372175e-05, + "loss": 1.5745, + "step": 670 + }, + { + "epoch": 0.1157595100491676, + "grad_norm": 0.69921875, + "learning_rate": 1.9857486319577697e-05, + "loss": 1.5823, + "step": 671 + }, + { + "epoch": 0.1159320279478996, + "grad_norm": 0.6875, + "learning_rate": 1.9857028066479042e-05, + "loss": 1.5883, + "step": 672 + }, + { + "epoch": 0.11610454584663159, + "grad_norm": 0.74609375, + "learning_rate": 1.9856569083110152e-05, + "loss": 1.6125, + "step": 673 + }, + { + "epoch": 0.11627706374536358, + "grad_norm": 0.671875, + "learning_rate": 1.985610936950504e-05, + "loss": 1.5899, + "step": 674 + }, + { + "epoch": 0.11644958164409558, + "grad_norm": 0.8828125, + "learning_rate": 1.9855648925697762e-05, + "loss": 1.6004, + "step": 675 + }, + { + "epoch": 0.11662209954282757, + "grad_norm": 0.8828125, + "learning_rate": 1.9855187751722432e-05, + "loss": 1.5435, + "step": 676 + }, + { + "epoch": 0.11679461744155956, + "grad_norm": 0.625, + "learning_rate": 1.9854725847613216e-05, + "loss": 1.4848, + "step": 677 + }, + { + "epoch": 0.11696713534029156, + "grad_norm": 0.9921875, + "learning_rate": 1.9854263213404333e-05, + "loss": 1.5351, + "step": 678 + }, + { + "epoch": 0.11713965323902355, + "grad_norm": 0.640625, + "learning_rate": 1.985379984913006e-05, + "loss": 1.6054, + "step": 679 + }, + { + "epoch": 0.11731217113775554, + "grad_norm": 1.0234375, + "learning_rate": 1.985333575482472e-05, + "loss": 1.5781, + "step": 680 + }, + { + "epoch": 0.11748468903648754, + "grad_norm": 0.96484375, + "learning_rate": 1.9852870930522704e-05, + "loss": 1.5546, + "step": 681 + }, + { + "epoch": 0.11765720693521953, + "grad_norm": 0.65625, + "learning_rate": 1.985240537625845e-05, + "loss": 1.7013, + "step": 682 + }, + { + "epoch": 0.11782972483395152, + "grad_norm": 0.90625, + "learning_rate": 1.985193909206644e-05, + "loss": 1.6284, + "step": 683 + }, + { + "epoch": 0.11800224273268352, + "grad_norm": 0.9609375, + "learning_rate": 1.9851472077981226e-05, + "loss": 1.6304, + "step": 684 + }, + { + "epoch": 0.1181747606314155, + "grad_norm": 0.69921875, + "learning_rate": 1.98510043340374e-05, + "loss": 1.6323, + "step": 685 + }, + { + "epoch": 0.11834727853014751, + "grad_norm": 0.85546875, + "learning_rate": 1.9850535860269628e-05, + "loss": 1.6954, + "step": 686 + }, + { + "epoch": 0.1185197964288795, + "grad_norm": 0.625, + "learning_rate": 1.985006665671261e-05, + "loss": 1.5755, + "step": 687 + }, + { + "epoch": 0.11869231432761149, + "grad_norm": 0.66015625, + "learning_rate": 1.984959672340111e-05, + "loss": 1.6307, + "step": 688 + }, + { + "epoch": 0.11886483222634349, + "grad_norm": 0.70703125, + "learning_rate": 1.9849126060369933e-05, + "loss": 1.6133, + "step": 689 + }, + { + "epoch": 0.11903735012507548, + "grad_norm": 0.7109375, + "learning_rate": 1.9848654667653964e-05, + "loss": 1.649, + "step": 690 + }, + { + "epoch": 0.11920986802380747, + "grad_norm": 0.64453125, + "learning_rate": 1.9848182545288117e-05, + "loss": 1.5091, + "step": 691 + }, + { + "epoch": 0.11938238592253947, + "grad_norm": 0.78125, + "learning_rate": 1.9847709693307375e-05, + "loss": 1.6429, + "step": 692 + }, + { + "epoch": 0.11955490382127146, + "grad_norm": 0.84765625, + "learning_rate": 1.9847236111746767e-05, + "loss": 1.5849, + "step": 693 + }, + { + "epoch": 0.11972742172000345, + "grad_norm": 0.7734375, + "learning_rate": 1.9846761800641377e-05, + "loss": 1.5365, + "step": 694 + }, + { + "epoch": 0.11989993961873545, + "grad_norm": 0.91796875, + "learning_rate": 1.984628676002635e-05, + "loss": 1.5412, + "step": 695 + }, + { + "epoch": 0.12007245751746744, + "grad_norm": 0.6328125, + "learning_rate": 1.9845810989936882e-05, + "loss": 1.4677, + "step": 696 + }, + { + "epoch": 0.12024497541619943, + "grad_norm": 0.94140625, + "learning_rate": 1.984533449040821e-05, + "loss": 1.5982, + "step": 697 + }, + { + "epoch": 0.12041749331493143, + "grad_norm": 0.63671875, + "learning_rate": 1.984485726147564e-05, + "loss": 1.7411, + "step": 698 + }, + { + "epoch": 0.12059001121366342, + "grad_norm": 0.75, + "learning_rate": 1.9844379303174537e-05, + "loss": 1.5752, + "step": 699 + }, + { + "epoch": 0.1207625291123954, + "grad_norm": 0.66015625, + "learning_rate": 1.9843900615540305e-05, + "loss": 1.5738, + "step": 700 + }, + { + "epoch": 0.1207625291123954, + "eval_loss": 1.5691492557525635, + "eval_runtime": 11.0575, + "eval_samples_per_second": 92.607, + "eval_steps_per_second": 23.152, + "step": 700 + }, + { + "epoch": 0.12093504701112741, + "grad_norm": 0.6171875, + "learning_rate": 1.9843421198608404e-05, + "loss": 1.5288, + "step": 701 + }, + { + "epoch": 0.1211075649098594, + "grad_norm": 0.69921875, + "learning_rate": 1.9842941052414356e-05, + "loss": 1.5642, + "step": 702 + }, + { + "epoch": 0.12128008280859139, + "grad_norm": 0.6640625, + "learning_rate": 1.9842460176993734e-05, + "loss": 1.5566, + "step": 703 + }, + { + "epoch": 0.12145260070732339, + "grad_norm": 0.82421875, + "learning_rate": 1.9841978572382162e-05, + "loss": 1.5657, + "step": 704 + }, + { + "epoch": 0.12162511860605538, + "grad_norm": 0.7109375, + "learning_rate": 1.9841496238615326e-05, + "loss": 1.5543, + "step": 705 + }, + { + "epoch": 0.12179763650478737, + "grad_norm": 0.94140625, + "learning_rate": 1.9841013175728953e-05, + "loss": 1.5774, + "step": 706 + }, + { + "epoch": 0.12197015440351937, + "grad_norm": 0.7109375, + "learning_rate": 1.9840529383758834e-05, + "loss": 1.6671, + "step": 707 + }, + { + "epoch": 0.12214267230225136, + "grad_norm": 0.64453125, + "learning_rate": 1.9840044862740814e-05, + "loss": 1.6022, + "step": 708 + }, + { + "epoch": 0.12231519020098335, + "grad_norm": 0.7578125, + "learning_rate": 1.983955961271079e-05, + "loss": 1.5639, + "step": 709 + }, + { + "epoch": 0.12248770809971535, + "grad_norm": 0.59765625, + "learning_rate": 1.98390736337047e-05, + "loss": 1.6281, + "step": 710 + }, + { + "epoch": 0.12266022599844734, + "grad_norm": 0.77734375, + "learning_rate": 1.9838586925758566e-05, + "loss": 1.6136, + "step": 711 + }, + { + "epoch": 0.12283274389717934, + "grad_norm": 0.625, + "learning_rate": 1.9838099488908437e-05, + "loss": 1.581, + "step": 712 + }, + { + "epoch": 0.12300526179591133, + "grad_norm": 0.69140625, + "learning_rate": 1.9837611323190423e-05, + "loss": 1.6113, + "step": 713 + }, + { + "epoch": 0.12317777969464332, + "grad_norm": 0.84765625, + "learning_rate": 1.9837122428640695e-05, + "loss": 1.5996, + "step": 714 + }, + { + "epoch": 0.12335029759337532, + "grad_norm": 0.70703125, + "learning_rate": 1.9836632805295477e-05, + "loss": 1.7063, + "step": 715 + }, + { + "epoch": 0.12352281549210731, + "grad_norm": 0.91015625, + "learning_rate": 1.9836142453191032e-05, + "loss": 1.5825, + "step": 716 + }, + { + "epoch": 0.1236953333908393, + "grad_norm": 0.63671875, + "learning_rate": 1.98356513723637e-05, + "loss": 1.5981, + "step": 717 + }, + { + "epoch": 0.1238678512895713, + "grad_norm": 0.796875, + "learning_rate": 1.9835159562849857e-05, + "loss": 1.5686, + "step": 718 + }, + { + "epoch": 0.12404036918830329, + "grad_norm": 0.71484375, + "learning_rate": 1.983466702468594e-05, + "loss": 1.5379, + "step": 719 + }, + { + "epoch": 0.12421288708703528, + "grad_norm": 0.6875, + "learning_rate": 1.9834173757908442e-05, + "loss": 1.5893, + "step": 720 + }, + { + "epoch": 0.12438540498576728, + "grad_norm": 0.734375, + "learning_rate": 1.9833679762553905e-05, + "loss": 1.6384, + "step": 721 + }, + { + "epoch": 0.12455792288449927, + "grad_norm": 0.70703125, + "learning_rate": 1.9833185038658923e-05, + "loss": 1.6167, + "step": 722 + }, + { + "epoch": 0.12473044078323126, + "grad_norm": 0.89453125, + "learning_rate": 1.9832689586260157e-05, + "loss": 1.5078, + "step": 723 + }, + { + "epoch": 0.12490295868196326, + "grad_norm": 0.83984375, + "learning_rate": 1.9832193405394306e-05, + "loss": 1.5687, + "step": 724 + }, + { + "epoch": 0.12507547658069526, + "grad_norm": 0.7421875, + "learning_rate": 1.9831696496098135e-05, + "loss": 1.561, + "step": 725 + }, + { + "epoch": 0.12524799447942725, + "grad_norm": 0.6796875, + "learning_rate": 1.9831198858408456e-05, + "loss": 1.5419, + "step": 726 + }, + { + "epoch": 0.12542051237815924, + "grad_norm": 0.62890625, + "learning_rate": 1.983070049236214e-05, + "loss": 1.5969, + "step": 727 + }, + { + "epoch": 0.12559303027689123, + "grad_norm": 0.69140625, + "learning_rate": 1.9830201397996104e-05, + "loss": 1.4889, + "step": 728 + }, + { + "epoch": 0.12576554817562322, + "grad_norm": 0.8125, + "learning_rate": 1.9829701575347323e-05, + "loss": 1.5763, + "step": 729 + }, + { + "epoch": 0.1259380660743552, + "grad_norm": 0.71484375, + "learning_rate": 1.9829201024452834e-05, + "loss": 1.5959, + "step": 730 + }, + { + "epoch": 0.12611058397308722, + "grad_norm": 0.73046875, + "learning_rate": 1.982869974534972e-05, + "loss": 1.6139, + "step": 731 + }, + { + "epoch": 0.1262831018718192, + "grad_norm": 0.71875, + "learning_rate": 1.9828197738075114e-05, + "loss": 1.5739, + "step": 732 + }, + { + "epoch": 0.1264556197705512, + "grad_norm": 1.0078125, + "learning_rate": 1.982769500266621e-05, + "loss": 1.6035, + "step": 733 + }, + { + "epoch": 0.1266281376692832, + "grad_norm": 0.8515625, + "learning_rate": 1.9827191539160253e-05, + "loss": 1.5015, + "step": 734 + }, + { + "epoch": 0.12680065556801517, + "grad_norm": 0.76953125, + "learning_rate": 1.9826687347594548e-05, + "loss": 1.6663, + "step": 735 + }, + { + "epoch": 0.12697317346674716, + "grad_norm": 0.8046875, + "learning_rate": 1.9826182428006437e-05, + "loss": 1.4977, + "step": 736 + }, + { + "epoch": 0.12714569136547918, + "grad_norm": 0.93359375, + "learning_rate": 1.9825676780433342e-05, + "loss": 1.5396, + "step": 737 + }, + { + "epoch": 0.12731820926421117, + "grad_norm": 0.6484375, + "learning_rate": 1.9825170404912712e-05, + "loss": 1.5833, + "step": 738 + }, + { + "epoch": 0.12749072716294316, + "grad_norm": 0.66796875, + "learning_rate": 1.982466330148207e-05, + "loss": 1.6297, + "step": 739 + }, + { + "epoch": 0.12766324506167515, + "grad_norm": 0.6328125, + "learning_rate": 1.9824155470178983e-05, + "loss": 1.5306, + "step": 740 + }, + { + "epoch": 0.12783576296040713, + "grad_norm": 0.703125, + "learning_rate": 1.9823646911041076e-05, + "loss": 1.529, + "step": 741 + }, + { + "epoch": 0.12800828085913912, + "grad_norm": 0.65625, + "learning_rate": 1.9823137624106023e-05, + "loss": 1.5001, + "step": 742 + }, + { + "epoch": 0.12818079875787114, + "grad_norm": 0.625, + "learning_rate": 1.982262760941156e-05, + "loss": 1.573, + "step": 743 + }, + { + "epoch": 0.12835331665660313, + "grad_norm": 0.65234375, + "learning_rate": 1.9822116866995466e-05, + "loss": 1.6003, + "step": 744 + }, + { + "epoch": 0.12852583455533512, + "grad_norm": 0.85546875, + "learning_rate": 1.9821605396895584e-05, + "loss": 1.6081, + "step": 745 + }, + { + "epoch": 0.1286983524540671, + "grad_norm": 0.71484375, + "learning_rate": 1.9821093199149806e-05, + "loss": 1.6188, + "step": 746 + }, + { + "epoch": 0.1288708703527991, + "grad_norm": 0.6484375, + "learning_rate": 1.982058027379608e-05, + "loss": 1.5679, + "step": 747 + }, + { + "epoch": 0.12904338825153108, + "grad_norm": 0.6484375, + "learning_rate": 1.9820066620872403e-05, + "loss": 1.582, + "step": 748 + }, + { + "epoch": 0.1292159061502631, + "grad_norm": 0.7109375, + "learning_rate": 1.9819552240416832e-05, + "loss": 1.613, + "step": 749 + }, + { + "epoch": 0.1293884240489951, + "grad_norm": 1.8671875, + "learning_rate": 1.9819037132467478e-05, + "loss": 1.6215, + "step": 750 + }, + { + "epoch": 0.12956094194772708, + "grad_norm": 0.79296875, + "learning_rate": 1.98185212970625e-05, + "loss": 1.5503, + "step": 751 + }, + { + "epoch": 0.12973345984645906, + "grad_norm": 0.9453125, + "learning_rate": 1.9818004734240115e-05, + "loss": 1.6983, + "step": 752 + }, + { + "epoch": 0.12990597774519105, + "grad_norm": 1.0859375, + "learning_rate": 1.9817487444038594e-05, + "loss": 1.5299, + "step": 753 + }, + { + "epoch": 0.13007849564392307, + "grad_norm": 0.90625, + "learning_rate": 1.981696942649626e-05, + "loss": 1.5083, + "step": 754 + }, + { + "epoch": 0.13025101354265506, + "grad_norm": 0.6875, + "learning_rate": 1.9816450681651495e-05, + "loss": 1.5681, + "step": 755 + }, + { + "epoch": 0.13042353144138705, + "grad_norm": 0.96875, + "learning_rate": 1.9815931209542723e-05, + "loss": 1.5735, + "step": 756 + }, + { + "epoch": 0.13059604934011904, + "grad_norm": 0.71484375, + "learning_rate": 1.9815411010208438e-05, + "loss": 1.5884, + "step": 757 + }, + { + "epoch": 0.13076856723885102, + "grad_norm": 0.7109375, + "learning_rate": 1.9814890083687172e-05, + "loss": 1.5198, + "step": 758 + }, + { + "epoch": 0.130941085137583, + "grad_norm": 0.65234375, + "learning_rate": 1.9814368430017526e-05, + "loss": 1.5571, + "step": 759 + }, + { + "epoch": 0.13111360303631503, + "grad_norm": 0.8984375, + "learning_rate": 1.9813846049238143e-05, + "loss": 1.597, + "step": 760 + }, + { + "epoch": 0.13128612093504702, + "grad_norm": 1.0234375, + "learning_rate": 1.981332294138772e-05, + "loss": 1.5261, + "step": 761 + }, + { + "epoch": 0.131458638833779, + "grad_norm": 1.4140625, + "learning_rate": 1.981279910650502e-05, + "loss": 1.711, + "step": 762 + }, + { + "epoch": 0.131631156732511, + "grad_norm": 0.84765625, + "learning_rate": 1.981227454462885e-05, + "loss": 1.6753, + "step": 763 + }, + { + "epoch": 0.13180367463124298, + "grad_norm": 0.74609375, + "learning_rate": 1.9811749255798074e-05, + "loss": 1.559, + "step": 764 + }, + { + "epoch": 0.13197619252997497, + "grad_norm": 0.76953125, + "learning_rate": 1.98112232400516e-05, + "loss": 1.648, + "step": 765 + }, + { + "epoch": 0.132148710428707, + "grad_norm": 0.83984375, + "learning_rate": 1.9810696497428412e-05, + "loss": 1.6339, + "step": 766 + }, + { + "epoch": 0.13232122832743898, + "grad_norm": 0.75390625, + "learning_rate": 1.9810169027967524e-05, + "loss": 1.5987, + "step": 767 + }, + { + "epoch": 0.13249374622617097, + "grad_norm": 0.8671875, + "learning_rate": 1.9809640831708018e-05, + "loss": 1.538, + "step": 768 + }, + { + "epoch": 0.13266626412490296, + "grad_norm": 0.703125, + "learning_rate": 1.9809111908689028e-05, + "loss": 1.4946, + "step": 769 + }, + { + "epoch": 0.13283878202363494, + "grad_norm": 0.83203125, + "learning_rate": 1.9808582258949735e-05, + "loss": 1.574, + "step": 770 + }, + { + "epoch": 0.13301129992236693, + "grad_norm": 0.796875, + "learning_rate": 1.980805188252938e-05, + "loss": 1.529, + "step": 771 + }, + { + "epoch": 0.13318381782109895, + "grad_norm": 0.70703125, + "learning_rate": 1.980752077946726e-05, + "loss": 1.519, + "step": 772 + }, + { + "epoch": 0.13335633571983094, + "grad_norm": 0.84765625, + "learning_rate": 1.9806988949802722e-05, + "loss": 1.6591, + "step": 773 + }, + { + "epoch": 0.13352885361856293, + "grad_norm": 0.70703125, + "learning_rate": 1.9806456393575164e-05, + "loss": 1.5509, + "step": 774 + }, + { + "epoch": 0.13370137151729491, + "grad_norm": 0.83984375, + "learning_rate": 1.980592311082404e-05, + "loss": 1.5898, + "step": 775 + }, + { + "epoch": 0.1338738894160269, + "grad_norm": 0.73046875, + "learning_rate": 1.9805389101588868e-05, + "loss": 1.5729, + "step": 776 + }, + { + "epoch": 0.13404640731475892, + "grad_norm": 0.83203125, + "learning_rate": 1.9804854365909202e-05, + "loss": 1.6551, + "step": 777 + }, + { + "epoch": 0.1342189252134909, + "grad_norm": 1.0390625, + "learning_rate": 1.980431890382466e-05, + "loss": 1.5785, + "step": 778 + }, + { + "epoch": 0.1343914431122229, + "grad_norm": 0.62890625, + "learning_rate": 1.9803782715374912e-05, + "loss": 1.6861, + "step": 779 + }, + { + "epoch": 0.13456396101095489, + "grad_norm": 0.76953125, + "learning_rate": 1.9803245800599685e-05, + "loss": 1.4989, + "step": 780 + }, + { + "epoch": 0.13473647890968687, + "grad_norm": 0.76171875, + "learning_rate": 1.9802708159538755e-05, + "loss": 1.6312, + "step": 781 + }, + { + "epoch": 0.13490899680841886, + "grad_norm": 0.87109375, + "learning_rate": 1.9802169792231955e-05, + "loss": 1.5757, + "step": 782 + }, + { + "epoch": 0.13508151470715088, + "grad_norm": 0.85546875, + "learning_rate": 1.980163069871917e-05, + "loss": 1.649, + "step": 783 + }, + { + "epoch": 0.13525403260588287, + "grad_norm": 0.77734375, + "learning_rate": 1.9801090879040342e-05, + "loss": 1.6765, + "step": 784 + }, + { + "epoch": 0.13542655050461486, + "grad_norm": 0.62890625, + "learning_rate": 1.9800550333235455e-05, + "loss": 1.5596, + "step": 785 + }, + { + "epoch": 0.13559906840334685, + "grad_norm": 1.03125, + "learning_rate": 1.980000906134457e-05, + "loss": 1.5821, + "step": 786 + }, + { + "epoch": 0.13577158630207883, + "grad_norm": 0.8828125, + "learning_rate": 1.9799467063407777e-05, + "loss": 1.4838, + "step": 787 + }, + { + "epoch": 0.13594410420081082, + "grad_norm": 0.62890625, + "learning_rate": 1.9798924339465232e-05, + "loss": 1.6303, + "step": 788 + }, + { + "epoch": 0.13611662209954284, + "grad_norm": 0.65625, + "learning_rate": 1.979838088955715e-05, + "loss": 1.5389, + "step": 789 + }, + { + "epoch": 0.13628913999827483, + "grad_norm": 0.60546875, + "learning_rate": 1.9797836713723786e-05, + "loss": 1.5639, + "step": 790 + }, + { + "epoch": 0.13646165789700682, + "grad_norm": 0.671875, + "learning_rate": 1.9797291812005458e-05, + "loss": 1.5259, + "step": 791 + }, + { + "epoch": 0.1366341757957388, + "grad_norm": 0.62890625, + "learning_rate": 1.9796746184442538e-05, + "loss": 1.6243, + "step": 792 + }, + { + "epoch": 0.1368066936944708, + "grad_norm": 0.78515625, + "learning_rate": 1.9796199831075445e-05, + "loss": 1.5038, + "step": 793 + }, + { + "epoch": 0.13697921159320278, + "grad_norm": 0.66796875, + "learning_rate": 1.9795652751944662e-05, + "loss": 1.5741, + "step": 794 + }, + { + "epoch": 0.1371517294919348, + "grad_norm": 0.6015625, + "learning_rate": 1.9795104947090714e-05, + "loss": 1.5581, + "step": 795 + }, + { + "epoch": 0.1373242473906668, + "grad_norm": 0.6328125, + "learning_rate": 1.9794556416554193e-05, + "loss": 1.539, + "step": 796 + }, + { + "epoch": 0.13749676528939878, + "grad_norm": 0.6484375, + "learning_rate": 1.979400716037573e-05, + "loss": 1.6923, + "step": 797 + }, + { + "epoch": 0.13766928318813076, + "grad_norm": 0.7578125, + "learning_rate": 1.979345717859602e-05, + "loss": 1.5974, + "step": 798 + }, + { + "epoch": 0.13784180108686275, + "grad_norm": 0.7109375, + "learning_rate": 1.9792906471255814e-05, + "loss": 1.6278, + "step": 799 + }, + { + "epoch": 0.13801431898559474, + "grad_norm": 0.75, + "learning_rate": 1.9792355038395906e-05, + "loss": 1.6722, + "step": 800 + }, + { + "epoch": 0.13801431898559474, + "eval_loss": 1.557534098625183, + "eval_runtime": 10.7703, + "eval_samples_per_second": 95.077, + "eval_steps_per_second": 23.769, + "step": 800 + }, + { + "epoch": 0.13818683688432676, + "grad_norm": 0.62890625, + "learning_rate": 1.979180288005715e-05, + "loss": 1.5548, + "step": 801 + }, + { + "epoch": 0.13835935478305875, + "grad_norm": 0.58203125, + "learning_rate": 1.9791249996280456e-05, + "loss": 1.5351, + "step": 802 + }, + { + "epoch": 0.13853187268179074, + "grad_norm": 0.62109375, + "learning_rate": 1.9790696387106782e-05, + "loss": 1.4778, + "step": 803 + }, + { + "epoch": 0.13870439058052272, + "grad_norm": 0.7265625, + "learning_rate": 1.9790142052577148e-05, + "loss": 1.5723, + "step": 804 + }, + { + "epoch": 0.1388769084792547, + "grad_norm": 0.7421875, + "learning_rate": 1.9789586992732615e-05, + "loss": 1.642, + "step": 805 + }, + { + "epoch": 0.13904942637798673, + "grad_norm": 0.75390625, + "learning_rate": 1.9789031207614312e-05, + "loss": 1.5624, + "step": 806 + }, + { + "epoch": 0.13922194427671872, + "grad_norm": 0.7890625, + "learning_rate": 1.978847469726341e-05, + "loss": 1.5436, + "step": 807 + }, + { + "epoch": 0.1393944621754507, + "grad_norm": 0.66015625, + "learning_rate": 1.9787917461721143e-05, + "loss": 1.5336, + "step": 808 + }, + { + "epoch": 0.1395669800741827, + "grad_norm": 0.62109375, + "learning_rate": 1.9787359501028795e-05, + "loss": 1.5797, + "step": 809 + }, + { + "epoch": 0.13973949797291468, + "grad_norm": 0.6328125, + "learning_rate": 1.97868008152277e-05, + "loss": 1.4998, + "step": 810 + }, + { + "epoch": 0.13991201587164667, + "grad_norm": 0.6328125, + "learning_rate": 1.9786241404359247e-05, + "loss": 1.6379, + "step": 811 + }, + { + "epoch": 0.1400845337703787, + "grad_norm": 0.671875, + "learning_rate": 1.978568126846488e-05, + "loss": 1.6053, + "step": 812 + }, + { + "epoch": 0.14025705166911068, + "grad_norm": 0.75390625, + "learning_rate": 1.978512040758611e-05, + "loss": 1.6019, + "step": 813 + }, + { + "epoch": 0.14042956956784267, + "grad_norm": 0.62109375, + "learning_rate": 1.9784558821764476e-05, + "loss": 1.6052, + "step": 814 + }, + { + "epoch": 0.14060208746657465, + "grad_norm": 0.83203125, + "learning_rate": 1.978399651104159e-05, + "loss": 1.6307, + "step": 815 + }, + { + "epoch": 0.14077460536530664, + "grad_norm": 0.6796875, + "learning_rate": 1.9783433475459103e-05, + "loss": 1.5099, + "step": 816 + }, + { + "epoch": 0.14094712326403863, + "grad_norm": 0.85546875, + "learning_rate": 1.9782869715058738e-05, + "loss": 1.5855, + "step": 817 + }, + { + "epoch": 0.14111964116277065, + "grad_norm": 0.59375, + "learning_rate": 1.978230522988226e-05, + "loss": 1.5681, + "step": 818 + }, + { + "epoch": 0.14129215906150264, + "grad_norm": 0.640625, + "learning_rate": 1.9781740019971485e-05, + "loss": 1.6274, + "step": 819 + }, + { + "epoch": 0.14146467696023463, + "grad_norm": 0.70703125, + "learning_rate": 1.9781174085368292e-05, + "loss": 1.6156, + "step": 820 + }, + { + "epoch": 0.14163719485896661, + "grad_norm": 0.67578125, + "learning_rate": 1.9780607426114606e-05, + "loss": 1.6525, + "step": 821 + }, + { + "epoch": 0.1418097127576986, + "grad_norm": 0.67578125, + "learning_rate": 1.9780040042252412e-05, + "loss": 1.4553, + "step": 822 + }, + { + "epoch": 0.1419822306564306, + "grad_norm": 0.66796875, + "learning_rate": 1.977947193382374e-05, + "loss": 1.5795, + "step": 823 + }, + { + "epoch": 0.1421547485551626, + "grad_norm": 0.625, + "learning_rate": 1.9778903100870687e-05, + "loss": 1.5352, + "step": 824 + }, + { + "epoch": 0.1423272664538946, + "grad_norm": 0.70703125, + "learning_rate": 1.9778333543435387e-05, + "loss": 1.4478, + "step": 825 + }, + { + "epoch": 0.14249978435262659, + "grad_norm": 0.609375, + "learning_rate": 1.977776326156004e-05, + "loss": 1.609, + "step": 826 + }, + { + "epoch": 0.14267230225135857, + "grad_norm": 0.6171875, + "learning_rate": 1.9777192255286897e-05, + "loss": 1.6362, + "step": 827 + }, + { + "epoch": 0.14284482015009056, + "grad_norm": 1.3984375, + "learning_rate": 1.977662052465826e-05, + "loss": 1.5537, + "step": 828 + }, + { + "epoch": 0.14301733804882258, + "grad_norm": 0.64453125, + "learning_rate": 1.977604806971649e-05, + "loss": 1.6069, + "step": 829 + }, + { + "epoch": 0.14318985594755457, + "grad_norm": 0.625, + "learning_rate": 1.9775474890503996e-05, + "loss": 1.5911, + "step": 830 + }, + { + "epoch": 0.14336237384628656, + "grad_norm": 0.59765625, + "learning_rate": 1.9774900987063237e-05, + "loss": 1.5657, + "step": 831 + }, + { + "epoch": 0.14353489174501854, + "grad_norm": 0.62890625, + "learning_rate": 1.9774326359436743e-05, + "loss": 1.605, + "step": 832 + }, + { + "epoch": 0.14370740964375053, + "grad_norm": 0.609375, + "learning_rate": 1.9773751007667074e-05, + "loss": 1.5102, + "step": 833 + }, + { + "epoch": 0.14387992754248252, + "grad_norm": 0.56640625, + "learning_rate": 1.9773174931796864e-05, + "loss": 1.5579, + "step": 834 + }, + { + "epoch": 0.14405244544121454, + "grad_norm": 0.6796875, + "learning_rate": 1.977259813186879e-05, + "loss": 1.558, + "step": 835 + }, + { + "epoch": 0.14422496333994653, + "grad_norm": 0.6484375, + "learning_rate": 1.977202060792558e-05, + "loss": 1.5766, + "step": 836 + }, + { + "epoch": 0.14439748123867852, + "grad_norm": 0.62109375, + "learning_rate": 1.977144236001003e-05, + "loss": 1.5227, + "step": 837 + }, + { + "epoch": 0.1445699991374105, + "grad_norm": 0.61328125, + "learning_rate": 1.977086338816497e-05, + "loss": 1.6174, + "step": 838 + }, + { + "epoch": 0.1447425170361425, + "grad_norm": 0.71484375, + "learning_rate": 1.9770283692433306e-05, + "loss": 1.5583, + "step": 839 + }, + { + "epoch": 0.14491503493487448, + "grad_norm": 0.703125, + "learning_rate": 1.9769703272857976e-05, + "loss": 1.4618, + "step": 840 + }, + { + "epoch": 0.1450875528336065, + "grad_norm": 2.84375, + "learning_rate": 1.976912212948198e-05, + "loss": 1.5849, + "step": 841 + }, + { + "epoch": 0.1452600707323385, + "grad_norm": 0.7890625, + "learning_rate": 1.976854026234838e-05, + "loss": 1.5698, + "step": 842 + }, + { + "epoch": 0.14543258863107048, + "grad_norm": 0.6953125, + "learning_rate": 1.9767957671500277e-05, + "loss": 1.5552, + "step": 843 + }, + { + "epoch": 0.14560510652980246, + "grad_norm": 0.7578125, + "learning_rate": 1.9767374356980838e-05, + "loss": 1.5065, + "step": 844 + }, + { + "epoch": 0.14577762442853445, + "grad_norm": 0.65234375, + "learning_rate": 1.976679031883328e-05, + "loss": 1.5852, + "step": 845 + }, + { + "epoch": 0.14595014232726644, + "grad_norm": 0.9765625, + "learning_rate": 1.976620555710087e-05, + "loss": 1.6179, + "step": 846 + }, + { + "epoch": 0.14612266022599846, + "grad_norm": 0.7109375, + "learning_rate": 1.9765620071826928e-05, + "loss": 1.5693, + "step": 847 + }, + { + "epoch": 0.14629517812473045, + "grad_norm": 0.7578125, + "learning_rate": 1.976503386305483e-05, + "loss": 1.4705, + "step": 848 + }, + { + "epoch": 0.14646769602346243, + "grad_norm": 0.72265625, + "learning_rate": 1.9764446930828015e-05, + "loss": 1.5468, + "step": 849 + }, + { + "epoch": 0.14664021392219442, + "grad_norm": 0.65234375, + "learning_rate": 1.9763859275189956e-05, + "loss": 1.4668, + "step": 850 + }, + { + "epoch": 0.1468127318209264, + "grad_norm": 1.234375, + "learning_rate": 1.9763270896184195e-05, + "loss": 1.5784, + "step": 851 + }, + { + "epoch": 0.1469852497196584, + "grad_norm": 0.6953125, + "learning_rate": 1.9762681793854323e-05, + "loss": 1.5303, + "step": 852 + }, + { + "epoch": 0.14715776761839042, + "grad_norm": 0.65234375, + "learning_rate": 1.9762091968243982e-05, + "loss": 1.5035, + "step": 853 + }, + { + "epoch": 0.1473302855171224, + "grad_norm": 0.69140625, + "learning_rate": 1.9761501419396875e-05, + "loss": 1.6431, + "step": 854 + }, + { + "epoch": 0.1475028034158544, + "grad_norm": 0.765625, + "learning_rate": 1.9760910147356743e-05, + "loss": 1.586, + "step": 855 + }, + { + "epoch": 0.14767532131458638, + "grad_norm": 0.65234375, + "learning_rate": 1.9760318152167406e-05, + "loss": 1.5336, + "step": 856 + }, + { + "epoch": 0.14784783921331837, + "grad_norm": 0.765625, + "learning_rate": 1.9759725433872713e-05, + "loss": 1.5582, + "step": 857 + }, + { + "epoch": 0.1480203571120504, + "grad_norm": 0.65625, + "learning_rate": 1.9759131992516575e-05, + "loss": 1.607, + "step": 858 + }, + { + "epoch": 0.14819287501078238, + "grad_norm": 0.94140625, + "learning_rate": 1.9758537828142966e-05, + "loss": 1.5921, + "step": 859 + }, + { + "epoch": 0.14836539290951437, + "grad_norm": 0.6953125, + "learning_rate": 1.9757942940795897e-05, + "loss": 1.5577, + "step": 860 + }, + { + "epoch": 0.14853791080824635, + "grad_norm": 0.69140625, + "learning_rate": 1.975734733051945e-05, + "loss": 1.6636, + "step": 861 + }, + { + "epoch": 0.14871042870697834, + "grad_norm": 0.671875, + "learning_rate": 1.9756750997357738e-05, + "loss": 1.6487, + "step": 862 + }, + { + "epoch": 0.14888294660571033, + "grad_norm": 0.61328125, + "learning_rate": 1.9756153941354955e-05, + "loss": 1.5294, + "step": 863 + }, + { + "epoch": 0.14905546450444235, + "grad_norm": 0.63671875, + "learning_rate": 1.9755556162555323e-05, + "loss": 1.5988, + "step": 864 + }, + { + "epoch": 0.14922798240317434, + "grad_norm": 0.625, + "learning_rate": 1.975495766100314e-05, + "loss": 1.5239, + "step": 865 + }, + { + "epoch": 0.14940050030190632, + "grad_norm": 0.93359375, + "learning_rate": 1.975435843674274e-05, + "loss": 1.6436, + "step": 866 + }, + { + "epoch": 0.1495730182006383, + "grad_norm": 0.62890625, + "learning_rate": 1.975375848981852e-05, + "loss": 1.6203, + "step": 867 + }, + { + "epoch": 0.1497455360993703, + "grad_norm": 0.63671875, + "learning_rate": 1.9753157820274924e-05, + "loss": 1.6343, + "step": 868 + }, + { + "epoch": 0.1499180539981023, + "grad_norm": 0.72265625, + "learning_rate": 1.975255642815646e-05, + "loss": 1.6688, + "step": 869 + }, + { + "epoch": 0.1500905718968343, + "grad_norm": 0.69921875, + "learning_rate": 1.9751954313507674e-05, + "loss": 1.6492, + "step": 870 + }, + { + "epoch": 0.1502630897955663, + "grad_norm": 0.66015625, + "learning_rate": 1.9751351476373184e-05, + "loss": 1.6285, + "step": 871 + }, + { + "epoch": 0.15043560769429828, + "grad_norm": 0.6953125, + "learning_rate": 1.975074791679765e-05, + "loss": 1.6182, + "step": 872 + }, + { + "epoch": 0.15060812559303027, + "grad_norm": 0.66796875, + "learning_rate": 1.9750143634825776e-05, + "loss": 1.6117, + "step": 873 + }, + { + "epoch": 0.15078064349176226, + "grad_norm": 0.75, + "learning_rate": 1.9749538630502346e-05, + "loss": 1.5948, + "step": 874 + }, + { + "epoch": 0.15095316139049425, + "grad_norm": 0.6796875, + "learning_rate": 1.9748932903872176e-05, + "loss": 1.5501, + "step": 875 + }, + { + "epoch": 0.15112567928922627, + "grad_norm": 0.69921875, + "learning_rate": 1.974832645498014e-05, + "loss": 1.5538, + "step": 876 + }, + { + "epoch": 0.15129819718795826, + "grad_norm": 0.671875, + "learning_rate": 1.9747719283871172e-05, + "loss": 1.534, + "step": 877 + }, + { + "epoch": 0.15147071508669024, + "grad_norm": 0.79296875, + "learning_rate": 1.974711139059025e-05, + "loss": 1.5364, + "step": 878 + }, + { + "epoch": 0.15164323298542223, + "grad_norm": 0.625, + "learning_rate": 1.9746502775182415e-05, + "loss": 1.5412, + "step": 879 + }, + { + "epoch": 0.15181575088415422, + "grad_norm": 0.68359375, + "learning_rate": 1.9745893437692757e-05, + "loss": 1.4718, + "step": 880 + }, + { + "epoch": 0.15198826878288624, + "grad_norm": 0.83984375, + "learning_rate": 1.9745283378166417e-05, + "loss": 1.6271, + "step": 881 + }, + { + "epoch": 0.15216078668161823, + "grad_norm": 0.609375, + "learning_rate": 1.9744672596648593e-05, + "loss": 1.6166, + "step": 882 + }, + { + "epoch": 0.15233330458035022, + "grad_norm": 0.7578125, + "learning_rate": 1.9744061093184537e-05, + "loss": 1.5178, + "step": 883 + }, + { + "epoch": 0.1525058224790822, + "grad_norm": 0.75390625, + "learning_rate": 1.974344886781955e-05, + "loss": 1.5554, + "step": 884 + }, + { + "epoch": 0.1526783403778142, + "grad_norm": 0.72265625, + "learning_rate": 1.9742835920598988e-05, + "loss": 1.5596, + "step": 885 + }, + { + "epoch": 0.15285085827654618, + "grad_norm": 0.7578125, + "learning_rate": 1.974222225156827e-05, + "loss": 1.6621, + "step": 886 + }, + { + "epoch": 0.1530233761752782, + "grad_norm": 0.69921875, + "learning_rate": 1.974160786077285e-05, + "loss": 1.6319, + "step": 887 + }, + { + "epoch": 0.15319589407401019, + "grad_norm": 0.70703125, + "learning_rate": 1.9740992748258258e-05, + "loss": 1.5838, + "step": 888 + }, + { + "epoch": 0.15336841197274217, + "grad_norm": 0.79296875, + "learning_rate": 1.9740376914070055e-05, + "loss": 1.5944, + "step": 889 + }, + { + "epoch": 0.15354092987147416, + "grad_norm": 0.73046875, + "learning_rate": 1.9739760358253867e-05, + "loss": 1.6164, + "step": 890 + }, + { + "epoch": 0.15371344777020615, + "grad_norm": 0.66015625, + "learning_rate": 1.973914308085538e-05, + "loss": 1.6087, + "step": 891 + }, + { + "epoch": 0.15388596566893814, + "grad_norm": 0.671875, + "learning_rate": 1.9738525081920316e-05, + "loss": 1.5904, + "step": 892 + }, + { + "epoch": 0.15405848356767016, + "grad_norm": 0.58203125, + "learning_rate": 1.9737906361494467e-05, + "loss": 1.5325, + "step": 893 + }, + { + "epoch": 0.15423100146640215, + "grad_norm": 0.640625, + "learning_rate": 1.973728691962367e-05, + "loss": 1.5605, + "step": 894 + }, + { + "epoch": 0.15440351936513413, + "grad_norm": 0.8359375, + "learning_rate": 1.973666675635382e-05, + "loss": 1.6023, + "step": 895 + }, + { + "epoch": 0.15457603726386612, + "grad_norm": 0.64453125, + "learning_rate": 1.9736045871730854e-05, + "loss": 1.5483, + "step": 896 + }, + { + "epoch": 0.1547485551625981, + "grad_norm": 0.6484375, + "learning_rate": 1.9735424265800775e-05, + "loss": 1.587, + "step": 897 + }, + { + "epoch": 0.1549210730613301, + "grad_norm": 0.73046875, + "learning_rate": 1.973480193860964e-05, + "loss": 1.5754, + "step": 898 + }, + { + "epoch": 0.15509359096006212, + "grad_norm": 0.640625, + "learning_rate": 1.973417889020355e-05, + "loss": 1.6912, + "step": 899 + }, + { + "epoch": 0.1552661088587941, + "grad_norm": 0.96875, + "learning_rate": 1.9733555120628666e-05, + "loss": 1.4927, + "step": 900 + }, + { + "epoch": 0.1552661088587941, + "eval_loss": 1.5473740100860596, + "eval_runtime": 10.7618, + "eval_samples_per_second": 95.151, + "eval_steps_per_second": 23.788, + "step": 900 + }, + { + "epoch": 0.1554386267575261, + "grad_norm": 0.6328125, + "learning_rate": 1.97329306299312e-05, + "loss": 1.4499, + "step": 901 + }, + { + "epoch": 0.15561114465625808, + "grad_norm": 0.734375, + "learning_rate": 1.9732305418157423e-05, + "loss": 1.5901, + "step": 902 + }, + { + "epoch": 0.15578366255499007, + "grad_norm": 0.81640625, + "learning_rate": 1.9731679485353645e-05, + "loss": 1.4815, + "step": 903 + }, + { + "epoch": 0.1559561804537221, + "grad_norm": 0.66015625, + "learning_rate": 1.9731052831566248e-05, + "loss": 1.5613, + "step": 904 + }, + { + "epoch": 0.15612869835245408, + "grad_norm": 0.67578125, + "learning_rate": 1.9730425456841652e-05, + "loss": 1.5202, + "step": 905 + }, + { + "epoch": 0.15630121625118606, + "grad_norm": 0.76171875, + "learning_rate": 1.9729797361226342e-05, + "loss": 1.5046, + "step": 906 + }, + { + "epoch": 0.15647373414991805, + "grad_norm": 0.7265625, + "learning_rate": 1.9729168544766848e-05, + "loss": 1.6204, + "step": 907 + }, + { + "epoch": 0.15664625204865004, + "grad_norm": 0.6328125, + "learning_rate": 1.9728539007509756e-05, + "loss": 1.5183, + "step": 908 + }, + { + "epoch": 0.15681876994738203, + "grad_norm": 0.640625, + "learning_rate": 1.972790874950171e-05, + "loss": 1.4783, + "step": 909 + }, + { + "epoch": 0.15699128784611405, + "grad_norm": 0.5859375, + "learning_rate": 1.9727277770789398e-05, + "loss": 1.5652, + "step": 910 + }, + { + "epoch": 0.15716380574484604, + "grad_norm": 0.6015625, + "learning_rate": 1.9726646071419573e-05, + "loss": 1.4987, + "step": 911 + }, + { + "epoch": 0.15733632364357802, + "grad_norm": 0.71484375, + "learning_rate": 1.9726013651439033e-05, + "loss": 1.5341, + "step": 912 + }, + { + "epoch": 0.15750884154231, + "grad_norm": 0.6484375, + "learning_rate": 1.972538051089463e-05, + "loss": 1.609, + "step": 913 + }, + { + "epoch": 0.157681359441042, + "grad_norm": 0.67578125, + "learning_rate": 1.972474664983327e-05, + "loss": 1.5571, + "step": 914 + }, + { + "epoch": 0.157853877339774, + "grad_norm": 0.71875, + "learning_rate": 1.9724112068301914e-05, + "loss": 1.5965, + "step": 915 + }, + { + "epoch": 0.158026395238506, + "grad_norm": 0.6875, + "learning_rate": 1.972347676634758e-05, + "loss": 1.567, + "step": 916 + }, + { + "epoch": 0.158198913137238, + "grad_norm": 0.6328125, + "learning_rate": 1.9722840744017332e-05, + "loss": 1.5872, + "step": 917 + }, + { + "epoch": 0.15837143103596998, + "grad_norm": 0.625, + "learning_rate": 1.972220400135829e-05, + "loss": 1.5357, + "step": 918 + }, + { + "epoch": 0.15854394893470197, + "grad_norm": 0.61328125, + "learning_rate": 1.9721566538417626e-05, + "loss": 1.6097, + "step": 919 + }, + { + "epoch": 0.15871646683343396, + "grad_norm": 0.76953125, + "learning_rate": 1.972092835524257e-05, + "loss": 1.5192, + "step": 920 + }, + { + "epoch": 0.15888898473216595, + "grad_norm": 0.6953125, + "learning_rate": 1.9720289451880407e-05, + "loss": 1.5389, + "step": 921 + }, + { + "epoch": 0.15906150263089797, + "grad_norm": 0.60546875, + "learning_rate": 1.9719649828378464e-05, + "loss": 1.6057, + "step": 922 + }, + { + "epoch": 0.15923402052962995, + "grad_norm": 0.671875, + "learning_rate": 1.9719009484784125e-05, + "loss": 1.5346, + "step": 923 + }, + { + "epoch": 0.15940653842836194, + "grad_norm": 1.234375, + "learning_rate": 1.9718368421144842e-05, + "loss": 1.6481, + "step": 924 + }, + { + "epoch": 0.15957905632709393, + "grad_norm": 0.703125, + "learning_rate": 1.97177266375081e-05, + "loss": 1.6184, + "step": 925 + }, + { + "epoch": 0.15975157422582592, + "grad_norm": 0.6328125, + "learning_rate": 1.9717084133921456e-05, + "loss": 1.6208, + "step": 926 + }, + { + "epoch": 0.1599240921245579, + "grad_norm": 0.7734375, + "learning_rate": 1.97164409104325e-05, + "loss": 1.6104, + "step": 927 + }, + { + "epoch": 0.16009661002328993, + "grad_norm": 0.6640625, + "learning_rate": 1.9715796967088888e-05, + "loss": 1.6334, + "step": 928 + }, + { + "epoch": 0.16026912792202191, + "grad_norm": 0.859375, + "learning_rate": 1.971515230393833e-05, + "loss": 1.5719, + "step": 929 + }, + { + "epoch": 0.1604416458207539, + "grad_norm": 0.78125, + "learning_rate": 1.971450692102859e-05, + "loss": 1.6079, + "step": 930 + }, + { + "epoch": 0.1606141637194859, + "grad_norm": 0.640625, + "learning_rate": 1.9713860818407474e-05, + "loss": 1.5634, + "step": 931 + }, + { + "epoch": 0.16078668161821788, + "grad_norm": 3.171875, + "learning_rate": 1.9713213996122857e-05, + "loss": 1.5833, + "step": 932 + }, + { + "epoch": 0.1609591995169499, + "grad_norm": 0.97265625, + "learning_rate": 1.9712566454222653e-05, + "loss": 1.5331, + "step": 933 + }, + { + "epoch": 0.16113171741568189, + "grad_norm": 0.6875, + "learning_rate": 1.9711918192754842e-05, + "loss": 1.5762, + "step": 934 + }, + { + "epoch": 0.16130423531441387, + "grad_norm": 1.234375, + "learning_rate": 1.9711269211767446e-05, + "loss": 1.63, + "step": 935 + }, + { + "epoch": 0.16147675321314586, + "grad_norm": 0.96875, + "learning_rate": 1.971061951130855e-05, + "loss": 1.6418, + "step": 936 + }, + { + "epoch": 0.16164927111187785, + "grad_norm": 0.8125, + "learning_rate": 1.9709969091426288e-05, + "loss": 1.5717, + "step": 937 + }, + { + "epoch": 0.16182178901060984, + "grad_norm": 0.76953125, + "learning_rate": 1.970931795216884e-05, + "loss": 1.524, + "step": 938 + }, + { + "epoch": 0.16199430690934186, + "grad_norm": 0.61328125, + "learning_rate": 1.9708666093584456e-05, + "loss": 1.5546, + "step": 939 + }, + { + "epoch": 0.16216682480807385, + "grad_norm": 0.9609375, + "learning_rate": 1.970801351572142e-05, + "loss": 1.6206, + "step": 940 + }, + { + "epoch": 0.16233934270680583, + "grad_norm": 0.9140625, + "learning_rate": 1.970736021862809e-05, + "loss": 1.5482, + "step": 941 + }, + { + "epoch": 0.16251186060553782, + "grad_norm": 0.65234375, + "learning_rate": 1.9706706202352856e-05, + "loss": 1.5676, + "step": 942 + }, + { + "epoch": 0.1626843785042698, + "grad_norm": 0.63671875, + "learning_rate": 1.970605146694418e-05, + "loss": 1.625, + "step": 943 + }, + { + "epoch": 0.1628568964030018, + "grad_norm": 0.67578125, + "learning_rate": 1.9705396012450563e-05, + "loss": 1.5227, + "step": 944 + }, + { + "epoch": 0.16302941430173382, + "grad_norm": 0.6796875, + "learning_rate": 1.9704739838920565e-05, + "loss": 1.6658, + "step": 945 + }, + { + "epoch": 0.1632019322004658, + "grad_norm": 0.62890625, + "learning_rate": 1.9704082946402805e-05, + "loss": 1.5264, + "step": 946 + }, + { + "epoch": 0.1633744500991978, + "grad_norm": 0.62109375, + "learning_rate": 1.9703425334945945e-05, + "loss": 1.5827, + "step": 947 + }, + { + "epoch": 0.16354696799792978, + "grad_norm": 0.6640625, + "learning_rate": 1.9702767004598708e-05, + "loss": 1.5444, + "step": 948 + }, + { + "epoch": 0.16371948589666177, + "grad_norm": 0.71875, + "learning_rate": 1.9702107955409862e-05, + "loss": 1.5957, + "step": 949 + }, + { + "epoch": 0.16389200379539376, + "grad_norm": 0.7890625, + "learning_rate": 1.9701448187428244e-05, + "loss": 1.5006, + "step": 950 + }, + { + "epoch": 0.16406452169412578, + "grad_norm": 0.6171875, + "learning_rate": 1.970078770070272e-05, + "loss": 1.5879, + "step": 951 + }, + { + "epoch": 0.16423703959285776, + "grad_norm": 0.74609375, + "learning_rate": 1.9700126495282234e-05, + "loss": 1.5882, + "step": 952 + }, + { + "epoch": 0.16440955749158975, + "grad_norm": 0.80859375, + "learning_rate": 1.9699464571215765e-05, + "loss": 1.5719, + "step": 953 + }, + { + "epoch": 0.16458207539032174, + "grad_norm": 0.734375, + "learning_rate": 1.9698801928552358e-05, + "loss": 1.5701, + "step": 954 + }, + { + "epoch": 0.16475459328905373, + "grad_norm": 0.64453125, + "learning_rate": 1.96981385673411e-05, + "loss": 1.6497, + "step": 955 + }, + { + "epoch": 0.16492711118778575, + "grad_norm": 0.73828125, + "learning_rate": 1.9697474487631143e-05, + "loss": 1.5415, + "step": 956 + }, + { + "epoch": 0.16509962908651774, + "grad_norm": 0.67578125, + "learning_rate": 1.9696809689471682e-05, + "loss": 1.5721, + "step": 957 + }, + { + "epoch": 0.16527214698524972, + "grad_norm": 0.8515625, + "learning_rate": 1.9696144172911974e-05, + "loss": 1.5413, + "step": 958 + }, + { + "epoch": 0.1654446648839817, + "grad_norm": 0.6796875, + "learning_rate": 1.9695477938001316e-05, + "loss": 1.5264, + "step": 959 + }, + { + "epoch": 0.1656171827827137, + "grad_norm": 0.60546875, + "learning_rate": 1.9694810984789074e-05, + "loss": 1.5174, + "step": 960 + }, + { + "epoch": 0.1657897006814457, + "grad_norm": 2.015625, + "learning_rate": 1.969414331332466e-05, + "loss": 1.5387, + "step": 961 + }, + { + "epoch": 0.1659622185801777, + "grad_norm": 0.77734375, + "learning_rate": 1.9693474923657536e-05, + "loss": 1.5397, + "step": 962 + }, + { + "epoch": 0.1661347364789097, + "grad_norm": 0.671875, + "learning_rate": 1.9692805815837224e-05, + "loss": 1.5345, + "step": 963 + }, + { + "epoch": 0.16630725437764168, + "grad_norm": 0.71484375, + "learning_rate": 1.9692135989913294e-05, + "loss": 1.487, + "step": 964 + }, + { + "epoch": 0.16647977227637367, + "grad_norm": 0.69140625, + "learning_rate": 1.969146544593537e-05, + "loss": 1.6524, + "step": 965 + }, + { + "epoch": 0.16665229017510566, + "grad_norm": 0.62890625, + "learning_rate": 1.9690794183953128e-05, + "loss": 1.667, + "step": 966 + }, + { + "epoch": 0.16682480807383765, + "grad_norm": 0.6640625, + "learning_rate": 1.96901222040163e-05, + "loss": 1.5474, + "step": 967 + }, + { + "epoch": 0.16699732597256967, + "grad_norm": 0.67578125, + "learning_rate": 1.968944950617468e-05, + "loss": 1.5477, + "step": 968 + }, + { + "epoch": 0.16716984387130165, + "grad_norm": 0.89453125, + "learning_rate": 1.9688776090478096e-05, + "loss": 1.5634, + "step": 969 + }, + { + "epoch": 0.16734236177003364, + "grad_norm": 0.6484375, + "learning_rate": 1.9688101956976436e-05, + "loss": 1.5918, + "step": 970 + }, + { + "epoch": 0.16751487966876563, + "grad_norm": 0.75390625, + "learning_rate": 1.968742710571965e-05, + "loss": 1.6563, + "step": 971 + }, + { + "epoch": 0.16768739756749762, + "grad_norm": 0.6015625, + "learning_rate": 1.968675153675774e-05, + "loss": 1.5484, + "step": 972 + }, + { + "epoch": 0.1678599154662296, + "grad_norm": 0.67578125, + "learning_rate": 1.9686075250140745e-05, + "loss": 1.5401, + "step": 973 + }, + { + "epoch": 0.16803243336496163, + "grad_norm": 0.70703125, + "learning_rate": 1.9685398245918778e-05, + "loss": 1.5453, + "step": 974 + }, + { + "epoch": 0.16820495126369361, + "grad_norm": 0.66015625, + "learning_rate": 1.9684720524141988e-05, + "loss": 1.5738, + "step": 975 + }, + { + "epoch": 0.1683774691624256, + "grad_norm": 0.6484375, + "learning_rate": 1.9684042084860594e-05, + "loss": 1.5588, + "step": 976 + }, + { + "epoch": 0.1685499870611576, + "grad_norm": 0.7734375, + "learning_rate": 1.968336292812485e-05, + "loss": 1.5041, + "step": 977 + }, + { + "epoch": 0.16872250495988958, + "grad_norm": 0.65625, + "learning_rate": 1.9682683053985073e-05, + "loss": 1.5881, + "step": 978 + }, + { + "epoch": 0.16889502285862157, + "grad_norm": 0.921875, + "learning_rate": 1.968200246249164e-05, + "loss": 1.5471, + "step": 979 + }, + { + "epoch": 0.16906754075735358, + "grad_norm": 0.68359375, + "learning_rate": 1.9681321153694967e-05, + "loss": 1.5248, + "step": 980 + }, + { + "epoch": 0.16924005865608557, + "grad_norm": 0.921875, + "learning_rate": 1.968063912764553e-05, + "loss": 1.5456, + "step": 981 + }, + { + "epoch": 0.16941257655481756, + "grad_norm": 0.68359375, + "learning_rate": 1.967995638439386e-05, + "loss": 1.56, + "step": 982 + }, + { + "epoch": 0.16958509445354955, + "grad_norm": 0.640625, + "learning_rate": 1.967927292399054e-05, + "loss": 1.5441, + "step": 983 + }, + { + "epoch": 0.16975761235228154, + "grad_norm": 0.72265625, + "learning_rate": 1.9678588746486198e-05, + "loss": 1.5252, + "step": 984 + }, + { + "epoch": 0.16993013025101356, + "grad_norm": 0.640625, + "learning_rate": 1.967790385193153e-05, + "loss": 1.54, + "step": 985 + }, + { + "epoch": 0.17010264814974554, + "grad_norm": 0.6953125, + "learning_rate": 1.9677218240377272e-05, + "loss": 1.6079, + "step": 986 + }, + { + "epoch": 0.17027516604847753, + "grad_norm": 0.60546875, + "learning_rate": 1.9676531911874223e-05, + "loss": 1.5549, + "step": 987 + }, + { + "epoch": 0.17044768394720952, + "grad_norm": 0.6171875, + "learning_rate": 1.967584486647323e-05, + "loss": 1.6047, + "step": 988 + }, + { + "epoch": 0.1706202018459415, + "grad_norm": 0.7734375, + "learning_rate": 1.9675157104225188e-05, + "loss": 1.6402, + "step": 989 + }, + { + "epoch": 0.1707927197446735, + "grad_norm": 0.9140625, + "learning_rate": 1.9674468625181058e-05, + "loss": 1.5565, + "step": 990 + }, + { + "epoch": 0.17096523764340552, + "grad_norm": 0.69921875, + "learning_rate": 1.967377942939184e-05, + "loss": 1.4881, + "step": 991 + }, + { + "epoch": 0.1711377555421375, + "grad_norm": 0.75, + "learning_rate": 1.96730895169086e-05, + "loss": 1.6077, + "step": 992 + }, + { + "epoch": 0.1713102734408695, + "grad_norm": 0.984375, + "learning_rate": 1.9672398887782448e-05, + "loss": 1.486, + "step": 993 + }, + { + "epoch": 0.17148279133960148, + "grad_norm": 0.6328125, + "learning_rate": 1.967170754206455e-05, + "loss": 1.6017, + "step": 994 + }, + { + "epoch": 0.17165530923833347, + "grad_norm": 0.69921875, + "learning_rate": 1.9671015479806126e-05, + "loss": 1.6084, + "step": 995 + }, + { + "epoch": 0.17182782713706546, + "grad_norm": 0.7578125, + "learning_rate": 1.9670322701058447e-05, + "loss": 1.5449, + "step": 996 + }, + { + "epoch": 0.17200034503579748, + "grad_norm": 0.609375, + "learning_rate": 1.966962920587284e-05, + "loss": 1.5863, + "step": 997 + }, + { + "epoch": 0.17217286293452946, + "grad_norm": 0.74609375, + "learning_rate": 1.9668934994300684e-05, + "loss": 1.5316, + "step": 998 + }, + { + "epoch": 0.17234538083326145, + "grad_norm": 0.6875, + "learning_rate": 1.9668240066393406e-05, + "loss": 1.56, + "step": 999 + }, + { + "epoch": 0.17251789873199344, + "grad_norm": 0.6953125, + "learning_rate": 1.9667544422202497e-05, + "loss": 1.5559, + "step": 1000 + }, + { + "epoch": 0.17251789873199344, + "eval_loss": 1.538590908050537, + "eval_runtime": 11.0756, + "eval_samples_per_second": 92.455, + "eval_steps_per_second": 23.114, + "step": 1000 + }, + { + "epoch": 0.17269041663072543, + "grad_norm": 1.046875, + "learning_rate": 1.9666848061779487e-05, + "loss": 1.6472, + "step": 1001 + }, + { + "epoch": 0.17286293452945742, + "grad_norm": 0.77734375, + "learning_rate": 1.966615098517598e-05, + "loss": 1.4592, + "step": 1002 + }, + { + "epoch": 0.17303545242818943, + "grad_norm": 0.79296875, + "learning_rate": 1.9665453192443603e-05, + "loss": 1.5818, + "step": 1003 + }, + { + "epoch": 0.17320797032692142, + "grad_norm": 0.8984375, + "learning_rate": 1.9664754683634064e-05, + "loss": 1.6736, + "step": 1004 + }, + { + "epoch": 0.1733804882256534, + "grad_norm": 0.6953125, + "learning_rate": 1.966405545879911e-05, + "loss": 1.5858, + "step": 1005 + }, + { + "epoch": 0.1735530061243854, + "grad_norm": 0.8203125, + "learning_rate": 1.9663355517990543e-05, + "loss": 1.5809, + "step": 1006 + }, + { + "epoch": 0.1737255240231174, + "grad_norm": 0.7421875, + "learning_rate": 1.966265486126022e-05, + "loss": 1.6794, + "step": 1007 + }, + { + "epoch": 0.1738980419218494, + "grad_norm": 0.63671875, + "learning_rate": 1.966195348866005e-05, + "loss": 1.6081, + "step": 1008 + }, + { + "epoch": 0.1740705598205814, + "grad_norm": 0.71484375, + "learning_rate": 1.9661251400241994e-05, + "loss": 1.572, + "step": 1009 + }, + { + "epoch": 0.17424307771931338, + "grad_norm": 0.7109375, + "learning_rate": 1.9660548596058068e-05, + "loss": 1.4952, + "step": 1010 + }, + { + "epoch": 0.17441559561804537, + "grad_norm": 0.71875, + "learning_rate": 1.9659845076160345e-05, + "loss": 1.6122, + "step": 1011 + }, + { + "epoch": 0.17458811351677736, + "grad_norm": 0.89453125, + "learning_rate": 1.9659140840600934e-05, + "loss": 1.4911, + "step": 1012 + }, + { + "epoch": 0.17476063141550935, + "grad_norm": 0.6875, + "learning_rate": 1.9658435889432022e-05, + "loss": 1.4473, + "step": 1013 + }, + { + "epoch": 0.17493314931424137, + "grad_norm": 0.8515625, + "learning_rate": 1.9657730222705828e-05, + "loss": 1.5437, + "step": 1014 + }, + { + "epoch": 0.17510566721297335, + "grad_norm": 0.94140625, + "learning_rate": 1.9657023840474637e-05, + "loss": 1.5346, + "step": 1015 + }, + { + "epoch": 0.17527818511170534, + "grad_norm": 0.63671875, + "learning_rate": 1.9656316742790778e-05, + "loss": 1.6122, + "step": 1016 + }, + { + "epoch": 0.17545070301043733, + "grad_norm": 0.8828125, + "learning_rate": 1.9655608929706636e-05, + "loss": 1.635, + "step": 1017 + }, + { + "epoch": 0.17562322090916932, + "grad_norm": 1.6328125, + "learning_rate": 1.965490040127466e-05, + "loss": 1.6069, + "step": 1018 + }, + { + "epoch": 0.1757957388079013, + "grad_norm": 0.76171875, + "learning_rate": 1.965419115754733e-05, + "loss": 1.5818, + "step": 1019 + }, + { + "epoch": 0.17596825670663332, + "grad_norm": 0.68359375, + "learning_rate": 1.96534811985772e-05, + "loss": 1.4853, + "step": 1020 + }, + { + "epoch": 0.1761407746053653, + "grad_norm": 0.6796875, + "learning_rate": 1.9652770524416865e-05, + "loss": 1.5027, + "step": 1021 + }, + { + "epoch": 0.1763132925040973, + "grad_norm": 0.70703125, + "learning_rate": 1.9652059135118976e-05, + "loss": 1.5647, + "step": 1022 + }, + { + "epoch": 0.1764858104028293, + "grad_norm": 0.734375, + "learning_rate": 1.965134703073624e-05, + "loss": 1.5769, + "step": 1023 + }, + { + "epoch": 0.17665832830156128, + "grad_norm": 0.54296875, + "learning_rate": 1.9650634211321406e-05, + "loss": 1.4441, + "step": 1024 + }, + { + "epoch": 0.17683084620029327, + "grad_norm": 0.62109375, + "learning_rate": 1.9649920676927292e-05, + "loss": 1.5214, + "step": 1025 + }, + { + "epoch": 0.17700336409902528, + "grad_norm": 0.6328125, + "learning_rate": 1.964920642760676e-05, + "loss": 1.5326, + "step": 1026 + }, + { + "epoch": 0.17717588199775727, + "grad_norm": 0.671875, + "learning_rate": 1.9648491463412724e-05, + "loss": 1.5217, + "step": 1027 + }, + { + "epoch": 0.17734839989648926, + "grad_norm": 0.671875, + "learning_rate": 1.9647775784398154e-05, + "loss": 1.6123, + "step": 1028 + }, + { + "epoch": 0.17752091779522125, + "grad_norm": 0.73828125, + "learning_rate": 1.9647059390616072e-05, + "loss": 1.5351, + "step": 1029 + }, + { + "epoch": 0.17769343569395324, + "grad_norm": 1.171875, + "learning_rate": 1.964634228211955e-05, + "loss": 1.5681, + "step": 1030 + }, + { + "epoch": 0.17786595359268523, + "grad_norm": 0.70703125, + "learning_rate": 1.9645624458961722e-05, + "loss": 1.5436, + "step": 1031 + }, + { + "epoch": 0.17803847149141724, + "grad_norm": 0.94921875, + "learning_rate": 1.9644905921195763e-05, + "loss": 1.6017, + "step": 1032 + }, + { + "epoch": 0.17821098939014923, + "grad_norm": 0.6328125, + "learning_rate": 1.9644186668874914e-05, + "loss": 1.55, + "step": 1033 + }, + { + "epoch": 0.17838350728888122, + "grad_norm": 0.87109375, + "learning_rate": 1.9643466702052453e-05, + "loss": 1.5529, + "step": 1034 + }, + { + "epoch": 0.1785560251876132, + "grad_norm": 0.87109375, + "learning_rate": 1.9642746020781723e-05, + "loss": 1.5194, + "step": 1035 + }, + { + "epoch": 0.1787285430863452, + "grad_norm": 0.87890625, + "learning_rate": 1.9642024625116117e-05, + "loss": 1.6674, + "step": 1036 + }, + { + "epoch": 0.17890106098507721, + "grad_norm": 0.94140625, + "learning_rate": 1.9641302515109084e-05, + "loss": 1.6355, + "step": 1037 + }, + { + "epoch": 0.1790735788838092, + "grad_norm": 0.9609375, + "learning_rate": 1.9640579690814118e-05, + "loss": 1.4246, + "step": 1038 + }, + { + "epoch": 0.1792460967825412, + "grad_norm": 0.66015625, + "learning_rate": 1.9639856152284768e-05, + "loss": 1.5162, + "step": 1039 + }, + { + "epoch": 0.17941861468127318, + "grad_norm": 0.8359375, + "learning_rate": 1.9639131899574643e-05, + "loss": 1.5773, + "step": 1040 + }, + { + "epoch": 0.17959113258000517, + "grad_norm": 0.8125, + "learning_rate": 1.9638406932737402e-05, + "loss": 1.6291, + "step": 1041 + }, + { + "epoch": 0.17976365047873716, + "grad_norm": 0.81640625, + "learning_rate": 1.963768125182675e-05, + "loss": 1.554, + "step": 1042 + }, + { + "epoch": 0.17993616837746917, + "grad_norm": 1.2734375, + "learning_rate": 1.9636954856896452e-05, + "loss": 1.4847, + "step": 1043 + }, + { + "epoch": 0.18010868627620116, + "grad_norm": 0.62109375, + "learning_rate": 1.9636227748000322e-05, + "loss": 1.5396, + "step": 1044 + }, + { + "epoch": 0.18028120417493315, + "grad_norm": 0.625, + "learning_rate": 1.963549992519223e-05, + "loss": 1.5237, + "step": 1045 + }, + { + "epoch": 0.18045372207366514, + "grad_norm": 0.8046875, + "learning_rate": 1.9634771388526103e-05, + "loss": 1.5847, + "step": 1046 + }, + { + "epoch": 0.18062623997239713, + "grad_norm": 0.671875, + "learning_rate": 1.9634042138055905e-05, + "loss": 1.6293, + "step": 1047 + }, + { + "epoch": 0.18079875787112912, + "grad_norm": 0.90625, + "learning_rate": 1.9633312173835674e-05, + "loss": 1.6017, + "step": 1048 + }, + { + "epoch": 0.18097127576986113, + "grad_norm": 0.61328125, + "learning_rate": 1.963258149591948e-05, + "loss": 1.4581, + "step": 1049 + }, + { + "epoch": 0.18114379366859312, + "grad_norm": 0.671875, + "learning_rate": 1.9631850104361467e-05, + "loss": 1.615, + "step": 1050 + }, + { + "epoch": 0.1813163115673251, + "grad_norm": 0.73046875, + "learning_rate": 1.9631117999215812e-05, + "loss": 1.4861, + "step": 1051 + }, + { + "epoch": 0.1814888294660571, + "grad_norm": 0.69140625, + "learning_rate": 1.9630385180536758e-05, + "loss": 1.5275, + "step": 1052 + }, + { + "epoch": 0.1816613473647891, + "grad_norm": 0.6328125, + "learning_rate": 1.96296516483786e-05, + "loss": 1.5503, + "step": 1053 + }, + { + "epoch": 0.18183386526352108, + "grad_norm": 0.69921875, + "learning_rate": 1.9628917402795677e-05, + "loss": 1.5439, + "step": 1054 + }, + { + "epoch": 0.1820063831622531, + "grad_norm": 0.6171875, + "learning_rate": 1.9628182443842388e-05, + "loss": 1.6039, + "step": 1055 + }, + { + "epoch": 0.18217890106098508, + "grad_norm": 0.59765625, + "learning_rate": 1.962744677157318e-05, + "loss": 1.6551, + "step": 1056 + }, + { + "epoch": 0.18235141895971707, + "grad_norm": 0.66015625, + "learning_rate": 1.9626710386042567e-05, + "loss": 1.4362, + "step": 1057 + }, + { + "epoch": 0.18252393685844906, + "grad_norm": 0.609375, + "learning_rate": 1.9625973287305093e-05, + "loss": 1.5739, + "step": 1058 + }, + { + "epoch": 0.18269645475718105, + "grad_norm": 0.72265625, + "learning_rate": 1.9625235475415377e-05, + "loss": 1.5182, + "step": 1059 + }, + { + "epoch": 0.18286897265591306, + "grad_norm": 0.61328125, + "learning_rate": 1.962449695042807e-05, + "loss": 1.4969, + "step": 1060 + }, + { + "epoch": 0.18304149055464505, + "grad_norm": 0.62109375, + "learning_rate": 1.9623757712397896e-05, + "loss": 1.581, + "step": 1061 + }, + { + "epoch": 0.18321400845337704, + "grad_norm": 0.71875, + "learning_rate": 1.962301776137962e-05, + "loss": 1.6426, + "step": 1062 + }, + { + "epoch": 0.18338652635210903, + "grad_norm": 0.5859375, + "learning_rate": 1.9622277097428058e-05, + "loss": 1.5438, + "step": 1063 + }, + { + "epoch": 0.18355904425084102, + "grad_norm": 0.578125, + "learning_rate": 1.9621535720598085e-05, + "loss": 1.5331, + "step": 1064 + }, + { + "epoch": 0.183731562149573, + "grad_norm": 0.609375, + "learning_rate": 1.9620793630944632e-05, + "loss": 1.4752, + "step": 1065 + }, + { + "epoch": 0.18390408004830502, + "grad_norm": 0.671875, + "learning_rate": 1.962005082852267e-05, + "loss": 1.4254, + "step": 1066 + }, + { + "epoch": 0.184076597947037, + "grad_norm": 0.6328125, + "learning_rate": 1.961930731338723e-05, + "loss": 1.5386, + "step": 1067 + }, + { + "epoch": 0.184249115845769, + "grad_norm": 0.66015625, + "learning_rate": 1.961856308559341e-05, + "loss": 1.5059, + "step": 1068 + }, + { + "epoch": 0.184421633744501, + "grad_norm": 0.6875, + "learning_rate": 1.961781814519633e-05, + "loss": 1.5058, + "step": 1069 + }, + { + "epoch": 0.18459415164323298, + "grad_norm": 0.72265625, + "learning_rate": 1.9617072492251187e-05, + "loss": 1.5646, + "step": 1070 + }, + { + "epoch": 0.18476666954196497, + "grad_norm": 0.86328125, + "learning_rate": 1.9616326126813224e-05, + "loss": 1.5363, + "step": 1071 + }, + { + "epoch": 0.18493918744069698, + "grad_norm": 0.60546875, + "learning_rate": 1.961557904893774e-05, + "loss": 1.6104, + "step": 1072 + }, + { + "epoch": 0.18511170533942897, + "grad_norm": 0.66015625, + "learning_rate": 1.9614831258680073e-05, + "loss": 1.5752, + "step": 1073 + }, + { + "epoch": 0.18528422323816096, + "grad_norm": 0.61328125, + "learning_rate": 1.9614082756095632e-05, + "loss": 1.5502, + "step": 1074 + }, + { + "epoch": 0.18545674113689295, + "grad_norm": 0.68359375, + "learning_rate": 1.961333354123987e-05, + "loss": 1.6264, + "step": 1075 + }, + { + "epoch": 0.18562925903562494, + "grad_norm": 0.63671875, + "learning_rate": 1.9612583614168295e-05, + "loss": 1.5407, + "step": 1076 + }, + { + "epoch": 0.18580177693435693, + "grad_norm": 0.75, + "learning_rate": 1.961183297493646e-05, + "loss": 1.5649, + "step": 1077 + }, + { + "epoch": 0.18597429483308894, + "grad_norm": 0.62890625, + "learning_rate": 1.961108162359998e-05, + "loss": 1.4631, + "step": 1078 + }, + { + "epoch": 0.18614681273182093, + "grad_norm": 0.640625, + "learning_rate": 1.9610329560214524e-05, + "loss": 1.5548, + "step": 1079 + }, + { + "epoch": 0.18631933063055292, + "grad_norm": 0.671875, + "learning_rate": 1.9609576784835803e-05, + "loss": 1.52, + "step": 1080 + }, + { + "epoch": 0.1864918485292849, + "grad_norm": 0.6640625, + "learning_rate": 1.960882329751959e-05, + "loss": 1.5762, + "step": 1081 + }, + { + "epoch": 0.1866643664280169, + "grad_norm": 0.60546875, + "learning_rate": 1.9608069098321712e-05, + "loss": 1.4734, + "step": 1082 + }, + { + "epoch": 0.1868368843267489, + "grad_norm": 0.6484375, + "learning_rate": 1.960731418729804e-05, + "loss": 1.4512, + "step": 1083 + }, + { + "epoch": 0.1870094022254809, + "grad_norm": 0.6484375, + "learning_rate": 1.9606558564504503e-05, + "loss": 1.584, + "step": 1084 + }, + { + "epoch": 0.1871819201242129, + "grad_norm": 0.6953125, + "learning_rate": 1.9605802229997086e-05, + "loss": 1.6183, + "step": 1085 + }, + { + "epoch": 0.18735443802294488, + "grad_norm": 0.62109375, + "learning_rate": 1.9605045183831814e-05, + "loss": 1.5524, + "step": 1086 + }, + { + "epoch": 0.18752695592167687, + "grad_norm": 0.7109375, + "learning_rate": 1.960428742606478e-05, + "loss": 1.4852, + "step": 1087 + }, + { + "epoch": 0.18769947382040886, + "grad_norm": 0.73046875, + "learning_rate": 1.9603528956752126e-05, + "loss": 1.5881, + "step": 1088 + }, + { + "epoch": 0.18787199171914087, + "grad_norm": 1.0078125, + "learning_rate": 1.9602769775950044e-05, + "loss": 1.5719, + "step": 1089 + }, + { + "epoch": 0.18804450961787286, + "grad_norm": 0.82421875, + "learning_rate": 1.960200988371477e-05, + "loss": 1.6392, + "step": 1090 + }, + { + "epoch": 0.18821702751660485, + "grad_norm": 0.60546875, + "learning_rate": 1.9601249280102613e-05, + "loss": 1.6041, + "step": 1091 + }, + { + "epoch": 0.18838954541533684, + "grad_norm": 0.9609375, + "learning_rate": 1.9600487965169917e-05, + "loss": 1.5457, + "step": 1092 + }, + { + "epoch": 0.18856206331406883, + "grad_norm": 0.7734375, + "learning_rate": 1.9599725938973085e-05, + "loss": 1.5748, + "step": 1093 + }, + { + "epoch": 0.18873458121280082, + "grad_norm": 0.78125, + "learning_rate": 1.959896320156857e-05, + "loss": 1.5051, + "step": 1094 + }, + { + "epoch": 0.18890709911153283, + "grad_norm": 0.66015625, + "learning_rate": 1.959819975301289e-05, + "loss": 1.5309, + "step": 1095 + }, + { + "epoch": 0.18907961701026482, + "grad_norm": 0.65234375, + "learning_rate": 1.9597435593362597e-05, + "loss": 1.5352, + "step": 1096 + }, + { + "epoch": 0.1892521349089968, + "grad_norm": 0.68359375, + "learning_rate": 1.9596670722674307e-05, + "loss": 1.6632, + "step": 1097 + }, + { + "epoch": 0.1894246528077288, + "grad_norm": 0.64453125, + "learning_rate": 1.9595905141004687e-05, + "loss": 1.5162, + "step": 1098 + }, + { + "epoch": 0.1895971707064608, + "grad_norm": 0.7109375, + "learning_rate": 1.959513884841046e-05, + "loss": 1.4775, + "step": 1099 + }, + { + "epoch": 0.18976968860519278, + "grad_norm": 0.6484375, + "learning_rate": 1.959437184494839e-05, + "loss": 1.4998, + "step": 1100 + }, + { + "epoch": 0.18976968860519278, + "eval_loss": 1.5296015739440918, + "eval_runtime": 10.937, + "eval_samples_per_second": 93.627, + "eval_steps_per_second": 23.407, + "step": 1100 + }, + { + "epoch": 0.1899422065039248, + "grad_norm": 0.76171875, + "learning_rate": 1.9593604130675306e-05, + "loss": 1.5226, + "step": 1101 + }, + { + "epoch": 0.19011472440265678, + "grad_norm": 0.60546875, + "learning_rate": 1.9592835705648087e-05, + "loss": 1.5056, + "step": 1102 + }, + { + "epoch": 0.19028724230138877, + "grad_norm": 0.88671875, + "learning_rate": 1.9592066569923654e-05, + "loss": 1.5678, + "step": 1103 + }, + { + "epoch": 0.19045976020012076, + "grad_norm": 0.5625, + "learning_rate": 1.9591296723559e-05, + "loss": 1.5051, + "step": 1104 + }, + { + "epoch": 0.19063227809885275, + "grad_norm": 1.1640625, + "learning_rate": 1.9590526166611153e-05, + "loss": 1.534, + "step": 1105 + }, + { + "epoch": 0.19080479599758474, + "grad_norm": 0.609375, + "learning_rate": 1.9589754899137207e-05, + "loss": 1.4893, + "step": 1106 + }, + { + "epoch": 0.19097731389631675, + "grad_norm": 0.58203125, + "learning_rate": 1.9588982921194296e-05, + "loss": 1.4873, + "step": 1107 + }, + { + "epoch": 0.19114983179504874, + "grad_norm": 0.58984375, + "learning_rate": 1.9588210232839617e-05, + "loss": 1.5466, + "step": 1108 + }, + { + "epoch": 0.19132234969378073, + "grad_norm": 0.68359375, + "learning_rate": 1.9587436834130413e-05, + "loss": 1.5609, + "step": 1109 + }, + { + "epoch": 0.19149486759251272, + "grad_norm": 0.76953125, + "learning_rate": 1.9586662725123984e-05, + "loss": 1.5532, + "step": 1110 + }, + { + "epoch": 0.1916673854912447, + "grad_norm": 0.734375, + "learning_rate": 1.958588790587768e-05, + "loss": 1.545, + "step": 1111 + }, + { + "epoch": 0.19183990338997672, + "grad_norm": 0.7421875, + "learning_rate": 1.9585112376448902e-05, + "loss": 1.4745, + "step": 1112 + }, + { + "epoch": 0.1920124212887087, + "grad_norm": 1.203125, + "learning_rate": 1.9584336136895114e-05, + "loss": 1.465, + "step": 1113 + }, + { + "epoch": 0.1921849391874407, + "grad_norm": 0.640625, + "learning_rate": 1.9583559187273816e-05, + "loss": 1.4556, + "step": 1114 + }, + { + "epoch": 0.1923574570861727, + "grad_norm": 0.76953125, + "learning_rate": 1.958278152764257e-05, + "loss": 1.5155, + "step": 1115 + }, + { + "epoch": 0.19252997498490468, + "grad_norm": 0.984375, + "learning_rate": 1.9582003158058996e-05, + "loss": 1.5266, + "step": 1116 + }, + { + "epoch": 0.19270249288363667, + "grad_norm": 0.7109375, + "learning_rate": 1.9581224078580755e-05, + "loss": 1.4815, + "step": 1117 + }, + { + "epoch": 0.19287501078236868, + "grad_norm": 0.78515625, + "learning_rate": 1.9580444289265567e-05, + "loss": 1.5429, + "step": 1118 + }, + { + "epoch": 0.19304752868110067, + "grad_norm": 0.68359375, + "learning_rate": 1.957966379017121e-05, + "loss": 1.5509, + "step": 1119 + }, + { + "epoch": 0.19322004657983266, + "grad_norm": 0.625, + "learning_rate": 1.9578882581355497e-05, + "loss": 1.5378, + "step": 1120 + }, + { + "epoch": 0.19339256447856465, + "grad_norm": 0.91015625, + "learning_rate": 1.9578100662876314e-05, + "loss": 1.5299, + "step": 1121 + }, + { + "epoch": 0.19356508237729664, + "grad_norm": 0.6953125, + "learning_rate": 1.9577318034791586e-05, + "loss": 1.5234, + "step": 1122 + }, + { + "epoch": 0.19373760027602863, + "grad_norm": 0.88671875, + "learning_rate": 1.9576534697159298e-05, + "loss": 1.4997, + "step": 1123 + }, + { + "epoch": 0.19391011817476064, + "grad_norm": 0.80859375, + "learning_rate": 1.957575065003748e-05, + "loss": 1.4906, + "step": 1124 + }, + { + "epoch": 0.19408263607349263, + "grad_norm": 0.77734375, + "learning_rate": 1.9574965893484223e-05, + "loss": 1.5698, + "step": 1125 + }, + { + "epoch": 0.19425515397222462, + "grad_norm": 0.59765625, + "learning_rate": 1.9574180427557666e-05, + "loss": 1.4581, + "step": 1126 + }, + { + "epoch": 0.1944276718709566, + "grad_norm": 0.9140625, + "learning_rate": 1.9573394252316e-05, + "loss": 1.5503, + "step": 1127 + }, + { + "epoch": 0.1946001897696886, + "grad_norm": 0.78125, + "learning_rate": 1.957260736781747e-05, + "loss": 1.5962, + "step": 1128 + }, + { + "epoch": 0.19477270766842059, + "grad_norm": 0.70703125, + "learning_rate": 1.9571819774120375e-05, + "loss": 1.5985, + "step": 1129 + }, + { + "epoch": 0.1949452255671526, + "grad_norm": 0.90625, + "learning_rate": 1.957103147128306e-05, + "loss": 1.5408, + "step": 1130 + }, + { + "epoch": 0.1951177434658846, + "grad_norm": 0.83984375, + "learning_rate": 1.9570242459363937e-05, + "loss": 1.6447, + "step": 1131 + }, + { + "epoch": 0.19529026136461658, + "grad_norm": 0.69921875, + "learning_rate": 1.956945273842145e-05, + "loss": 1.5605, + "step": 1132 + }, + { + "epoch": 0.19546277926334857, + "grad_norm": 0.93359375, + "learning_rate": 1.9568662308514116e-05, + "loss": 1.5723, + "step": 1133 + }, + { + "epoch": 0.19563529716208056, + "grad_norm": 0.67578125, + "learning_rate": 1.9567871169700486e-05, + "loss": 1.6163, + "step": 1134 + }, + { + "epoch": 0.19580781506081255, + "grad_norm": 0.75390625, + "learning_rate": 1.956707932203918e-05, + "loss": 1.5854, + "step": 1135 + }, + { + "epoch": 0.19598033295954456, + "grad_norm": 0.82421875, + "learning_rate": 1.9566286765588857e-05, + "loss": 1.4622, + "step": 1136 + }, + { + "epoch": 0.19615285085827655, + "grad_norm": 0.59765625, + "learning_rate": 1.956549350040824e-05, + "loss": 1.5131, + "step": 1137 + }, + { + "epoch": 0.19632536875700854, + "grad_norm": 0.70703125, + "learning_rate": 1.9564699526556093e-05, + "loss": 1.5931, + "step": 1138 + }, + { + "epoch": 0.19649788665574053, + "grad_norm": 0.83203125, + "learning_rate": 1.9563904844091248e-05, + "loss": 1.6076, + "step": 1139 + }, + { + "epoch": 0.19667040455447252, + "grad_norm": 0.5859375, + "learning_rate": 1.956310945307257e-05, + "loss": 1.5771, + "step": 1140 + }, + { + "epoch": 0.19684292245320453, + "grad_norm": 0.7109375, + "learning_rate": 1.9562313353558992e-05, + "loss": 1.5983, + "step": 1141 + }, + { + "epoch": 0.19701544035193652, + "grad_norm": 0.80078125, + "learning_rate": 1.9561516545609493e-05, + "loss": 1.6035, + "step": 1142 + }, + { + "epoch": 0.1971879582506685, + "grad_norm": 0.61328125, + "learning_rate": 1.9560719029283104e-05, + "loss": 1.6043, + "step": 1143 + }, + { + "epoch": 0.1973604761494005, + "grad_norm": 2.359375, + "learning_rate": 1.955992080463891e-05, + "loss": 1.5247, + "step": 1144 + }, + { + "epoch": 0.1975329940481325, + "grad_norm": 3.171875, + "learning_rate": 1.9559121871736055e-05, + "loss": 1.5868, + "step": 1145 + }, + { + "epoch": 0.19770551194686448, + "grad_norm": 1.984375, + "learning_rate": 1.9558322230633718e-05, + "loss": 1.5682, + "step": 1146 + }, + { + "epoch": 0.1978780298455965, + "grad_norm": 0.78515625, + "learning_rate": 1.955752188139115e-05, + "loss": 1.5546, + "step": 1147 + }, + { + "epoch": 0.19805054774432848, + "grad_norm": 0.60546875, + "learning_rate": 1.955672082406764e-05, + "loss": 1.5099, + "step": 1148 + }, + { + "epoch": 0.19822306564306047, + "grad_norm": 0.71875, + "learning_rate": 1.955591905872254e-05, + "loss": 1.5612, + "step": 1149 + }, + { + "epoch": 0.19839558354179246, + "grad_norm": 0.69140625, + "learning_rate": 1.9555116585415247e-05, + "loss": 1.5709, + "step": 1150 + }, + { + "epoch": 0.19856810144052445, + "grad_norm": 0.69140625, + "learning_rate": 1.9554313404205216e-05, + "loss": 1.5448, + "step": 1151 + }, + { + "epoch": 0.19874061933925644, + "grad_norm": 0.609375, + "learning_rate": 1.955350951515195e-05, + "loss": 1.5552, + "step": 1152 + }, + { + "epoch": 0.19891313723798845, + "grad_norm": 1.015625, + "learning_rate": 1.9552704918315006e-05, + "loss": 1.4922, + "step": 1153 + }, + { + "epoch": 0.19908565513672044, + "grad_norm": 0.67578125, + "learning_rate": 1.9551899613753994e-05, + "loss": 1.5521, + "step": 1154 + }, + { + "epoch": 0.19925817303545243, + "grad_norm": 0.66796875, + "learning_rate": 1.9551093601528573e-05, + "loss": 1.5857, + "step": 1155 + }, + { + "epoch": 0.19943069093418442, + "grad_norm": 0.73046875, + "learning_rate": 1.955028688169846e-05, + "loss": 1.586, + "step": 1156 + }, + { + "epoch": 0.1996032088329164, + "grad_norm": 0.765625, + "learning_rate": 1.9549479454323423e-05, + "loss": 1.5905, + "step": 1157 + }, + { + "epoch": 0.1997757267316484, + "grad_norm": 0.71484375, + "learning_rate": 1.954867131946328e-05, + "loss": 1.5615, + "step": 1158 + }, + { + "epoch": 0.1999482446303804, + "grad_norm": 0.8203125, + "learning_rate": 1.9547862477177904e-05, + "loss": 1.4858, + "step": 1159 + }, + { + "epoch": 0.2001207625291124, + "grad_norm": 0.71875, + "learning_rate": 1.9547052927527217e-05, + "loss": 1.5963, + "step": 1160 + }, + { + "epoch": 0.2002932804278444, + "grad_norm": 0.66015625, + "learning_rate": 1.9546242670571198e-05, + "loss": 1.535, + "step": 1161 + }, + { + "epoch": 0.20046579832657638, + "grad_norm": 0.89453125, + "learning_rate": 1.954543170636987e-05, + "loss": 1.5349, + "step": 1162 + }, + { + "epoch": 0.20063831622530837, + "grad_norm": 0.625, + "learning_rate": 1.9544620034983322e-05, + "loss": 1.5516, + "step": 1163 + }, + { + "epoch": 0.20081083412404038, + "grad_norm": 0.6015625, + "learning_rate": 1.9543807656471683e-05, + "loss": 1.5135, + "step": 1164 + }, + { + "epoch": 0.20098335202277237, + "grad_norm": 1.03125, + "learning_rate": 1.954299457089514e-05, + "loss": 1.6499, + "step": 1165 + }, + { + "epoch": 0.20115586992150436, + "grad_norm": 0.72265625, + "learning_rate": 1.9542180778313936e-05, + "loss": 1.6134, + "step": 1166 + }, + { + "epoch": 0.20132838782023635, + "grad_norm": 0.5625, + "learning_rate": 1.954136627878835e-05, + "loss": 1.5139, + "step": 1167 + }, + { + "epoch": 0.20150090571896834, + "grad_norm": 0.92578125, + "learning_rate": 1.9540551072378738e-05, + "loss": 1.5382, + "step": 1168 + }, + { + "epoch": 0.20167342361770033, + "grad_norm": 0.5859375, + "learning_rate": 1.953973515914549e-05, + "loss": 1.4794, + "step": 1169 + }, + { + "epoch": 0.20184594151643234, + "grad_norm": 0.80078125, + "learning_rate": 1.9538918539149054e-05, + "loss": 1.4757, + "step": 1170 + }, + { + "epoch": 0.20201845941516433, + "grad_norm": 0.83203125, + "learning_rate": 1.9538101212449932e-05, + "loss": 1.5418, + "step": 1171 + }, + { + "epoch": 0.20219097731389632, + "grad_norm": 0.80078125, + "learning_rate": 1.953728317910867e-05, + "loss": 1.6522, + "step": 1172 + }, + { + "epoch": 0.2023634952126283, + "grad_norm": 0.640625, + "learning_rate": 1.953646443918589e-05, + "loss": 1.5378, + "step": 1173 + }, + { + "epoch": 0.2025360131113603, + "grad_norm": 0.69921875, + "learning_rate": 1.9535644992742225e-05, + "loss": 1.578, + "step": 1174 + }, + { + "epoch": 0.20270853101009229, + "grad_norm": 0.625, + "learning_rate": 1.9534824839838406e-05, + "loss": 1.5171, + "step": 1175 + }, + { + "epoch": 0.2028810489088243, + "grad_norm": 0.609375, + "learning_rate": 1.953400398053518e-05, + "loss": 1.6417, + "step": 1176 + }, + { + "epoch": 0.2030535668075563, + "grad_norm": 0.5703125, + "learning_rate": 1.953318241489337e-05, + "loss": 1.4812, + "step": 1177 + }, + { + "epoch": 0.20322608470628828, + "grad_norm": 0.65234375, + "learning_rate": 1.9532360142973842e-05, + "loss": 1.6705, + "step": 1178 + }, + { + "epoch": 0.20339860260502027, + "grad_norm": 0.71875, + "learning_rate": 1.9531537164837516e-05, + "loss": 1.4646, + "step": 1179 + }, + { + "epoch": 0.20357112050375226, + "grad_norm": 0.6875, + "learning_rate": 1.9530713480545357e-05, + "loss": 1.6521, + "step": 1180 + }, + { + "epoch": 0.20374363840248425, + "grad_norm": 0.68359375, + "learning_rate": 1.9529889090158394e-05, + "loss": 1.5087, + "step": 1181 + }, + { + "epoch": 0.20391615630121626, + "grad_norm": 0.61328125, + "learning_rate": 1.9529063993737703e-05, + "loss": 1.5173, + "step": 1182 + }, + { + "epoch": 0.20408867419994825, + "grad_norm": 0.578125, + "learning_rate": 1.952823819134441e-05, + "loss": 1.5581, + "step": 1183 + }, + { + "epoch": 0.20426119209868024, + "grad_norm": 0.69921875, + "learning_rate": 1.9527411683039697e-05, + "loss": 1.5374, + "step": 1184 + }, + { + "epoch": 0.20443370999741223, + "grad_norm": 0.640625, + "learning_rate": 1.952658446888479e-05, + "loss": 1.523, + "step": 1185 + }, + { + "epoch": 0.20460622789614422, + "grad_norm": 0.79296875, + "learning_rate": 1.952575654894099e-05, + "loss": 1.3737, + "step": 1186 + }, + { + "epoch": 0.2047787457948762, + "grad_norm": 0.64453125, + "learning_rate": 1.9524927923269623e-05, + "loss": 1.5357, + "step": 1187 + }, + { + "epoch": 0.20495126369360822, + "grad_norm": 0.6953125, + "learning_rate": 1.9524098591932078e-05, + "loss": 1.5545, + "step": 1188 + }, + { + "epoch": 0.2051237815923402, + "grad_norm": 0.67578125, + "learning_rate": 1.9523268554989806e-05, + "loss": 1.6352, + "step": 1189 + }, + { + "epoch": 0.2052962994910722, + "grad_norm": 0.6171875, + "learning_rate": 1.9522437812504285e-05, + "loss": 1.5834, + "step": 1190 + }, + { + "epoch": 0.2054688173898042, + "grad_norm": 0.71484375, + "learning_rate": 1.952160636453708e-05, + "loss": 1.5966, + "step": 1191 + }, + { + "epoch": 0.20564133528853618, + "grad_norm": 0.62890625, + "learning_rate": 1.9520774211149783e-05, + "loss": 1.4937, + "step": 1192 + }, + { + "epoch": 0.2058138531872682, + "grad_norm": 0.61328125, + "learning_rate": 1.951994135240404e-05, + "loss": 1.4849, + "step": 1193 + }, + { + "epoch": 0.20598637108600018, + "grad_norm": 0.5703125, + "learning_rate": 1.9519107788361557e-05, + "loss": 1.5178, + "step": 1194 + }, + { + "epoch": 0.20615888898473217, + "grad_norm": 0.69140625, + "learning_rate": 1.9518273519084093e-05, + "loss": 1.4902, + "step": 1195 + }, + { + "epoch": 0.20633140688346416, + "grad_norm": 0.67578125, + "learning_rate": 1.9517438544633455e-05, + "loss": 1.5649, + "step": 1196 + }, + { + "epoch": 0.20650392478219615, + "grad_norm": 0.7578125, + "learning_rate": 1.95166028650715e-05, + "loss": 1.5275, + "step": 1197 + }, + { + "epoch": 0.20667644268092814, + "grad_norm": 0.578125, + "learning_rate": 1.951576648046014e-05, + "loss": 1.4538, + "step": 1198 + }, + { + "epoch": 0.20684896057966015, + "grad_norm": 0.58203125, + "learning_rate": 1.951492939086135e-05, + "loss": 1.5145, + "step": 1199 + }, + { + "epoch": 0.20702147847839214, + "grad_norm": 0.671875, + "learning_rate": 1.951409159633713e-05, + "loss": 1.3968, + "step": 1200 + }, + { + "epoch": 0.20702147847839214, + "eval_loss": 1.5226820707321167, + "eval_runtime": 10.8512, + "eval_samples_per_second": 94.368, + "eval_steps_per_second": 23.592, + "step": 1200 + }, + { + "epoch": 0.20719399637712413, + "grad_norm": 0.6484375, + "learning_rate": 1.951325309694956e-05, + "loss": 1.5766, + "step": 1201 + }, + { + "epoch": 0.20736651427585612, + "grad_norm": 0.62109375, + "learning_rate": 1.951241389276076e-05, + "loss": 1.4671, + "step": 1202 + }, + { + "epoch": 0.2075390321745881, + "grad_norm": 0.72265625, + "learning_rate": 1.9511573983832903e-05, + "loss": 1.4517, + "step": 1203 + }, + { + "epoch": 0.2077115500733201, + "grad_norm": 4.59375, + "learning_rate": 1.9510733370228214e-05, + "loss": 1.5248, + "step": 1204 + }, + { + "epoch": 0.2078840679720521, + "grad_norm": 0.76953125, + "learning_rate": 1.950989205200897e-05, + "loss": 1.5138, + "step": 1205 + }, + { + "epoch": 0.2080565858707841, + "grad_norm": 0.6640625, + "learning_rate": 1.9509050029237506e-05, + "loss": 1.5073, + "step": 1206 + }, + { + "epoch": 0.2082291037695161, + "grad_norm": 0.578125, + "learning_rate": 1.9508207301976197e-05, + "loss": 1.5608, + "step": 1207 + }, + { + "epoch": 0.20840162166824808, + "grad_norm": 0.859375, + "learning_rate": 1.9507363870287482e-05, + "loss": 1.5454, + "step": 1208 + }, + { + "epoch": 0.20857413956698007, + "grad_norm": 0.65234375, + "learning_rate": 1.950651973423385e-05, + "loss": 1.4313, + "step": 1209 + }, + { + "epoch": 0.20874665746571205, + "grad_norm": 0.73828125, + "learning_rate": 1.950567489387783e-05, + "loss": 1.5429, + "step": 1210 + }, + { + "epoch": 0.20891917536444407, + "grad_norm": 0.60546875, + "learning_rate": 1.950482934928202e-05, + "loss": 1.4819, + "step": 1211 + }, + { + "epoch": 0.20909169326317606, + "grad_norm": 0.671875, + "learning_rate": 1.9503983100509067e-05, + "loss": 1.5763, + "step": 1212 + }, + { + "epoch": 0.20926421116190805, + "grad_norm": 0.6640625, + "learning_rate": 1.9503136147621662e-05, + "loss": 1.4882, + "step": 1213 + }, + { + "epoch": 0.20943672906064004, + "grad_norm": 0.6796875, + "learning_rate": 1.9502288490682553e-05, + "loss": 1.5377, + "step": 1214 + }, + { + "epoch": 0.20960924695937203, + "grad_norm": 0.625, + "learning_rate": 1.950144012975454e-05, + "loss": 1.5165, + "step": 1215 + }, + { + "epoch": 0.20978176485810404, + "grad_norm": 0.6875, + "learning_rate": 1.950059106490047e-05, + "loss": 1.5467, + "step": 1216 + }, + { + "epoch": 0.20995428275683603, + "grad_norm": 0.7109375, + "learning_rate": 1.9499741296183255e-05, + "loss": 1.6396, + "step": 1217 + }, + { + "epoch": 0.21012680065556802, + "grad_norm": 0.6328125, + "learning_rate": 1.9498890823665846e-05, + "loss": 1.6346, + "step": 1218 + }, + { + "epoch": 0.2102993185543, + "grad_norm": 0.6015625, + "learning_rate": 1.9498039647411255e-05, + "loss": 1.592, + "step": 1219 + }, + { + "epoch": 0.210471836453032, + "grad_norm": 0.69921875, + "learning_rate": 1.9497187767482538e-05, + "loss": 1.5075, + "step": 1220 + }, + { + "epoch": 0.21064435435176398, + "grad_norm": 0.61328125, + "learning_rate": 1.9496335183942814e-05, + "loss": 1.5514, + "step": 1221 + }, + { + "epoch": 0.210816872250496, + "grad_norm": 0.83984375, + "learning_rate": 1.949548189685524e-05, + "loss": 1.5402, + "step": 1222 + }, + { + "epoch": 0.210989390149228, + "grad_norm": 0.65234375, + "learning_rate": 1.949462790628304e-05, + "loss": 1.5735, + "step": 1223 + }, + { + "epoch": 0.21116190804795998, + "grad_norm": 0.6953125, + "learning_rate": 1.9493773212289475e-05, + "loss": 1.5145, + "step": 1224 + }, + { + "epoch": 0.21133442594669197, + "grad_norm": 0.58984375, + "learning_rate": 1.9492917814937874e-05, + "loss": 1.4504, + "step": 1225 + }, + { + "epoch": 0.21150694384542396, + "grad_norm": 0.59765625, + "learning_rate": 1.9492061714291605e-05, + "loss": 1.4625, + "step": 1226 + }, + { + "epoch": 0.21167946174415594, + "grad_norm": 0.6171875, + "learning_rate": 1.9491204910414097e-05, + "loss": 1.4586, + "step": 1227 + }, + { + "epoch": 0.21185197964288796, + "grad_norm": 0.58984375, + "learning_rate": 1.9490347403368823e-05, + "loss": 1.4972, + "step": 1228 + }, + { + "epoch": 0.21202449754161995, + "grad_norm": 0.69921875, + "learning_rate": 1.948948919321932e-05, + "loss": 1.5251, + "step": 1229 + }, + { + "epoch": 0.21219701544035194, + "grad_norm": 0.6875, + "learning_rate": 1.9488630280029154e-05, + "loss": 1.5978, + "step": 1230 + }, + { + "epoch": 0.21236953333908393, + "grad_norm": 0.625, + "learning_rate": 1.948777066386198e-05, + "loss": 1.5903, + "step": 1231 + }, + { + "epoch": 0.21254205123781592, + "grad_norm": 0.73828125, + "learning_rate": 1.9486910344781467e-05, + "loss": 1.5965, + "step": 1232 + }, + { + "epoch": 0.2127145691365479, + "grad_norm": 0.73046875, + "learning_rate": 1.9486049322851358e-05, + "loss": 1.5626, + "step": 1233 + }, + { + "epoch": 0.21288708703527992, + "grad_norm": 0.66015625, + "learning_rate": 1.9485187598135445e-05, + "loss": 1.5257, + "step": 1234 + }, + { + "epoch": 0.2130596049340119, + "grad_norm": 0.75390625, + "learning_rate": 1.9484325170697574e-05, + "loss": 1.5239, + "step": 1235 + }, + { + "epoch": 0.2132321228327439, + "grad_norm": 0.7578125, + "learning_rate": 1.9483462040601627e-05, + "loss": 1.5665, + "step": 1236 + }, + { + "epoch": 0.2134046407314759, + "grad_norm": 0.99609375, + "learning_rate": 1.948259820791156e-05, + "loss": 1.4892, + "step": 1237 + }, + { + "epoch": 0.21357715863020788, + "grad_norm": 0.6328125, + "learning_rate": 1.9481733672691365e-05, + "loss": 1.5117, + "step": 1238 + }, + { + "epoch": 0.21374967652893986, + "grad_norm": 0.88671875, + "learning_rate": 1.9480868435005098e-05, + "loss": 1.6343, + "step": 1239 + }, + { + "epoch": 0.21392219442767188, + "grad_norm": 0.65234375, + "learning_rate": 1.9480002494916857e-05, + "loss": 1.5402, + "step": 1240 + }, + { + "epoch": 0.21409471232640387, + "grad_norm": 0.8359375, + "learning_rate": 1.9479135852490794e-05, + "loss": 1.4328, + "step": 1241 + }, + { + "epoch": 0.21426723022513586, + "grad_norm": 1.1328125, + "learning_rate": 1.9478268507791124e-05, + "loss": 1.6286, + "step": 1242 + }, + { + "epoch": 0.21443974812386785, + "grad_norm": 0.8046875, + "learning_rate": 1.9477400460882096e-05, + "loss": 1.5382, + "step": 1243 + }, + { + "epoch": 0.21461226602259983, + "grad_norm": 0.859375, + "learning_rate": 1.9476531711828027e-05, + "loss": 1.5822, + "step": 1244 + }, + { + "epoch": 0.21478478392133185, + "grad_norm": 0.58203125, + "learning_rate": 1.9475662260693275e-05, + "loss": 1.5945, + "step": 1245 + }, + { + "epoch": 0.21495730182006384, + "grad_norm": 0.59375, + "learning_rate": 1.9474792107542258e-05, + "loss": 1.552, + "step": 1246 + }, + { + "epoch": 0.21512981971879583, + "grad_norm": 0.7578125, + "learning_rate": 1.9473921252439438e-05, + "loss": 1.5303, + "step": 1247 + }, + { + "epoch": 0.21530233761752782, + "grad_norm": 0.68359375, + "learning_rate": 1.947304969544934e-05, + "loss": 1.4816, + "step": 1248 + }, + { + "epoch": 0.2154748555162598, + "grad_norm": 0.68359375, + "learning_rate": 1.9472177436636523e-05, + "loss": 1.5002, + "step": 1249 + }, + { + "epoch": 0.2156473734149918, + "grad_norm": 0.640625, + "learning_rate": 1.9471304476065624e-05, + "loss": 1.5424, + "step": 1250 + }, + { + "epoch": 0.2158198913137238, + "grad_norm": 3.140625, + "learning_rate": 1.94704308138013e-05, + "loss": 1.5329, + "step": 1251 + }, + { + "epoch": 0.2159924092124558, + "grad_norm": 0.6328125, + "learning_rate": 1.94695564499083e-05, + "loss": 1.5575, + "step": 1252 + }, + { + "epoch": 0.2161649271111878, + "grad_norm": 0.64453125, + "learning_rate": 1.946868138445138e-05, + "loss": 1.5801, + "step": 1253 + }, + { + "epoch": 0.21633744500991978, + "grad_norm": 0.734375, + "learning_rate": 1.9467805617495384e-05, + "loss": 1.5072, + "step": 1254 + }, + { + "epoch": 0.21650996290865177, + "grad_norm": 0.91015625, + "learning_rate": 1.9466929149105193e-05, + "loss": 1.5689, + "step": 1255 + }, + { + "epoch": 0.21668248080738375, + "grad_norm": 0.60546875, + "learning_rate": 1.9466051979345734e-05, + "loss": 1.5702, + "step": 1256 + }, + { + "epoch": 0.21685499870611577, + "grad_norm": 0.76953125, + "learning_rate": 1.9465174108281995e-05, + "loss": 1.5755, + "step": 1257 + }, + { + "epoch": 0.21702751660484776, + "grad_norm": 0.7421875, + "learning_rate": 1.9464295535979022e-05, + "loss": 1.4259, + "step": 1258 + }, + { + "epoch": 0.21720003450357975, + "grad_norm": 0.6796875, + "learning_rate": 1.9463416262501898e-05, + "loss": 1.4476, + "step": 1259 + }, + { + "epoch": 0.21737255240231174, + "grad_norm": 0.59765625, + "learning_rate": 1.9462536287915773e-05, + "loss": 1.5078, + "step": 1260 + }, + { + "epoch": 0.21754507030104372, + "grad_norm": 0.6328125, + "learning_rate": 1.9461655612285827e-05, + "loss": 1.5395, + "step": 1261 + }, + { + "epoch": 0.2177175881997757, + "grad_norm": 0.67578125, + "learning_rate": 1.946077423567732e-05, + "loss": 1.4661, + "step": 1262 + }, + { + "epoch": 0.21789010609850773, + "grad_norm": 0.640625, + "learning_rate": 1.945989215815554e-05, + "loss": 1.55, + "step": 1263 + }, + { + "epoch": 0.21806262399723972, + "grad_norm": 0.59375, + "learning_rate": 1.9459009379785842e-05, + "loss": 1.5413, + "step": 1264 + }, + { + "epoch": 0.2182351418959717, + "grad_norm": 0.62109375, + "learning_rate": 1.9458125900633627e-05, + "loss": 1.4655, + "step": 1265 + }, + { + "epoch": 0.2184076597947037, + "grad_norm": 0.66015625, + "learning_rate": 1.945724172076435e-05, + "loss": 1.608, + "step": 1266 + }, + { + "epoch": 0.21858017769343568, + "grad_norm": 0.63671875, + "learning_rate": 1.945635684024351e-05, + "loss": 1.5109, + "step": 1267 + }, + { + "epoch": 0.2187526955921677, + "grad_norm": 0.59765625, + "learning_rate": 1.945547125913667e-05, + "loss": 1.4214, + "step": 1268 + }, + { + "epoch": 0.2189252134908997, + "grad_norm": 0.61328125, + "learning_rate": 1.9454584977509443e-05, + "loss": 1.5579, + "step": 1269 + }, + { + "epoch": 0.21909773138963168, + "grad_norm": 0.6015625, + "learning_rate": 1.9453697995427483e-05, + "loss": 1.5788, + "step": 1270 + }, + { + "epoch": 0.21927024928836367, + "grad_norm": 5.40625, + "learning_rate": 1.9452810312956507e-05, + "loss": 1.6086, + "step": 1271 + }, + { + "epoch": 0.21944276718709566, + "grad_norm": 0.61328125, + "learning_rate": 1.945192193016228e-05, + "loss": 1.5086, + "step": 1272 + }, + { + "epoch": 0.21961528508582764, + "grad_norm": 0.69140625, + "learning_rate": 1.9451032847110615e-05, + "loss": 1.6391, + "step": 1273 + }, + { + "epoch": 0.21978780298455966, + "grad_norm": 0.609375, + "learning_rate": 1.9450143063867385e-05, + "loss": 1.5566, + "step": 1274 + }, + { + "epoch": 0.21996032088329165, + "grad_norm": 0.66015625, + "learning_rate": 1.944925258049851e-05, + "loss": 1.6582, + "step": 1275 + }, + { + "epoch": 0.22013283878202364, + "grad_norm": 0.60546875, + "learning_rate": 1.9448361397069962e-05, + "loss": 1.5291, + "step": 1276 + }, + { + "epoch": 0.22030535668075563, + "grad_norm": 0.6328125, + "learning_rate": 1.9447469513647767e-05, + "loss": 1.6006, + "step": 1277 + }, + { + "epoch": 0.22047787457948761, + "grad_norm": 0.6328125, + "learning_rate": 1.9446576930298e-05, + "loss": 1.4931, + "step": 1278 + }, + { + "epoch": 0.2206503924782196, + "grad_norm": 0.66015625, + "learning_rate": 1.9445683647086788e-05, + "loss": 1.4638, + "step": 1279 + }, + { + "epoch": 0.22082291037695162, + "grad_norm": 0.58203125, + "learning_rate": 1.944478966408031e-05, + "loss": 1.5128, + "step": 1280 + }, + { + "epoch": 0.2209954282756836, + "grad_norm": 0.62109375, + "learning_rate": 1.9443894981344802e-05, + "loss": 1.5028, + "step": 1281 + }, + { + "epoch": 0.2211679461744156, + "grad_norm": 0.6171875, + "learning_rate": 1.9442999598946545e-05, + "loss": 1.5222, + "step": 1282 + }, + { + "epoch": 0.22134046407314759, + "grad_norm": 0.63671875, + "learning_rate": 1.9442103516951878e-05, + "loss": 1.5544, + "step": 1283 + }, + { + "epoch": 0.22151298197187957, + "grad_norm": 0.6796875, + "learning_rate": 1.944120673542718e-05, + "loss": 1.6135, + "step": 1284 + }, + { + "epoch": 0.22168549987061156, + "grad_norm": 0.65625, + "learning_rate": 1.94403092544389e-05, + "loss": 1.5499, + "step": 1285 + }, + { + "epoch": 0.22185801776934358, + "grad_norm": 0.67578125, + "learning_rate": 1.9439411074053525e-05, + "loss": 1.6011, + "step": 1286 + }, + { + "epoch": 0.22203053566807557, + "grad_norm": 0.5625, + "learning_rate": 1.9438512194337595e-05, + "loss": 1.4681, + "step": 1287 + }, + { + "epoch": 0.22220305356680756, + "grad_norm": 0.58984375, + "learning_rate": 1.9437612615357708e-05, + "loss": 1.4965, + "step": 1288 + }, + { + "epoch": 0.22237557146553955, + "grad_norm": 0.64453125, + "learning_rate": 1.943671233718051e-05, + "loss": 1.5009, + "step": 1289 + }, + { + "epoch": 0.22254808936427153, + "grad_norm": 0.6796875, + "learning_rate": 1.9435811359872696e-05, + "loss": 1.4423, + "step": 1290 + }, + { + "epoch": 0.22272060726300355, + "grad_norm": 0.84765625, + "learning_rate": 1.9434909683501023e-05, + "loss": 1.5931, + "step": 1291 + }, + { + "epoch": 0.22289312516173554, + "grad_norm": 0.73828125, + "learning_rate": 1.9434007308132287e-05, + "loss": 1.536, + "step": 1292 + }, + { + "epoch": 0.22306564306046753, + "grad_norm": 0.73046875, + "learning_rate": 1.943310423383334e-05, + "loss": 1.4643, + "step": 1293 + }, + { + "epoch": 0.22323816095919952, + "grad_norm": 0.80859375, + "learning_rate": 1.9432200460671096e-05, + "loss": 1.5385, + "step": 1294 + }, + { + "epoch": 0.2234106788579315, + "grad_norm": 0.6875, + "learning_rate": 1.9431295988712504e-05, + "loss": 1.6433, + "step": 1295 + }, + { + "epoch": 0.2235831967566635, + "grad_norm": 0.58203125, + "learning_rate": 1.9430390818024575e-05, + "loss": 1.5004, + "step": 1296 + }, + { + "epoch": 0.2237557146553955, + "grad_norm": 0.68359375, + "learning_rate": 1.9429484948674374e-05, + "loss": 1.5267, + "step": 1297 + }, + { + "epoch": 0.2239282325541275, + "grad_norm": 0.73046875, + "learning_rate": 1.9428578380729006e-05, + "loss": 1.562, + "step": 1298 + }, + { + "epoch": 0.2241007504528595, + "grad_norm": 0.6484375, + "learning_rate": 1.942767111425564e-05, + "loss": 1.5171, + "step": 1299 + }, + { + "epoch": 0.22427326835159148, + "grad_norm": 0.67578125, + "learning_rate": 1.942676314932149e-05, + "loss": 1.4994, + "step": 1300 + }, + { + "epoch": 0.22427326835159148, + "eval_loss": 1.515897512435913, + "eval_runtime": 11.0746, + "eval_samples_per_second": 92.464, + "eval_steps_per_second": 23.116, + "step": 1300 + }, + { + "epoch": 0.22444578625032346, + "grad_norm": 0.81640625, + "learning_rate": 1.9425854485993828e-05, + "loss": 1.6136, + "step": 1301 + }, + { + "epoch": 0.22461830414905545, + "grad_norm": 0.65625, + "learning_rate": 1.9424945124339965e-05, + "loss": 1.5522, + "step": 1302 + }, + { + "epoch": 0.22479082204778747, + "grad_norm": 0.57421875, + "learning_rate": 1.9424035064427286e-05, + "loss": 1.4482, + "step": 1303 + }, + { + "epoch": 0.22496333994651946, + "grad_norm": 0.6875, + "learning_rate": 1.9423124306323197e-05, + "loss": 1.6717, + "step": 1304 + }, + { + "epoch": 0.22513585784525145, + "grad_norm": 0.66796875, + "learning_rate": 1.9422212850095183e-05, + "loss": 1.4648, + "step": 1305 + }, + { + "epoch": 0.22530837574398344, + "grad_norm": 0.73828125, + "learning_rate": 1.9421300695810773e-05, + "loss": 1.5514, + "step": 1306 + }, + { + "epoch": 0.22548089364271542, + "grad_norm": 0.65234375, + "learning_rate": 1.9420387843537533e-05, + "loss": 1.5525, + "step": 1307 + }, + { + "epoch": 0.2256534115414474, + "grad_norm": 0.6796875, + "learning_rate": 1.9419474293343107e-05, + "loss": 1.5457, + "step": 1308 + }, + { + "epoch": 0.22582592944017943, + "grad_norm": 0.66796875, + "learning_rate": 1.9418560045295166e-05, + "loss": 1.5831, + "step": 1309 + }, + { + "epoch": 0.22599844733891142, + "grad_norm": 0.65234375, + "learning_rate": 1.9417645099461446e-05, + "loss": 1.407, + "step": 1310 + }, + { + "epoch": 0.2261709652376434, + "grad_norm": 0.671875, + "learning_rate": 1.9416729455909737e-05, + "loss": 1.6039, + "step": 1311 + }, + { + "epoch": 0.2263434831363754, + "grad_norm": 0.6484375, + "learning_rate": 1.941581311470787e-05, + "loss": 1.5484, + "step": 1312 + }, + { + "epoch": 0.22651600103510738, + "grad_norm": 0.734375, + "learning_rate": 1.9414896075923732e-05, + "loss": 1.5051, + "step": 1313 + }, + { + "epoch": 0.22668851893383937, + "grad_norm": 0.6328125, + "learning_rate": 1.9413978339625267e-05, + "loss": 1.4133, + "step": 1314 + }, + { + "epoch": 0.2268610368325714, + "grad_norm": 0.6875, + "learning_rate": 1.9413059905880466e-05, + "loss": 1.5823, + "step": 1315 + }, + { + "epoch": 0.22703355473130338, + "grad_norm": 0.734375, + "learning_rate": 1.941214077475737e-05, + "loss": 1.5071, + "step": 1316 + }, + { + "epoch": 0.22720607263003537, + "grad_norm": 0.78515625, + "learning_rate": 1.941122094632408e-05, + "loss": 1.5494, + "step": 1317 + }, + { + "epoch": 0.22737859052876735, + "grad_norm": 0.73828125, + "learning_rate": 1.9410300420648735e-05, + "loss": 1.4567, + "step": 1318 + }, + { + "epoch": 0.22755110842749934, + "grad_norm": 0.71875, + "learning_rate": 1.9409379197799537e-05, + "loss": 1.5043, + "step": 1319 + }, + { + "epoch": 0.22772362632623136, + "grad_norm": 0.8671875, + "learning_rate": 1.9408457277844738e-05, + "loss": 1.5131, + "step": 1320 + }, + { + "epoch": 0.22789614422496335, + "grad_norm": 0.63671875, + "learning_rate": 1.9407534660852632e-05, + "loss": 1.5591, + "step": 1321 + }, + { + "epoch": 0.22806866212369534, + "grad_norm": 0.74609375, + "learning_rate": 1.940661134689158e-05, + "loss": 1.4588, + "step": 1322 + }, + { + "epoch": 0.22824118002242733, + "grad_norm": 0.64453125, + "learning_rate": 1.9405687336029985e-05, + "loss": 1.519, + "step": 1323 + }, + { + "epoch": 0.22841369792115931, + "grad_norm": 0.73046875, + "learning_rate": 1.94047626283363e-05, + "loss": 1.5361, + "step": 1324 + }, + { + "epoch": 0.2285862158198913, + "grad_norm": 0.8671875, + "learning_rate": 1.9403837223879038e-05, + "loss": 1.5026, + "step": 1325 + }, + { + "epoch": 0.22875873371862332, + "grad_norm": 0.71484375, + "learning_rate": 1.9402911122726756e-05, + "loss": 1.5397, + "step": 1326 + }, + { + "epoch": 0.2289312516173553, + "grad_norm": 0.8046875, + "learning_rate": 1.9401984324948067e-05, + "loss": 1.4693, + "step": 1327 + }, + { + "epoch": 0.2291037695160873, + "grad_norm": 0.84765625, + "learning_rate": 1.9401056830611634e-05, + "loss": 1.5675, + "step": 1328 + }, + { + "epoch": 0.22927628741481929, + "grad_norm": 0.765625, + "learning_rate": 1.9400128639786168e-05, + "loss": 1.4772, + "step": 1329 + }, + { + "epoch": 0.22944880531355127, + "grad_norm": 0.83984375, + "learning_rate": 1.9399199752540435e-05, + "loss": 1.3506, + "step": 1330 + }, + { + "epoch": 0.22962132321228326, + "grad_norm": 0.640625, + "learning_rate": 1.939827016894326e-05, + "loss": 1.4486, + "step": 1331 + }, + { + "epoch": 0.22979384111101528, + "grad_norm": 0.6953125, + "learning_rate": 1.9397339889063506e-05, + "loss": 1.5139, + "step": 1332 + }, + { + "epoch": 0.22996635900974727, + "grad_norm": 0.765625, + "learning_rate": 1.9396408912970096e-05, + "loss": 1.5306, + "step": 1333 + }, + { + "epoch": 0.23013887690847926, + "grad_norm": 0.62890625, + "learning_rate": 1.9395477240732003e-05, + "loss": 1.4321, + "step": 1334 + }, + { + "epoch": 0.23031139480721124, + "grad_norm": 0.84765625, + "learning_rate": 1.939454487241825e-05, + "loss": 1.5289, + "step": 1335 + }, + { + "epoch": 0.23048391270594323, + "grad_norm": 0.65234375, + "learning_rate": 1.9393611808097913e-05, + "loss": 1.5167, + "step": 1336 + }, + { + "epoch": 0.23065643060467522, + "grad_norm": 0.75, + "learning_rate": 1.939267804784012e-05, + "loss": 1.509, + "step": 1337 + }, + { + "epoch": 0.23082894850340724, + "grad_norm": 0.67578125, + "learning_rate": 1.9391743591714046e-05, + "loss": 1.4916, + "step": 1338 + }, + { + "epoch": 0.23100146640213923, + "grad_norm": 0.66015625, + "learning_rate": 1.9390808439788928e-05, + "loss": 1.5742, + "step": 1339 + }, + { + "epoch": 0.23117398430087122, + "grad_norm": 0.75, + "learning_rate": 1.9389872592134044e-05, + "loss": 1.5068, + "step": 1340 + }, + { + "epoch": 0.2313465021996032, + "grad_norm": 1.1484375, + "learning_rate": 1.9388936048818725e-05, + "loss": 1.6008, + "step": 1341 + }, + { + "epoch": 0.2315190200983352, + "grad_norm": 0.59375, + "learning_rate": 1.938799880991236e-05, + "loss": 1.4813, + "step": 1342 + }, + { + "epoch": 0.2316915379970672, + "grad_norm": 0.63671875, + "learning_rate": 1.9387060875484388e-05, + "loss": 1.5196, + "step": 1343 + }, + { + "epoch": 0.2318640558957992, + "grad_norm": 0.78125, + "learning_rate": 1.9386122245604285e-05, + "loss": 1.5514, + "step": 1344 + }, + { + "epoch": 0.2320365737945312, + "grad_norm": 0.63671875, + "learning_rate": 1.938518292034161e-05, + "loss": 1.5391, + "step": 1345 + }, + { + "epoch": 0.23220909169326318, + "grad_norm": 0.703125, + "learning_rate": 1.9384242899765933e-05, + "loss": 1.6399, + "step": 1346 + }, + { + "epoch": 0.23238160959199516, + "grad_norm": 0.6484375, + "learning_rate": 1.938330218394691e-05, + "loss": 1.6369, + "step": 1347 + }, + { + "epoch": 0.23255412749072715, + "grad_norm": 0.65625, + "learning_rate": 1.938236077295423e-05, + "loss": 1.5085, + "step": 1348 + }, + { + "epoch": 0.23272664538945917, + "grad_norm": 0.61328125, + "learning_rate": 1.9381418666857645e-05, + "loss": 1.4524, + "step": 1349 + }, + { + "epoch": 0.23289916328819116, + "grad_norm": 0.67578125, + "learning_rate": 1.938047586572694e-05, + "loss": 1.5181, + "step": 1350 + }, + { + "epoch": 0.23307168118692315, + "grad_norm": 0.63671875, + "learning_rate": 1.937953236963198e-05, + "loss": 1.5584, + "step": 1351 + }, + { + "epoch": 0.23324419908565514, + "grad_norm": 0.70703125, + "learning_rate": 1.937858817864265e-05, + "loss": 1.489, + "step": 1352 + }, + { + "epoch": 0.23341671698438712, + "grad_norm": 0.8671875, + "learning_rate": 1.937764329282891e-05, + "loss": 1.595, + "step": 1353 + }, + { + "epoch": 0.2335892348831191, + "grad_norm": 0.671875, + "learning_rate": 1.9376697712260758e-05, + "loss": 1.5573, + "step": 1354 + }, + { + "epoch": 0.23376175278185113, + "grad_norm": 0.64453125, + "learning_rate": 1.9375751437008253e-05, + "loss": 1.4401, + "step": 1355 + }, + { + "epoch": 0.23393427068058312, + "grad_norm": 0.87890625, + "learning_rate": 1.9374804467141497e-05, + "loss": 1.4892, + "step": 1356 + }, + { + "epoch": 0.2341067885793151, + "grad_norm": 0.85546875, + "learning_rate": 1.937385680273065e-05, + "loss": 1.5684, + "step": 1357 + }, + { + "epoch": 0.2342793064780471, + "grad_norm": 1.3984375, + "learning_rate": 1.9372908443845926e-05, + "loss": 1.6208, + "step": 1358 + }, + { + "epoch": 0.23445182437677908, + "grad_norm": 0.8984375, + "learning_rate": 1.9371959390557573e-05, + "loss": 1.5651, + "step": 1359 + }, + { + "epoch": 0.23462434227551107, + "grad_norm": 0.6015625, + "learning_rate": 1.9371009642935913e-05, + "loss": 1.4277, + "step": 1360 + }, + { + "epoch": 0.2347968601742431, + "grad_norm": 0.8125, + "learning_rate": 1.9370059201051304e-05, + "loss": 1.5014, + "step": 1361 + }, + { + "epoch": 0.23496937807297508, + "grad_norm": 0.71484375, + "learning_rate": 1.936910806497417e-05, + "loss": 1.4315, + "step": 1362 + }, + { + "epoch": 0.23514189597170707, + "grad_norm": 0.71484375, + "learning_rate": 1.936815623477496e-05, + "loss": 1.6346, + "step": 1363 + }, + { + "epoch": 0.23531441387043905, + "grad_norm": 0.7421875, + "learning_rate": 1.9367203710524204e-05, + "loss": 1.5522, + "step": 1364 + }, + { + "epoch": 0.23548693176917104, + "grad_norm": 0.640625, + "learning_rate": 1.936625049229247e-05, + "loss": 1.5137, + "step": 1365 + }, + { + "epoch": 0.23565944966790303, + "grad_norm": 0.640625, + "learning_rate": 1.9365296580150374e-05, + "loss": 1.473, + "step": 1366 + }, + { + "epoch": 0.23583196756663505, + "grad_norm": 0.64453125, + "learning_rate": 1.9364341974168594e-05, + "loss": 1.5776, + "step": 1367 + }, + { + "epoch": 0.23600448546536704, + "grad_norm": 8.9375, + "learning_rate": 1.9363386674417844e-05, + "loss": 1.5133, + "step": 1368 + }, + { + "epoch": 0.23617700336409903, + "grad_norm": 0.7265625, + "learning_rate": 1.9362430680968907e-05, + "loss": 1.5902, + "step": 1369 + }, + { + "epoch": 0.236349521262831, + "grad_norm": 0.65234375, + "learning_rate": 1.9361473993892604e-05, + "loss": 1.4907, + "step": 1370 + }, + { + "epoch": 0.236522039161563, + "grad_norm": 0.609375, + "learning_rate": 1.936051661325982e-05, + "loss": 1.569, + "step": 1371 + }, + { + "epoch": 0.23669455706029502, + "grad_norm": 0.77734375, + "learning_rate": 1.9359558539141474e-05, + "loss": 1.5697, + "step": 1372 + }, + { + "epoch": 0.236867074959027, + "grad_norm": 0.59375, + "learning_rate": 1.935859977160855e-05, + "loss": 1.4963, + "step": 1373 + }, + { + "epoch": 0.237039592857759, + "grad_norm": 0.7265625, + "learning_rate": 1.935764031073208e-05, + "loss": 1.5429, + "step": 1374 + }, + { + "epoch": 0.23721211075649098, + "grad_norm": 0.671875, + "learning_rate": 1.9356680156583147e-05, + "loss": 1.5264, + "step": 1375 + }, + { + "epoch": 0.23738462865522297, + "grad_norm": 0.65234375, + "learning_rate": 1.9355719309232885e-05, + "loss": 1.5749, + "step": 1376 + }, + { + "epoch": 0.23755714655395496, + "grad_norm": 0.6796875, + "learning_rate": 1.935475776875248e-05, + "loss": 1.6136, + "step": 1377 + }, + { + "epoch": 0.23772966445268698, + "grad_norm": 0.61328125, + "learning_rate": 1.9353795535213167e-05, + "loss": 1.4908, + "step": 1378 + }, + { + "epoch": 0.23790218235141897, + "grad_norm": 0.734375, + "learning_rate": 1.9352832608686234e-05, + "loss": 1.439, + "step": 1379 + }, + { + "epoch": 0.23807470025015096, + "grad_norm": 0.62890625, + "learning_rate": 1.9351868989243026e-05, + "loss": 1.5656, + "step": 1380 + }, + { + "epoch": 0.23824721814888294, + "grad_norm": 0.56640625, + "learning_rate": 1.9350904676954927e-05, + "loss": 1.4963, + "step": 1381 + }, + { + "epoch": 0.23841973604761493, + "grad_norm": 0.73046875, + "learning_rate": 1.9349939671893384e-05, + "loss": 1.5902, + "step": 1382 + }, + { + "epoch": 0.23859225394634692, + "grad_norm": 0.6015625, + "learning_rate": 1.934897397412989e-05, + "loss": 1.457, + "step": 1383 + }, + { + "epoch": 0.23876477184507894, + "grad_norm": 0.6796875, + "learning_rate": 1.9348007583735985e-05, + "loss": 1.5349, + "step": 1384 + }, + { + "epoch": 0.23893728974381093, + "grad_norm": 0.62109375, + "learning_rate": 1.9347040500783272e-05, + "loss": 1.4851, + "step": 1385 + }, + { + "epoch": 0.23910980764254292, + "grad_norm": 0.6015625, + "learning_rate": 1.9346072725343394e-05, + "loss": 1.5019, + "step": 1386 + }, + { + "epoch": 0.2392823255412749, + "grad_norm": 0.57421875, + "learning_rate": 1.934510425748805e-05, + "loss": 1.5755, + "step": 1387 + }, + { + "epoch": 0.2394548434400069, + "grad_norm": 0.625, + "learning_rate": 1.9344135097288997e-05, + "loss": 1.5478, + "step": 1388 + }, + { + "epoch": 0.23962736133873888, + "grad_norm": 0.83203125, + "learning_rate": 1.934316524481803e-05, + "loss": 1.5787, + "step": 1389 + }, + { + "epoch": 0.2397998792374709, + "grad_norm": 0.609375, + "learning_rate": 1.9342194700146998e-05, + "loss": 1.5794, + "step": 1390 + }, + { + "epoch": 0.2399723971362029, + "grad_norm": 0.75390625, + "learning_rate": 1.9341223463347815e-05, + "loss": 1.5546, + "step": 1391 + }, + { + "epoch": 0.24014491503493487, + "grad_norm": 1.1171875, + "learning_rate": 1.9340251534492428e-05, + "loss": 1.5541, + "step": 1392 + }, + { + "epoch": 0.24031743293366686, + "grad_norm": 0.7578125, + "learning_rate": 1.933927891365285e-05, + "loss": 1.644, + "step": 1393 + }, + { + "epoch": 0.24048995083239885, + "grad_norm": 0.58203125, + "learning_rate": 1.9338305600901135e-05, + "loss": 1.514, + "step": 1394 + }, + { + "epoch": 0.24066246873113087, + "grad_norm": 0.67578125, + "learning_rate": 1.933733159630939e-05, + "loss": 1.5331, + "step": 1395 + }, + { + "epoch": 0.24083498662986286, + "grad_norm": 0.8046875, + "learning_rate": 1.933635689994978e-05, + "loss": 1.4759, + "step": 1396 + }, + { + "epoch": 0.24100750452859485, + "grad_norm": 0.6875, + "learning_rate": 1.933538151189451e-05, + "loss": 1.4967, + "step": 1397 + }, + { + "epoch": 0.24118002242732683, + "grad_norm": 0.734375, + "learning_rate": 1.9334405432215857e-05, + "loss": 1.5873, + "step": 1398 + }, + { + "epoch": 0.24135254032605882, + "grad_norm": 0.5859375, + "learning_rate": 1.9333428660986118e-05, + "loss": 1.537, + "step": 1399 + }, + { + "epoch": 0.2415250582247908, + "grad_norm": 0.734375, + "learning_rate": 1.9332451198277668e-05, + "loss": 1.5899, + "step": 1400 + }, + { + "epoch": 0.2415250582247908, + "eval_loss": 1.509263277053833, + "eval_runtime": 11.0113, + "eval_samples_per_second": 92.996, + "eval_steps_per_second": 23.249, + "step": 1400 + }, + { + "epoch": 0.24169757612352283, + "grad_norm": 0.6875, + "learning_rate": 1.933147304416292e-05, + "loss": 1.555, + "step": 1401 + }, + { + "epoch": 0.24187009402225482, + "grad_norm": 0.734375, + "learning_rate": 1.9330494198714347e-05, + "loss": 1.5731, + "step": 1402 + }, + { + "epoch": 0.2420426119209868, + "grad_norm": 0.8515625, + "learning_rate": 1.932951466200446e-05, + "loss": 1.5459, + "step": 1403 + }, + { + "epoch": 0.2422151298197188, + "grad_norm": 0.6484375, + "learning_rate": 1.9328534434105835e-05, + "loss": 1.5302, + "step": 1404 + }, + { + "epoch": 0.24238764771845078, + "grad_norm": 0.70703125, + "learning_rate": 1.9327553515091092e-05, + "loss": 1.452, + "step": 1405 + }, + { + "epoch": 0.24256016561718277, + "grad_norm": 0.77734375, + "learning_rate": 1.93265719050329e-05, + "loss": 1.516, + "step": 1406 + }, + { + "epoch": 0.2427326835159148, + "grad_norm": 0.8359375, + "learning_rate": 1.9325589604003992e-05, + "loss": 1.5422, + "step": 1407 + }, + { + "epoch": 0.24290520141464678, + "grad_norm": 0.6640625, + "learning_rate": 1.9324606612077135e-05, + "loss": 1.5611, + "step": 1408 + }, + { + "epoch": 0.24307771931337877, + "grad_norm": 0.703125, + "learning_rate": 1.9323622929325155e-05, + "loss": 1.5626, + "step": 1409 + }, + { + "epoch": 0.24325023721211075, + "grad_norm": 0.6953125, + "learning_rate": 1.9322638555820934e-05, + "loss": 1.5338, + "step": 1410 + }, + { + "epoch": 0.24342275511084274, + "grad_norm": 0.71875, + "learning_rate": 1.9321653491637397e-05, + "loss": 1.6135, + "step": 1411 + }, + { + "epoch": 0.24359527300957473, + "grad_norm": 0.796875, + "learning_rate": 1.9320667736847526e-05, + "loss": 1.5297, + "step": 1412 + }, + { + "epoch": 0.24376779090830675, + "grad_norm": 0.8125, + "learning_rate": 1.931968129152435e-05, + "loss": 1.5739, + "step": 1413 + }, + { + "epoch": 0.24394030880703874, + "grad_norm": 0.83203125, + "learning_rate": 1.931869415574095e-05, + "loss": 1.4873, + "step": 1414 + }, + { + "epoch": 0.24411282670577072, + "grad_norm": 0.85546875, + "learning_rate": 1.9317706329570467e-05, + "loss": 1.5725, + "step": 1415 + }, + { + "epoch": 0.2442853446045027, + "grad_norm": 0.65234375, + "learning_rate": 1.9316717813086073e-05, + "loss": 1.4288, + "step": 1416 + }, + { + "epoch": 0.2444578625032347, + "grad_norm": 0.7265625, + "learning_rate": 1.9315728606361012e-05, + "loss": 1.4411, + "step": 1417 + }, + { + "epoch": 0.2446303804019667, + "grad_norm": 0.7578125, + "learning_rate": 1.931473870946857e-05, + "loss": 1.4901, + "step": 1418 + }, + { + "epoch": 0.2448028983006987, + "grad_norm": 0.65625, + "learning_rate": 1.9313748122482085e-05, + "loss": 1.5489, + "step": 1419 + }, + { + "epoch": 0.2449754161994307, + "grad_norm": 0.828125, + "learning_rate": 1.9312756845474937e-05, + "loss": 1.4666, + "step": 1420 + }, + { + "epoch": 0.24514793409816268, + "grad_norm": 3.109375, + "learning_rate": 1.931176487852058e-05, + "loss": 1.5141, + "step": 1421 + }, + { + "epoch": 0.24532045199689467, + "grad_norm": 0.60546875, + "learning_rate": 1.9310772221692495e-05, + "loss": 1.5155, + "step": 1422 + }, + { + "epoch": 0.24549296989562666, + "grad_norm": 0.703125, + "learning_rate": 1.9309778875064228e-05, + "loss": 1.5135, + "step": 1423 + }, + { + "epoch": 0.24566548779435868, + "grad_norm": 0.97265625, + "learning_rate": 1.930878483870937e-05, + "loss": 1.509, + "step": 1424 + }, + { + "epoch": 0.24583800569309067, + "grad_norm": 0.6796875, + "learning_rate": 1.9307790112701573e-05, + "loss": 1.5216, + "step": 1425 + }, + { + "epoch": 0.24601052359182266, + "grad_norm": 0.66796875, + "learning_rate": 1.9306794697114522e-05, + "loss": 1.5448, + "step": 1426 + }, + { + "epoch": 0.24618304149055464, + "grad_norm": 0.6640625, + "learning_rate": 1.9305798592021968e-05, + "loss": 1.459, + "step": 1427 + }, + { + "epoch": 0.24635555938928663, + "grad_norm": 0.6328125, + "learning_rate": 1.930480179749771e-05, + "loss": 1.5335, + "step": 1428 + }, + { + "epoch": 0.24652807728801862, + "grad_norm": 0.6171875, + "learning_rate": 1.9303804313615596e-05, + "loss": 1.5651, + "step": 1429 + }, + { + "epoch": 0.24670059518675064, + "grad_norm": 0.66015625, + "learning_rate": 1.9302806140449525e-05, + "loss": 1.5367, + "step": 1430 + }, + { + "epoch": 0.24687311308548263, + "grad_norm": 0.6328125, + "learning_rate": 1.9301807278073447e-05, + "loss": 1.503, + "step": 1431 + }, + { + "epoch": 0.24704563098421461, + "grad_norm": 0.68359375, + "learning_rate": 1.9300807726561368e-05, + "loss": 1.5215, + "step": 1432 + }, + { + "epoch": 0.2472181488829466, + "grad_norm": 0.5859375, + "learning_rate": 1.929980748598734e-05, + "loss": 1.5557, + "step": 1433 + }, + { + "epoch": 0.2473906667816786, + "grad_norm": 0.69140625, + "learning_rate": 1.9298806556425462e-05, + "loss": 1.495, + "step": 1434 + }, + { + "epoch": 0.24756318468041058, + "grad_norm": 0.73046875, + "learning_rate": 1.9297804937949894e-05, + "loss": 1.5105, + "step": 1435 + }, + { + "epoch": 0.2477357025791426, + "grad_norm": 0.6640625, + "learning_rate": 1.929680263063484e-05, + "loss": 1.5457, + "step": 1436 + }, + { + "epoch": 0.24790822047787459, + "grad_norm": 0.8984375, + "learning_rate": 1.929579963455456e-05, + "loss": 1.5825, + "step": 1437 + }, + { + "epoch": 0.24808073837660657, + "grad_norm": 0.85546875, + "learning_rate": 1.929479594978336e-05, + "loss": 1.4412, + "step": 1438 + }, + { + "epoch": 0.24825325627533856, + "grad_norm": 0.6484375, + "learning_rate": 1.9293791576395597e-05, + "loss": 1.5191, + "step": 1439 + }, + { + "epoch": 0.24842577417407055, + "grad_norm": 0.72265625, + "learning_rate": 1.9292786514465685e-05, + "loss": 1.4836, + "step": 1440 + }, + { + "epoch": 0.24859829207280254, + "grad_norm": 0.6953125, + "learning_rate": 1.9291780764068085e-05, + "loss": 1.4466, + "step": 1441 + }, + { + "epoch": 0.24877080997153456, + "grad_norm": 0.85546875, + "learning_rate": 1.9290774325277305e-05, + "loss": 1.6016, + "step": 1442 + }, + { + "epoch": 0.24894332787026655, + "grad_norm": 0.82421875, + "learning_rate": 1.9289767198167918e-05, + "loss": 1.5898, + "step": 1443 + }, + { + "epoch": 0.24911584576899853, + "grad_norm": 0.79296875, + "learning_rate": 1.9288759382814526e-05, + "loss": 1.5457, + "step": 1444 + }, + { + "epoch": 0.24928836366773052, + "grad_norm": 0.70703125, + "learning_rate": 1.9287750879291802e-05, + "loss": 1.4422, + "step": 1445 + }, + { + "epoch": 0.2494608815664625, + "grad_norm": 0.66015625, + "learning_rate": 1.928674168767446e-05, + "loss": 1.6013, + "step": 1446 + }, + { + "epoch": 0.24963339946519453, + "grad_norm": 0.6640625, + "learning_rate": 1.928573180803727e-05, + "loss": 1.5306, + "step": 1447 + }, + { + "epoch": 0.24980591736392652, + "grad_norm": 0.77734375, + "learning_rate": 1.9284721240455045e-05, + "loss": 1.4907, + "step": 1448 + }, + { + "epoch": 0.2499784352626585, + "grad_norm": 0.72265625, + "learning_rate": 1.9283709985002655e-05, + "loss": 1.5263, + "step": 1449 + }, + { + "epoch": 0.2501509531613905, + "grad_norm": 0.79296875, + "learning_rate": 1.9282698041755027e-05, + "loss": 1.5184, + "step": 1450 + }, + { + "epoch": 0.2503234710601225, + "grad_norm": 0.6953125, + "learning_rate": 1.9281685410787125e-05, + "loss": 1.5459, + "step": 1451 + }, + { + "epoch": 0.2504959889588545, + "grad_norm": 1.1796875, + "learning_rate": 1.928067209217397e-05, + "loss": 1.6034, + "step": 1452 + }, + { + "epoch": 0.25066850685758646, + "grad_norm": 0.75390625, + "learning_rate": 1.9279658085990642e-05, + "loss": 1.5055, + "step": 1453 + }, + { + "epoch": 0.2508410247563185, + "grad_norm": 0.6015625, + "learning_rate": 1.927864339231226e-05, + "loss": 1.4582, + "step": 1454 + }, + { + "epoch": 0.25101354265505044, + "grad_norm": 0.640625, + "learning_rate": 1.9277628011214e-05, + "loss": 1.5079, + "step": 1455 + }, + { + "epoch": 0.25118606055378245, + "grad_norm": 0.6953125, + "learning_rate": 1.9276611942771083e-05, + "loss": 1.4217, + "step": 1456 + }, + { + "epoch": 0.25135857845251447, + "grad_norm": 0.66796875, + "learning_rate": 1.92755951870588e-05, + "loss": 1.5337, + "step": 1457 + }, + { + "epoch": 0.25153109635124643, + "grad_norm": 0.92578125, + "learning_rate": 1.927457774415246e-05, + "loss": 1.4909, + "step": 1458 + }, + { + "epoch": 0.25170361424997845, + "grad_norm": 0.61328125, + "learning_rate": 1.9273559614127455e-05, + "loss": 1.4921, + "step": 1459 + }, + { + "epoch": 0.2518761321487104, + "grad_norm": 0.625, + "learning_rate": 1.9272540797059208e-05, + "loss": 1.4744, + "step": 1460 + }, + { + "epoch": 0.2520486500474424, + "grad_norm": 0.65234375, + "learning_rate": 1.9271521293023202e-05, + "loss": 1.5649, + "step": 1461 + }, + { + "epoch": 0.25222116794617444, + "grad_norm": 0.625, + "learning_rate": 1.927050110209497e-05, + "loss": 1.5705, + "step": 1462 + }, + { + "epoch": 0.2523936858449064, + "grad_norm": 0.65234375, + "learning_rate": 1.9269480224350087e-05, + "loss": 1.6142, + "step": 1463 + }, + { + "epoch": 0.2525662037436384, + "grad_norm": 0.76171875, + "learning_rate": 1.9268458659864194e-05, + "loss": 1.6079, + "step": 1464 + }, + { + "epoch": 0.2527387216423704, + "grad_norm": 0.66015625, + "learning_rate": 1.926743640871297e-05, + "loss": 1.5139, + "step": 1465 + }, + { + "epoch": 0.2529112395411024, + "grad_norm": 0.6328125, + "learning_rate": 1.9266413470972153e-05, + "loss": 1.5096, + "step": 1466 + }, + { + "epoch": 0.25308375743983436, + "grad_norm": 0.65625, + "learning_rate": 1.9265389846717522e-05, + "loss": 1.5319, + "step": 1467 + }, + { + "epoch": 0.2532562753385664, + "grad_norm": 0.765625, + "learning_rate": 1.926436553602492e-05, + "loss": 1.61, + "step": 1468 + }, + { + "epoch": 0.2534287932372984, + "grad_norm": 0.6171875, + "learning_rate": 1.9263340538970237e-05, + "loss": 1.5454, + "step": 1469 + }, + { + "epoch": 0.25360131113603035, + "grad_norm": 0.77734375, + "learning_rate": 1.9262314855629405e-05, + "loss": 1.5231, + "step": 1470 + }, + { + "epoch": 0.25377382903476237, + "grad_norm": 0.71484375, + "learning_rate": 1.9261288486078414e-05, + "loss": 1.477, + "step": 1471 + }, + { + "epoch": 0.2539463469334943, + "grad_norm": 0.6640625, + "learning_rate": 1.9260261430393306e-05, + "loss": 1.4894, + "step": 1472 + }, + { + "epoch": 0.25411886483222634, + "grad_norm": 0.6484375, + "learning_rate": 1.9259233688650167e-05, + "loss": 1.4668, + "step": 1473 + }, + { + "epoch": 0.25429138273095836, + "grad_norm": 0.67578125, + "learning_rate": 1.9258205260925143e-05, + "loss": 1.4983, + "step": 1474 + }, + { + "epoch": 0.2544639006296903, + "grad_norm": 0.79296875, + "learning_rate": 1.925717614729443e-05, + "loss": 1.5638, + "step": 1475 + }, + { + "epoch": 0.25463641852842234, + "grad_norm": 0.78515625, + "learning_rate": 1.9256146347834262e-05, + "loss": 1.5986, + "step": 1476 + }, + { + "epoch": 0.2548089364271543, + "grad_norm": 1.2578125, + "learning_rate": 1.9255115862620938e-05, + "loss": 1.4828, + "step": 1477 + }, + { + "epoch": 0.2549814543258863, + "grad_norm": 0.69921875, + "learning_rate": 1.9254084691730804e-05, + "loss": 1.6261, + "step": 1478 + }, + { + "epoch": 0.25515397222461833, + "grad_norm": 0.62890625, + "learning_rate": 1.9253052835240252e-05, + "loss": 1.5251, + "step": 1479 + }, + { + "epoch": 0.2553264901233503, + "grad_norm": 0.83984375, + "learning_rate": 1.925202029322573e-05, + "loss": 1.6185, + "step": 1480 + }, + { + "epoch": 0.2554990080220823, + "grad_norm": 0.81640625, + "learning_rate": 1.9250987065763737e-05, + "loss": 1.5007, + "step": 1481 + }, + { + "epoch": 0.25567152592081427, + "grad_norm": 0.83203125, + "learning_rate": 1.9249953152930818e-05, + "loss": 1.6077, + "step": 1482 + }, + { + "epoch": 0.2558440438195463, + "grad_norm": 1.0234375, + "learning_rate": 1.9248918554803576e-05, + "loss": 1.4667, + "step": 1483 + }, + { + "epoch": 0.25601656171827825, + "grad_norm": 0.79296875, + "learning_rate": 1.9247883271458653e-05, + "loss": 1.5333, + "step": 1484 + }, + { + "epoch": 0.25618907961701026, + "grad_norm": 1.40625, + "learning_rate": 1.9246847302972754e-05, + "loss": 1.5543, + "step": 1485 + }, + { + "epoch": 0.2563615975157423, + "grad_norm": 0.77734375, + "learning_rate": 1.9245810649422633e-05, + "loss": 1.5163, + "step": 1486 + }, + { + "epoch": 0.25653411541447424, + "grad_norm": 0.73046875, + "learning_rate": 1.924477331088509e-05, + "loss": 1.5476, + "step": 1487 + }, + { + "epoch": 0.25670663331320626, + "grad_norm": 0.67578125, + "learning_rate": 1.924373528743697e-05, + "loss": 1.4741, + "step": 1488 + }, + { + "epoch": 0.2568791512119382, + "grad_norm": 0.859375, + "learning_rate": 1.924269657915519e-05, + "loss": 1.4808, + "step": 1489 + }, + { + "epoch": 0.25705166911067023, + "grad_norm": 0.78125, + "learning_rate": 1.9241657186116688e-05, + "loss": 1.4003, + "step": 1490 + }, + { + "epoch": 0.25722418700940225, + "grad_norm": 0.61328125, + "learning_rate": 1.9240617108398482e-05, + "loss": 1.4317, + "step": 1491 + }, + { + "epoch": 0.2573967049081342, + "grad_norm": 0.91796875, + "learning_rate": 1.923957634607762e-05, + "loss": 1.4747, + "step": 1492 + }, + { + "epoch": 0.2575692228068662, + "grad_norm": 0.8203125, + "learning_rate": 1.9238534899231216e-05, + "loss": 1.5198, + "step": 1493 + }, + { + "epoch": 0.2577417407055982, + "grad_norm": 0.71484375, + "learning_rate": 1.923749276793642e-05, + "loss": 1.4781, + "step": 1494 + }, + { + "epoch": 0.2579142586043302, + "grad_norm": 0.8203125, + "learning_rate": 1.9236449952270437e-05, + "loss": 1.5068, + "step": 1495 + }, + { + "epoch": 0.25808677650306217, + "grad_norm": 0.83984375, + "learning_rate": 1.923540645231053e-05, + "loss": 1.5955, + "step": 1496 + }, + { + "epoch": 0.2582592944017942, + "grad_norm": 0.80859375, + "learning_rate": 1.923436226813401e-05, + "loss": 1.5487, + "step": 1497 + }, + { + "epoch": 0.2584318123005262, + "grad_norm": 0.89453125, + "learning_rate": 1.9233317399818237e-05, + "loss": 1.6363, + "step": 1498 + }, + { + "epoch": 0.25860433019925816, + "grad_norm": 0.6875, + "learning_rate": 1.9232271847440614e-05, + "loss": 1.588, + "step": 1499 + }, + { + "epoch": 0.2587768480979902, + "grad_norm": 1.140625, + "learning_rate": 1.923122561107861e-05, + "loss": 1.5108, + "step": 1500 + }, + { + "epoch": 0.2587768480979902, + "eval_loss": 1.5035676956176758, + "eval_runtime": 10.9566, + "eval_samples_per_second": 93.46, + "eval_steps_per_second": 23.365, + "step": 1500 + }, + { + "epoch": 0.25894936599672214, + "grad_norm": 1.0859375, + "learning_rate": 1.923017869080973e-05, + "loss": 1.5098, + "step": 1501 + }, + { + "epoch": 0.25912188389545415, + "grad_norm": 0.7109375, + "learning_rate": 1.9229131086711542e-05, + "loss": 1.386, + "step": 1502 + }, + { + "epoch": 0.25929440179418617, + "grad_norm": 0.8359375, + "learning_rate": 1.9228082798861656e-05, + "loss": 1.5323, + "step": 1503 + }, + { + "epoch": 0.25946691969291813, + "grad_norm": 0.7734375, + "learning_rate": 1.9227033827337735e-05, + "loss": 1.5277, + "step": 1504 + }, + { + "epoch": 0.25963943759165015, + "grad_norm": 0.76171875, + "learning_rate": 1.9225984172217497e-05, + "loss": 1.5729, + "step": 1505 + }, + { + "epoch": 0.2598119554903821, + "grad_norm": 0.82421875, + "learning_rate": 1.9224933833578706e-05, + "loss": 1.5361, + "step": 1506 + }, + { + "epoch": 0.2599844733891141, + "grad_norm": 0.99609375, + "learning_rate": 1.9223882811499175e-05, + "loss": 1.465, + "step": 1507 + }, + { + "epoch": 0.26015699128784614, + "grad_norm": 0.8125, + "learning_rate": 1.9222831106056768e-05, + "loss": 1.4906, + "step": 1508 + }, + { + "epoch": 0.2603295091865781, + "grad_norm": 0.828125, + "learning_rate": 1.922177871732941e-05, + "loss": 1.5643, + "step": 1509 + }, + { + "epoch": 0.2605020270853101, + "grad_norm": 0.98828125, + "learning_rate": 1.9220725645395066e-05, + "loss": 1.596, + "step": 1510 + }, + { + "epoch": 0.2606745449840421, + "grad_norm": 0.7421875, + "learning_rate": 1.921967189033175e-05, + "loss": 1.482, + "step": 1511 + }, + { + "epoch": 0.2608470628827741, + "grad_norm": 0.5703125, + "learning_rate": 1.9218617452217534e-05, + "loss": 1.5303, + "step": 1512 + }, + { + "epoch": 0.26101958078150606, + "grad_norm": 0.91796875, + "learning_rate": 1.9217562331130536e-05, + "loss": 1.4882, + "step": 1513 + }, + { + "epoch": 0.26119209868023807, + "grad_norm": 0.80078125, + "learning_rate": 1.9216506527148926e-05, + "loss": 1.5004, + "step": 1514 + }, + { + "epoch": 0.2613646165789701, + "grad_norm": 0.703125, + "learning_rate": 1.9215450040350924e-05, + "loss": 1.5241, + "step": 1515 + }, + { + "epoch": 0.26153713447770205, + "grad_norm": 0.65625, + "learning_rate": 1.9214392870814805e-05, + "loss": 1.5178, + "step": 1516 + }, + { + "epoch": 0.26170965237643407, + "grad_norm": 0.94140625, + "learning_rate": 1.921333501861889e-05, + "loss": 1.5519, + "step": 1517 + }, + { + "epoch": 0.261882170275166, + "grad_norm": 0.88671875, + "learning_rate": 1.9212276483841544e-05, + "loss": 1.5706, + "step": 1518 + }, + { + "epoch": 0.26205468817389804, + "grad_norm": 0.66015625, + "learning_rate": 1.9211217266561197e-05, + "loss": 1.504, + "step": 1519 + }, + { + "epoch": 0.26222720607263006, + "grad_norm": 0.8125, + "learning_rate": 1.921015736685632e-05, + "loss": 1.5253, + "step": 1520 + }, + { + "epoch": 0.262399723971362, + "grad_norm": 0.81640625, + "learning_rate": 1.920909678480544e-05, + "loss": 1.4976, + "step": 1521 + }, + { + "epoch": 0.26257224187009404, + "grad_norm": 0.73046875, + "learning_rate": 1.9208035520487125e-05, + "loss": 1.5356, + "step": 1522 + }, + { + "epoch": 0.262744759768826, + "grad_norm": 0.6484375, + "learning_rate": 1.920697357398001e-05, + "loss": 1.4926, + "step": 1523 + }, + { + "epoch": 0.262917277667558, + "grad_norm": 0.734375, + "learning_rate": 1.920591094536276e-05, + "loss": 1.539, + "step": 1524 + }, + { + "epoch": 0.26308979556629003, + "grad_norm": 0.81640625, + "learning_rate": 1.9204847634714105e-05, + "loss": 1.5248, + "step": 1525 + }, + { + "epoch": 0.263262313465022, + "grad_norm": 0.6484375, + "learning_rate": 1.9203783642112825e-05, + "loss": 1.4683, + "step": 1526 + }, + { + "epoch": 0.263434831363754, + "grad_norm": 0.60546875, + "learning_rate": 1.9202718967637745e-05, + "loss": 1.6038, + "step": 1527 + }, + { + "epoch": 0.26360734926248597, + "grad_norm": 0.65234375, + "learning_rate": 1.9201653611367742e-05, + "loss": 1.5553, + "step": 1528 + }, + { + "epoch": 0.263779867161218, + "grad_norm": 0.671875, + "learning_rate": 1.9200587573381747e-05, + "loss": 1.5876, + "step": 1529 + }, + { + "epoch": 0.26395238505994995, + "grad_norm": 0.6171875, + "learning_rate": 1.9199520853758734e-05, + "loss": 1.6082, + "step": 1530 + }, + { + "epoch": 0.26412490295868196, + "grad_norm": 0.67578125, + "learning_rate": 1.9198453452577738e-05, + "loss": 1.5431, + "step": 1531 + }, + { + "epoch": 0.264297420857414, + "grad_norm": 0.671875, + "learning_rate": 1.9197385369917834e-05, + "loss": 1.5877, + "step": 1532 + }, + { + "epoch": 0.26446993875614594, + "grad_norm": 0.61328125, + "learning_rate": 1.9196316605858152e-05, + "loss": 1.5066, + "step": 1533 + }, + { + "epoch": 0.26464245665487796, + "grad_norm": 0.6640625, + "learning_rate": 1.9195247160477874e-05, + "loss": 1.496, + "step": 1534 + }, + { + "epoch": 0.2648149745536099, + "grad_norm": 0.65234375, + "learning_rate": 1.9194177033856233e-05, + "loss": 1.4877, + "step": 1535 + }, + { + "epoch": 0.26498749245234193, + "grad_norm": 0.62109375, + "learning_rate": 1.919310622607251e-05, + "loss": 1.532, + "step": 1536 + }, + { + "epoch": 0.26516001035107395, + "grad_norm": 0.640625, + "learning_rate": 1.9192034737206037e-05, + "loss": 1.5576, + "step": 1537 + }, + { + "epoch": 0.2653325282498059, + "grad_norm": 0.65625, + "learning_rate": 1.91909625673362e-05, + "loss": 1.4709, + "step": 1538 + }, + { + "epoch": 0.2655050461485379, + "grad_norm": 0.6796875, + "learning_rate": 1.9189889716542423e-05, + "loss": 1.5417, + "step": 1539 + }, + { + "epoch": 0.2656775640472699, + "grad_norm": 0.671875, + "learning_rate": 1.9188816184904194e-05, + "loss": 1.5503, + "step": 1540 + }, + { + "epoch": 0.2658500819460019, + "grad_norm": 0.640625, + "learning_rate": 1.9187741972501052e-05, + "loss": 1.5246, + "step": 1541 + }, + { + "epoch": 0.26602259984473386, + "grad_norm": 0.6484375, + "learning_rate": 1.9186667079412575e-05, + "loss": 1.4977, + "step": 1542 + }, + { + "epoch": 0.2661951177434659, + "grad_norm": 0.59375, + "learning_rate": 1.91855915057184e-05, + "loss": 1.4235, + "step": 1543 + }, + { + "epoch": 0.2663676356421979, + "grad_norm": 0.6171875, + "learning_rate": 1.918451525149821e-05, + "loss": 1.6124, + "step": 1544 + }, + { + "epoch": 0.26654015354092986, + "grad_norm": 1.046875, + "learning_rate": 1.9183438316831743e-05, + "loss": 1.5644, + "step": 1545 + }, + { + "epoch": 0.2667126714396619, + "grad_norm": 0.62890625, + "learning_rate": 1.918236070179879e-05, + "loss": 1.4849, + "step": 1546 + }, + { + "epoch": 0.26688518933839384, + "grad_norm": 0.62890625, + "learning_rate": 1.9181282406479175e-05, + "loss": 1.4897, + "step": 1547 + }, + { + "epoch": 0.26705770723712585, + "grad_norm": 0.6796875, + "learning_rate": 1.9180203430952794e-05, + "loss": 1.4912, + "step": 1548 + }, + { + "epoch": 0.26723022513585787, + "grad_norm": 0.640625, + "learning_rate": 1.9179123775299584e-05, + "loss": 1.5156, + "step": 1549 + }, + { + "epoch": 0.26740274303458983, + "grad_norm": 0.94921875, + "learning_rate": 1.9178043439599528e-05, + "loss": 1.6099, + "step": 1550 + }, + { + "epoch": 0.26757526093332185, + "grad_norm": 0.58984375, + "learning_rate": 1.9176962423932674e-05, + "loss": 1.4946, + "step": 1551 + }, + { + "epoch": 0.2677477788320538, + "grad_norm": 0.66015625, + "learning_rate": 1.9175880728379094e-05, + "loss": 1.5194, + "step": 1552 + }, + { + "epoch": 0.2679202967307858, + "grad_norm": 0.58203125, + "learning_rate": 1.917479835301894e-05, + "loss": 1.5548, + "step": 1553 + }, + { + "epoch": 0.26809281462951784, + "grad_norm": 0.61328125, + "learning_rate": 1.91737152979324e-05, + "loss": 1.4607, + "step": 1554 + }, + { + "epoch": 0.2682653325282498, + "grad_norm": 0.671875, + "learning_rate": 1.917263156319971e-05, + "loss": 1.5371, + "step": 1555 + }, + { + "epoch": 0.2684378504269818, + "grad_norm": 0.7421875, + "learning_rate": 1.9171547148901158e-05, + "loss": 1.5787, + "step": 1556 + }, + { + "epoch": 0.2686103683257138, + "grad_norm": 0.734375, + "learning_rate": 1.9170462055117086e-05, + "loss": 1.4766, + "step": 1557 + }, + { + "epoch": 0.2687828862244458, + "grad_norm": 0.703125, + "learning_rate": 1.916937628192789e-05, + "loss": 1.4528, + "step": 1558 + }, + { + "epoch": 0.26895540412317775, + "grad_norm": 0.7265625, + "learning_rate": 1.9168289829414002e-05, + "loss": 1.5461, + "step": 1559 + }, + { + "epoch": 0.26912792202190977, + "grad_norm": 0.64453125, + "learning_rate": 1.916720269765592e-05, + "loss": 1.4871, + "step": 1560 + }, + { + "epoch": 0.2693004399206418, + "grad_norm": 0.6796875, + "learning_rate": 1.916611488673418e-05, + "loss": 1.3621, + "step": 1561 + }, + { + "epoch": 0.26947295781937375, + "grad_norm": 0.66015625, + "learning_rate": 1.9165026396729377e-05, + "loss": 1.5283, + "step": 1562 + }, + { + "epoch": 0.26964547571810576, + "grad_norm": 0.609375, + "learning_rate": 1.9163937227722154e-05, + "loss": 1.4035, + "step": 1563 + }, + { + "epoch": 0.2698179936168377, + "grad_norm": 0.609375, + "learning_rate": 1.9162847379793203e-05, + "loss": 1.5184, + "step": 1564 + }, + { + "epoch": 0.26999051151556974, + "grad_norm": 0.62890625, + "learning_rate": 1.9161756853023266e-05, + "loss": 1.4852, + "step": 1565 + }, + { + "epoch": 0.27016302941430176, + "grad_norm": 0.6015625, + "learning_rate": 1.9160665647493136e-05, + "loss": 1.542, + "step": 1566 + }, + { + "epoch": 0.2703355473130337, + "grad_norm": 0.6484375, + "learning_rate": 1.9159573763283654e-05, + "loss": 1.4912, + "step": 1567 + }, + { + "epoch": 0.27050806521176574, + "grad_norm": 0.61328125, + "learning_rate": 1.9158481200475715e-05, + "loss": 1.5102, + "step": 1568 + }, + { + "epoch": 0.2706805831104977, + "grad_norm": 0.6328125, + "learning_rate": 1.9157387959150265e-05, + "loss": 1.4076, + "step": 1569 + }, + { + "epoch": 0.2708531010092297, + "grad_norm": 0.65234375, + "learning_rate": 1.91562940393883e-05, + "loss": 1.4054, + "step": 1570 + }, + { + "epoch": 0.2710256189079617, + "grad_norm": 0.64453125, + "learning_rate": 1.9155199441270863e-05, + "loss": 1.5908, + "step": 1571 + }, + { + "epoch": 0.2711981368066937, + "grad_norm": 0.6640625, + "learning_rate": 1.915410416487904e-05, + "loss": 1.518, + "step": 1572 + }, + { + "epoch": 0.2713706547054257, + "grad_norm": 0.625, + "learning_rate": 1.9153008210293988e-05, + "loss": 1.4726, + "step": 1573 + }, + { + "epoch": 0.27154317260415767, + "grad_norm": 1.09375, + "learning_rate": 1.9151911577596895e-05, + "loss": 1.5939, + "step": 1574 + }, + { + "epoch": 0.2717156905028897, + "grad_norm": 0.625, + "learning_rate": 1.915081426686901e-05, + "loss": 1.5122, + "step": 1575 + }, + { + "epoch": 0.27188820840162164, + "grad_norm": 0.76171875, + "learning_rate": 1.9149716278191625e-05, + "loss": 1.4648, + "step": 1576 + }, + { + "epoch": 0.27206072630035366, + "grad_norm": 0.6171875, + "learning_rate": 1.914861761164609e-05, + "loss": 1.4813, + "step": 1577 + }, + { + "epoch": 0.2722332441990857, + "grad_norm": 0.69921875, + "learning_rate": 1.91475182673138e-05, + "loss": 1.5682, + "step": 1578 + }, + { + "epoch": 0.27240576209781764, + "grad_norm": 0.65625, + "learning_rate": 1.91464182452762e-05, + "loss": 1.6082, + "step": 1579 + }, + { + "epoch": 0.27257827999654966, + "grad_norm": 0.9140625, + "learning_rate": 1.9145317545614787e-05, + "loss": 1.6143, + "step": 1580 + }, + { + "epoch": 0.2727507978952816, + "grad_norm": 0.8828125, + "learning_rate": 1.9144216168411105e-05, + "loss": 1.5168, + "step": 1581 + }, + { + "epoch": 0.27292331579401363, + "grad_norm": 0.77734375, + "learning_rate": 1.9143114113746755e-05, + "loss": 1.4507, + "step": 1582 + }, + { + "epoch": 0.27309583369274565, + "grad_norm": 0.6328125, + "learning_rate": 1.9142011381703384e-05, + "loss": 1.738, + "step": 1583 + }, + { + "epoch": 0.2732683515914776, + "grad_norm": 0.859375, + "learning_rate": 1.9140907972362684e-05, + "loss": 1.5551, + "step": 1584 + }, + { + "epoch": 0.2734408694902096, + "grad_norm": 0.65625, + "learning_rate": 1.9139803885806413e-05, + "loss": 1.4516, + "step": 1585 + }, + { + "epoch": 0.2736133873889416, + "grad_norm": 0.7265625, + "learning_rate": 1.9138699122116355e-05, + "loss": 1.5982, + "step": 1586 + }, + { + "epoch": 0.2737859052876736, + "grad_norm": 0.7421875, + "learning_rate": 1.913759368137437e-05, + "loss": 1.5159, + "step": 1587 + }, + { + "epoch": 0.27395842318640556, + "grad_norm": 0.58203125, + "learning_rate": 1.913648756366235e-05, + "loss": 1.5375, + "step": 1588 + }, + { + "epoch": 0.2741309410851376, + "grad_norm": 0.578125, + "learning_rate": 1.913538076906224e-05, + "loss": 1.5159, + "step": 1589 + }, + { + "epoch": 0.2743034589838696, + "grad_norm": 0.76171875, + "learning_rate": 1.913427329765604e-05, + "loss": 1.5594, + "step": 1590 + }, + { + "epoch": 0.27447597688260156, + "grad_norm": 0.64453125, + "learning_rate": 1.913316514952581e-05, + "loss": 1.5788, + "step": 1591 + }, + { + "epoch": 0.2746484947813336, + "grad_norm": 0.65625, + "learning_rate": 1.9132056324753634e-05, + "loss": 1.4806, + "step": 1592 + }, + { + "epoch": 0.27482101268006554, + "grad_norm": 0.59765625, + "learning_rate": 1.9130946823421666e-05, + "loss": 1.6308, + "step": 1593 + }, + { + "epoch": 0.27499353057879755, + "grad_norm": 0.78125, + "learning_rate": 1.9129836645612107e-05, + "loss": 1.5213, + "step": 1594 + }, + { + "epoch": 0.27516604847752957, + "grad_norm": 0.609375, + "learning_rate": 1.91287257914072e-05, + "loss": 1.3756, + "step": 1595 + }, + { + "epoch": 0.27533856637626153, + "grad_norm": 0.62890625, + "learning_rate": 1.912761426088925e-05, + "loss": 1.591, + "step": 1596 + }, + { + "epoch": 0.27551108427499355, + "grad_norm": 0.8046875, + "learning_rate": 1.91265020541406e-05, + "loss": 1.4783, + "step": 1597 + }, + { + "epoch": 0.2756836021737255, + "grad_norm": 0.6015625, + "learning_rate": 1.9125389171243656e-05, + "loss": 1.4279, + "step": 1598 + }, + { + "epoch": 0.2758561200724575, + "grad_norm": 0.74609375, + "learning_rate": 1.912427561228086e-05, + "loss": 1.487, + "step": 1599 + }, + { + "epoch": 0.2760286379711895, + "grad_norm": 0.87890625, + "learning_rate": 1.912316137733472e-05, + "loss": 1.4767, + "step": 1600 + }, + { + "epoch": 0.2760286379711895, + "eval_loss": 1.498151421546936, + "eval_runtime": 10.8117, + "eval_samples_per_second": 94.712, + "eval_steps_per_second": 23.678, + "step": 1600 + }, + { + "epoch": 0.2762011558699215, + "grad_norm": 0.72265625, + "learning_rate": 1.9122046466487776e-05, + "loss": 1.4793, + "step": 1601 + }, + { + "epoch": 0.2763736737686535, + "grad_norm": 1.046875, + "learning_rate": 1.912093087982264e-05, + "loss": 1.6061, + "step": 1602 + }, + { + "epoch": 0.2765461916673855, + "grad_norm": 0.94140625, + "learning_rate": 1.911981461742195e-05, + "loss": 1.4781, + "step": 1603 + }, + { + "epoch": 0.2767187095661175, + "grad_norm": 0.6796875, + "learning_rate": 1.9118697679368412e-05, + "loss": 1.5748, + "step": 1604 + }, + { + "epoch": 0.27689122746484945, + "grad_norm": 1.046875, + "learning_rate": 1.911758006574477e-05, + "loss": 1.5885, + "step": 1605 + }, + { + "epoch": 0.27706374536358147, + "grad_norm": 1.078125, + "learning_rate": 1.911646177663383e-05, + "loss": 1.533, + "step": 1606 + }, + { + "epoch": 0.2772362632623135, + "grad_norm": 0.60546875, + "learning_rate": 1.9115342812118437e-05, + "loss": 1.4983, + "step": 1607 + }, + { + "epoch": 0.27740878116104545, + "grad_norm": 0.7265625, + "learning_rate": 1.9114223172281498e-05, + "loss": 1.4582, + "step": 1608 + }, + { + "epoch": 0.27758129905977746, + "grad_norm": 0.71875, + "learning_rate": 1.9113102857205952e-05, + "loss": 1.4498, + "step": 1609 + }, + { + "epoch": 0.2777538169585094, + "grad_norm": 0.61328125, + "learning_rate": 1.911198186697481e-05, + "loss": 1.575, + "step": 1610 + }, + { + "epoch": 0.27792633485724144, + "grad_norm": 0.66015625, + "learning_rate": 1.9110860201671112e-05, + "loss": 1.5972, + "step": 1611 + }, + { + "epoch": 0.27809885275597346, + "grad_norm": 0.7265625, + "learning_rate": 1.9109737861377967e-05, + "loss": 1.5344, + "step": 1612 + }, + { + "epoch": 0.2782713706547054, + "grad_norm": 0.66796875, + "learning_rate": 1.910861484617852e-05, + "loss": 1.4919, + "step": 1613 + }, + { + "epoch": 0.27844388855343744, + "grad_norm": 0.58984375, + "learning_rate": 1.9107491156155974e-05, + "loss": 1.4568, + "step": 1614 + }, + { + "epoch": 0.2786164064521694, + "grad_norm": 0.96484375, + "learning_rate": 1.9106366791393573e-05, + "loss": 1.5189, + "step": 1615 + }, + { + "epoch": 0.2787889243509014, + "grad_norm": 0.66015625, + "learning_rate": 1.9105241751974624e-05, + "loss": 1.4218, + "step": 1616 + }, + { + "epoch": 0.2789614422496334, + "grad_norm": 0.65625, + "learning_rate": 1.910411603798247e-05, + "loss": 1.5128, + "step": 1617 + }, + { + "epoch": 0.2791339601483654, + "grad_norm": 0.6640625, + "learning_rate": 1.910298964950052e-05, + "loss": 1.5849, + "step": 1618 + }, + { + "epoch": 0.2793064780470974, + "grad_norm": 0.66015625, + "learning_rate": 1.9101862586612214e-05, + "loss": 1.4772, + "step": 1619 + }, + { + "epoch": 0.27947899594582937, + "grad_norm": 0.5703125, + "learning_rate": 1.9100734849401063e-05, + "loss": 1.5158, + "step": 1620 + }, + { + "epoch": 0.2796515138445614, + "grad_norm": 0.83984375, + "learning_rate": 1.9099606437950605e-05, + "loss": 1.5205, + "step": 1621 + }, + { + "epoch": 0.27982403174329334, + "grad_norm": 0.71484375, + "learning_rate": 1.9098477352344443e-05, + "loss": 1.4296, + "step": 1622 + }, + { + "epoch": 0.27999654964202536, + "grad_norm": 0.64453125, + "learning_rate": 1.9097347592666232e-05, + "loss": 1.5096, + "step": 1623 + }, + { + "epoch": 0.2801690675407574, + "grad_norm": 0.71875, + "learning_rate": 1.9096217158999667e-05, + "loss": 1.4957, + "step": 1624 + }, + { + "epoch": 0.28034158543948934, + "grad_norm": 0.640625, + "learning_rate": 1.90950860514285e-05, + "loss": 1.4951, + "step": 1625 + }, + { + "epoch": 0.28051410333822135, + "grad_norm": 0.703125, + "learning_rate": 1.909395427003653e-05, + "loss": 1.5029, + "step": 1626 + }, + { + "epoch": 0.2806866212369533, + "grad_norm": 0.73046875, + "learning_rate": 1.909282181490761e-05, + "loss": 1.512, + "step": 1627 + }, + { + "epoch": 0.28085913913568533, + "grad_norm": 0.625, + "learning_rate": 1.9091688686125628e-05, + "loss": 1.5529, + "step": 1628 + }, + { + "epoch": 0.28103165703441735, + "grad_norm": 0.6171875, + "learning_rate": 1.9090554883774547e-05, + "loss": 1.5084, + "step": 1629 + }, + { + "epoch": 0.2812041749331493, + "grad_norm": 0.58984375, + "learning_rate": 1.9089420407938354e-05, + "loss": 1.4911, + "step": 1630 + }, + { + "epoch": 0.2813766928318813, + "grad_norm": 0.65625, + "learning_rate": 1.9088285258701108e-05, + "loss": 1.5517, + "step": 1631 + }, + { + "epoch": 0.2815492107306133, + "grad_norm": 0.6953125, + "learning_rate": 1.90871494361469e-05, + "loss": 1.3719, + "step": 1632 + }, + { + "epoch": 0.2817217286293453, + "grad_norm": 0.57421875, + "learning_rate": 1.9086012940359887e-05, + "loss": 1.4742, + "step": 1633 + }, + { + "epoch": 0.28189424652807726, + "grad_norm": 0.66015625, + "learning_rate": 1.908487577142426e-05, + "loss": 1.4639, + "step": 1634 + }, + { + "epoch": 0.2820667644268093, + "grad_norm": 0.6796875, + "learning_rate": 1.9083737929424272e-05, + "loss": 1.5469, + "step": 1635 + }, + { + "epoch": 0.2822392823255413, + "grad_norm": 0.59765625, + "learning_rate": 1.9082599414444222e-05, + "loss": 1.5729, + "step": 1636 + }, + { + "epoch": 0.28241180022427326, + "grad_norm": 0.625, + "learning_rate": 1.9081460226568456e-05, + "loss": 1.5184, + "step": 1637 + }, + { + "epoch": 0.2825843181230053, + "grad_norm": 1.0234375, + "learning_rate": 1.908032036588137e-05, + "loss": 1.5415, + "step": 1638 + }, + { + "epoch": 0.28275683602173723, + "grad_norm": 0.71484375, + "learning_rate": 1.9079179832467417e-05, + "loss": 1.5335, + "step": 1639 + }, + { + "epoch": 0.28292935392046925, + "grad_norm": 0.609375, + "learning_rate": 1.9078038626411093e-05, + "loss": 1.5226, + "step": 1640 + }, + { + "epoch": 0.28310187181920127, + "grad_norm": 0.5703125, + "learning_rate": 1.9076896747796945e-05, + "loss": 1.4499, + "step": 1641 + }, + { + "epoch": 0.28327438971793323, + "grad_norm": 0.60546875, + "learning_rate": 1.9075754196709574e-05, + "loss": 1.5911, + "step": 1642 + }, + { + "epoch": 0.28344690761666524, + "grad_norm": 0.640625, + "learning_rate": 1.9074610973233622e-05, + "loss": 1.5831, + "step": 1643 + }, + { + "epoch": 0.2836194255153972, + "grad_norm": 0.62109375, + "learning_rate": 1.9073467077453783e-05, + "loss": 1.4548, + "step": 1644 + }, + { + "epoch": 0.2837919434141292, + "grad_norm": 0.60546875, + "learning_rate": 1.9072322509454814e-05, + "loss": 1.5288, + "step": 1645 + }, + { + "epoch": 0.2839644613128612, + "grad_norm": 0.671875, + "learning_rate": 1.9071177269321507e-05, + "loss": 1.4839, + "step": 1646 + }, + { + "epoch": 0.2841369792115932, + "grad_norm": 0.6640625, + "learning_rate": 1.907003135713871e-05, + "loss": 1.5464, + "step": 1647 + }, + { + "epoch": 0.2843094971103252, + "grad_norm": 0.60546875, + "learning_rate": 1.9068884772991313e-05, + "loss": 1.6519, + "step": 1648 + }, + { + "epoch": 0.2844820150090572, + "grad_norm": 1.890625, + "learning_rate": 1.9067737516964274e-05, + "loss": 1.4762, + "step": 1649 + }, + { + "epoch": 0.2846545329077892, + "grad_norm": 0.6328125, + "learning_rate": 1.9066589589142577e-05, + "loss": 1.5852, + "step": 1650 + }, + { + "epoch": 0.28482705080652115, + "grad_norm": 0.64453125, + "learning_rate": 1.9065440989611274e-05, + "loss": 1.6728, + "step": 1651 + }, + { + "epoch": 0.28499956870525317, + "grad_norm": 0.6015625, + "learning_rate": 1.9064291718455455e-05, + "loss": 1.4933, + "step": 1652 + }, + { + "epoch": 0.2851720866039852, + "grad_norm": 0.72265625, + "learning_rate": 1.9063141775760274e-05, + "loss": 1.5006, + "step": 1653 + }, + { + "epoch": 0.28534460450271715, + "grad_norm": 0.6171875, + "learning_rate": 1.906199116161092e-05, + "loss": 1.4257, + "step": 1654 + }, + { + "epoch": 0.28551712240144916, + "grad_norm": 0.828125, + "learning_rate": 1.906083987609264e-05, + "loss": 1.5513, + "step": 1655 + }, + { + "epoch": 0.2856896403001811, + "grad_norm": 0.63671875, + "learning_rate": 1.9059687919290727e-05, + "loss": 1.5997, + "step": 1656 + }, + { + "epoch": 0.28586215819891314, + "grad_norm": 0.9375, + "learning_rate": 1.9058535291290524e-05, + "loss": 1.6176, + "step": 1657 + }, + { + "epoch": 0.28603467609764516, + "grad_norm": 0.65234375, + "learning_rate": 1.9057381992177426e-05, + "loss": 1.5312, + "step": 1658 + }, + { + "epoch": 0.2862071939963771, + "grad_norm": 0.6875, + "learning_rate": 1.905622802203688e-05, + "loss": 1.6063, + "step": 1659 + }, + { + "epoch": 0.28637971189510913, + "grad_norm": 0.76953125, + "learning_rate": 1.9055073380954372e-05, + "loss": 1.5179, + "step": 1660 + }, + { + "epoch": 0.2865522297938411, + "grad_norm": 0.67578125, + "learning_rate": 1.905391806901545e-05, + "loss": 1.4903, + "step": 1661 + }, + { + "epoch": 0.2867247476925731, + "grad_norm": 0.76953125, + "learning_rate": 1.9052762086305706e-05, + "loss": 1.583, + "step": 1662 + }, + { + "epoch": 0.2868972655913051, + "grad_norm": 0.70703125, + "learning_rate": 1.9051605432910783e-05, + "loss": 1.5453, + "step": 1663 + }, + { + "epoch": 0.2870697834900371, + "grad_norm": 0.75390625, + "learning_rate": 1.9050448108916373e-05, + "loss": 1.474, + "step": 1664 + }, + { + "epoch": 0.2872423013887691, + "grad_norm": 0.6484375, + "learning_rate": 1.904929011440822e-05, + "loss": 1.4971, + "step": 1665 + }, + { + "epoch": 0.28741481928750107, + "grad_norm": 0.65234375, + "learning_rate": 1.9048131449472107e-05, + "loss": 1.5126, + "step": 1666 + }, + { + "epoch": 0.2875873371862331, + "grad_norm": 0.61328125, + "learning_rate": 1.9046972114193884e-05, + "loss": 1.4895, + "step": 1667 + }, + { + "epoch": 0.28775985508496504, + "grad_norm": 1.2890625, + "learning_rate": 1.904581210865944e-05, + "loss": 1.4972, + "step": 1668 + }, + { + "epoch": 0.28793237298369706, + "grad_norm": 0.76953125, + "learning_rate": 1.904465143295471e-05, + "loss": 1.5122, + "step": 1669 + }, + { + "epoch": 0.2881048908824291, + "grad_norm": 0.71875, + "learning_rate": 1.904349008716569e-05, + "loss": 1.4859, + "step": 1670 + }, + { + "epoch": 0.28827740878116104, + "grad_norm": 0.9296875, + "learning_rate": 1.904232807137842e-05, + "loss": 1.5311, + "step": 1671 + }, + { + "epoch": 0.28844992667989305, + "grad_norm": 0.640625, + "learning_rate": 1.9041165385678984e-05, + "loss": 1.4518, + "step": 1672 + }, + { + "epoch": 0.288622444578625, + "grad_norm": 0.72265625, + "learning_rate": 1.9040002030153532e-05, + "loss": 1.5663, + "step": 1673 + }, + { + "epoch": 0.28879496247735703, + "grad_norm": 0.82421875, + "learning_rate": 1.903883800488824e-05, + "loss": 1.4741, + "step": 1674 + }, + { + "epoch": 0.288967480376089, + "grad_norm": 0.62109375, + "learning_rate": 1.9037673309969347e-05, + "loss": 1.5701, + "step": 1675 + }, + { + "epoch": 0.289139998274821, + "grad_norm": 0.71875, + "learning_rate": 1.9036507945483152e-05, + "loss": 1.5381, + "step": 1676 + }, + { + "epoch": 0.289312516173553, + "grad_norm": 0.57421875, + "learning_rate": 1.9035341911515983e-05, + "loss": 1.4083, + "step": 1677 + }, + { + "epoch": 0.289485034072285, + "grad_norm": 0.58203125, + "learning_rate": 1.9034175208154227e-05, + "loss": 1.5595, + "step": 1678 + }, + { + "epoch": 0.289657551971017, + "grad_norm": 0.5859375, + "learning_rate": 1.903300783548433e-05, + "loss": 1.5761, + "step": 1679 + }, + { + "epoch": 0.28983006986974896, + "grad_norm": 0.578125, + "learning_rate": 1.9031839793592764e-05, + "loss": 1.5564, + "step": 1680 + }, + { + "epoch": 0.290002587768481, + "grad_norm": 0.63671875, + "learning_rate": 1.9030671082566076e-05, + "loss": 1.4697, + "step": 1681 + }, + { + "epoch": 0.290175105667213, + "grad_norm": 0.6953125, + "learning_rate": 1.9029501702490848e-05, + "loss": 1.4602, + "step": 1682 + }, + { + "epoch": 0.29034762356594496, + "grad_norm": 0.67578125, + "learning_rate": 1.9028331653453715e-05, + "loss": 1.5515, + "step": 1683 + }, + { + "epoch": 0.290520141464677, + "grad_norm": 0.67578125, + "learning_rate": 1.9027160935541365e-05, + "loss": 1.3977, + "step": 1684 + }, + { + "epoch": 0.29069265936340893, + "grad_norm": 0.73828125, + "learning_rate": 1.902598954884052e-05, + "loss": 1.5655, + "step": 1685 + }, + { + "epoch": 0.29086517726214095, + "grad_norm": 0.5703125, + "learning_rate": 1.902481749343798e-05, + "loss": 1.5035, + "step": 1686 + }, + { + "epoch": 0.29103769516087297, + "grad_norm": 0.99609375, + "learning_rate": 1.9023644769420567e-05, + "loss": 1.5112, + "step": 1687 + }, + { + "epoch": 0.2912102130596049, + "grad_norm": 1.5234375, + "learning_rate": 1.902247137687517e-05, + "loss": 1.5383, + "step": 1688 + }, + { + "epoch": 0.29138273095833694, + "grad_norm": 0.69921875, + "learning_rate": 1.9021297315888715e-05, + "loss": 1.5061, + "step": 1689 + }, + { + "epoch": 0.2915552488570689, + "grad_norm": 0.8125, + "learning_rate": 1.902012258654819e-05, + "loss": 1.5495, + "step": 1690 + }, + { + "epoch": 0.2917277667558009, + "grad_norm": 0.6328125, + "learning_rate": 1.9018947188940623e-05, + "loss": 1.5872, + "step": 1691 + }, + { + "epoch": 0.2919002846545329, + "grad_norm": 0.90625, + "learning_rate": 1.901777112315309e-05, + "loss": 1.4907, + "step": 1692 + }, + { + "epoch": 0.2920728025532649, + "grad_norm": 0.78515625, + "learning_rate": 1.9016594389272734e-05, + "loss": 1.5275, + "step": 1693 + }, + { + "epoch": 0.2922453204519969, + "grad_norm": 0.75, + "learning_rate": 1.9015416987386725e-05, + "loss": 1.5049, + "step": 1694 + }, + { + "epoch": 0.2924178383507289, + "grad_norm": 0.8203125, + "learning_rate": 1.9014238917582297e-05, + "loss": 1.6413, + "step": 1695 + }, + { + "epoch": 0.2925903562494609, + "grad_norm": 0.72265625, + "learning_rate": 1.9013060179946722e-05, + "loss": 1.4572, + "step": 1696 + }, + { + "epoch": 0.29276287414819285, + "grad_norm": 0.83203125, + "learning_rate": 1.901188077456733e-05, + "loss": 1.4106, + "step": 1697 + }, + { + "epoch": 0.29293539204692487, + "grad_norm": 0.6875, + "learning_rate": 1.901070070153151e-05, + "loss": 1.5794, + "step": 1698 + }, + { + "epoch": 0.2931079099456569, + "grad_norm": 0.671875, + "learning_rate": 1.900951996092667e-05, + "loss": 1.5461, + "step": 1699 + }, + { + "epoch": 0.29328042784438885, + "grad_norm": 0.64453125, + "learning_rate": 1.9008338552840308e-05, + "loss": 1.4498, + "step": 1700 + }, + { + "epoch": 0.29328042784438885, + "eval_loss": 1.4925167560577393, + "eval_runtime": 11.161, + "eval_samples_per_second": 91.748, + "eval_steps_per_second": 22.937, + "step": 1700 + }, + { + "epoch": 0.29345294574312086, + "grad_norm": 0.87109375, + "learning_rate": 1.9007156477359935e-05, + "loss": 1.5066, + "step": 1701 + }, + { + "epoch": 0.2936254636418528, + "grad_norm": 0.76171875, + "learning_rate": 1.900597373457313e-05, + "loss": 1.4833, + "step": 1702 + }, + { + "epoch": 0.29379798154058484, + "grad_norm": 0.875, + "learning_rate": 1.900479032456752e-05, + "loss": 1.4667, + "step": 1703 + }, + { + "epoch": 0.2939704994393168, + "grad_norm": 1.625, + "learning_rate": 1.9003606247430774e-05, + "loss": 1.5012, + "step": 1704 + }, + { + "epoch": 0.2941430173380488, + "grad_norm": 0.75, + "learning_rate": 1.9002421503250626e-05, + "loss": 1.5049, + "step": 1705 + }, + { + "epoch": 0.29431553523678083, + "grad_norm": 0.68359375, + "learning_rate": 1.900123609211484e-05, + "loss": 1.5594, + "step": 1706 + }, + { + "epoch": 0.2944880531355128, + "grad_norm": 0.671875, + "learning_rate": 1.9000050014111245e-05, + "loss": 1.4642, + "step": 1707 + }, + { + "epoch": 0.2946605710342448, + "grad_norm": 0.8125, + "learning_rate": 1.8998863269327706e-05, + "loss": 1.5367, + "step": 1708 + }, + { + "epoch": 0.2948330889329768, + "grad_norm": 0.6640625, + "learning_rate": 1.8997675857852148e-05, + "loss": 1.3788, + "step": 1709 + }, + { + "epoch": 0.2950056068317088, + "grad_norm": 0.65625, + "learning_rate": 1.899648777977255e-05, + "loss": 1.5682, + "step": 1710 + }, + { + "epoch": 0.2951781247304408, + "grad_norm": 0.66796875, + "learning_rate": 1.8995299035176914e-05, + "loss": 1.4895, + "step": 1711 + }, + { + "epoch": 0.29535064262917277, + "grad_norm": 0.66796875, + "learning_rate": 1.8994109624153327e-05, + "loss": 1.3757, + "step": 1712 + }, + { + "epoch": 0.2955231605279048, + "grad_norm": 0.69921875, + "learning_rate": 1.89929195467899e-05, + "loss": 1.4907, + "step": 1713 + }, + { + "epoch": 0.29569567842663674, + "grad_norm": 0.875, + "learning_rate": 1.8991728803174804e-05, + "loss": 1.3716, + "step": 1714 + }, + { + "epoch": 0.29586819632536876, + "grad_norm": 0.66015625, + "learning_rate": 1.899053739339625e-05, + "loss": 1.5605, + "step": 1715 + }, + { + "epoch": 0.2960407142241008, + "grad_norm": 0.83203125, + "learning_rate": 1.8989345317542516e-05, + "loss": 1.5747, + "step": 1716 + }, + { + "epoch": 0.29621323212283274, + "grad_norm": 0.95703125, + "learning_rate": 1.8988152575701912e-05, + "loss": 1.5484, + "step": 1717 + }, + { + "epoch": 0.29638575002156475, + "grad_norm": 0.71484375, + "learning_rate": 1.8986959167962806e-05, + "loss": 1.4819, + "step": 1718 + }, + { + "epoch": 0.2965582679202967, + "grad_norm": 0.70703125, + "learning_rate": 1.8985765094413607e-05, + "loss": 1.526, + "step": 1719 + }, + { + "epoch": 0.29673078581902873, + "grad_norm": 0.7265625, + "learning_rate": 1.8984570355142787e-05, + "loss": 1.5645, + "step": 1720 + }, + { + "epoch": 0.2969033037177607, + "grad_norm": 0.6171875, + "learning_rate": 1.8983374950238854e-05, + "loss": 1.5015, + "step": 1721 + }, + { + "epoch": 0.2970758216164927, + "grad_norm": 0.6015625, + "learning_rate": 1.8982178879790377e-05, + "loss": 1.5595, + "step": 1722 + }, + { + "epoch": 0.2972483395152247, + "grad_norm": 0.62890625, + "learning_rate": 1.8980982143885965e-05, + "loss": 1.5058, + "step": 1723 + }, + { + "epoch": 0.2974208574139567, + "grad_norm": 0.87109375, + "learning_rate": 1.8979784742614283e-05, + "loss": 1.5412, + "step": 1724 + }, + { + "epoch": 0.2975933753126887, + "grad_norm": 0.64453125, + "learning_rate": 1.8978586676064036e-05, + "loss": 1.4126, + "step": 1725 + }, + { + "epoch": 0.29776589321142066, + "grad_norm": 0.828125, + "learning_rate": 1.8977387944323985e-05, + "loss": 1.5317, + "step": 1726 + }, + { + "epoch": 0.2979384111101527, + "grad_norm": 0.62109375, + "learning_rate": 1.8976188547482944e-05, + "loss": 1.5283, + "step": 1727 + }, + { + "epoch": 0.2981109290088847, + "grad_norm": 0.6484375, + "learning_rate": 1.897498848562977e-05, + "loss": 1.5747, + "step": 1728 + }, + { + "epoch": 0.29828344690761666, + "grad_norm": 0.6953125, + "learning_rate": 1.8973787758853367e-05, + "loss": 1.4914, + "step": 1729 + }, + { + "epoch": 0.2984559648063487, + "grad_norm": 0.59375, + "learning_rate": 1.89725863672427e-05, + "loss": 1.5348, + "step": 1730 + }, + { + "epoch": 0.29862848270508063, + "grad_norm": 0.63671875, + "learning_rate": 1.8971384310886773e-05, + "loss": 1.468, + "step": 1731 + }, + { + "epoch": 0.29880100060381265, + "grad_norm": 0.625, + "learning_rate": 1.8970181589874637e-05, + "loss": 1.4735, + "step": 1732 + }, + { + "epoch": 0.29897351850254467, + "grad_norm": 0.59765625, + "learning_rate": 1.89689782042954e-05, + "loss": 1.46, + "step": 1733 + }, + { + "epoch": 0.2991460364012766, + "grad_norm": 0.671875, + "learning_rate": 1.896777415423822e-05, + "loss": 1.5731, + "step": 1734 + }, + { + "epoch": 0.29931855430000864, + "grad_norm": 0.57421875, + "learning_rate": 1.8966569439792294e-05, + "loss": 1.4503, + "step": 1735 + }, + { + "epoch": 0.2994910721987406, + "grad_norm": 0.76171875, + "learning_rate": 1.896536406104688e-05, + "loss": 1.5463, + "step": 1736 + }, + { + "epoch": 0.2996635900974726, + "grad_norm": 0.65234375, + "learning_rate": 1.8964158018091278e-05, + "loss": 1.5302, + "step": 1737 + }, + { + "epoch": 0.2998361079962046, + "grad_norm": 0.640625, + "learning_rate": 1.8962951311014842e-05, + "loss": 1.4495, + "step": 1738 + }, + { + "epoch": 0.3000086258949366, + "grad_norm": 0.7265625, + "learning_rate": 1.896174393990697e-05, + "loss": 1.5159, + "step": 1739 + }, + { + "epoch": 0.3001811437936686, + "grad_norm": 0.60546875, + "learning_rate": 1.8960535904857103e-05, + "loss": 1.4311, + "step": 1740 + }, + { + "epoch": 0.3003536616924006, + "grad_norm": 0.69140625, + "learning_rate": 1.8959327205954757e-05, + "loss": 1.5733, + "step": 1741 + }, + { + "epoch": 0.3005261795911326, + "grad_norm": 1.0078125, + "learning_rate": 1.895811784328947e-05, + "loss": 1.4511, + "step": 1742 + }, + { + "epoch": 0.30069869748986455, + "grad_norm": 0.74609375, + "learning_rate": 1.8956907816950837e-05, + "loss": 1.4835, + "step": 1743 + }, + { + "epoch": 0.30087121538859657, + "grad_norm": 0.640625, + "learning_rate": 1.895569712702851e-05, + "loss": 1.4882, + "step": 1744 + }, + { + "epoch": 0.3010437332873286, + "grad_norm": 0.578125, + "learning_rate": 1.895448577361218e-05, + "loss": 1.5585, + "step": 1745 + }, + { + "epoch": 0.30121625118606055, + "grad_norm": 0.6328125, + "learning_rate": 1.8953273756791595e-05, + "loss": 1.4595, + "step": 1746 + }, + { + "epoch": 0.30138876908479256, + "grad_norm": 0.66015625, + "learning_rate": 1.8952061076656547e-05, + "loss": 1.56, + "step": 1747 + }, + { + "epoch": 0.3015612869835245, + "grad_norm": 0.6953125, + "learning_rate": 1.8950847733296877e-05, + "loss": 1.4842, + "step": 1748 + }, + { + "epoch": 0.30173380488225654, + "grad_norm": 0.82421875, + "learning_rate": 1.8949633726802484e-05, + "loss": 1.5915, + "step": 1749 + }, + { + "epoch": 0.3019063227809885, + "grad_norm": 0.89453125, + "learning_rate": 1.89484190572633e-05, + "loss": 1.5447, + "step": 1750 + }, + { + "epoch": 0.3020788406797205, + "grad_norm": 0.9140625, + "learning_rate": 1.8947203724769324e-05, + "loss": 1.5063, + "step": 1751 + }, + { + "epoch": 0.30225135857845253, + "grad_norm": 0.68359375, + "learning_rate": 1.894598772941059e-05, + "loss": 1.4247, + "step": 1752 + }, + { + "epoch": 0.3024238764771845, + "grad_norm": 0.83203125, + "learning_rate": 1.8944771071277188e-05, + "loss": 1.5098, + "step": 1753 + }, + { + "epoch": 0.3025963943759165, + "grad_norm": 0.70703125, + "learning_rate": 1.8943553750459256e-05, + "loss": 1.6315, + "step": 1754 + }, + { + "epoch": 0.30276891227464847, + "grad_norm": 0.75390625, + "learning_rate": 1.8942335767046978e-05, + "loss": 1.5184, + "step": 1755 + }, + { + "epoch": 0.3029414301733805, + "grad_norm": 0.98828125, + "learning_rate": 1.8941117121130594e-05, + "loss": 1.429, + "step": 1756 + }, + { + "epoch": 0.3031139480721125, + "grad_norm": 0.62890625, + "learning_rate": 1.8939897812800385e-05, + "loss": 1.5151, + "step": 1757 + }, + { + "epoch": 0.30328646597084447, + "grad_norm": 0.65234375, + "learning_rate": 1.893867784214669e-05, + "loss": 1.491, + "step": 1758 + }, + { + "epoch": 0.3034589838695765, + "grad_norm": 0.6484375, + "learning_rate": 1.8937457209259888e-05, + "loss": 1.4795, + "step": 1759 + }, + { + "epoch": 0.30363150176830844, + "grad_norm": 0.68359375, + "learning_rate": 1.893623591423041e-05, + "loss": 1.5306, + "step": 1760 + }, + { + "epoch": 0.30380401966704046, + "grad_norm": 0.69140625, + "learning_rate": 1.893501395714874e-05, + "loss": 1.4723, + "step": 1761 + }, + { + "epoch": 0.3039765375657725, + "grad_norm": 0.6875, + "learning_rate": 1.893379133810541e-05, + "loss": 1.4674, + "step": 1762 + }, + { + "epoch": 0.30414905546450444, + "grad_norm": 0.56640625, + "learning_rate": 1.8932568057190995e-05, + "loss": 1.4916, + "step": 1763 + }, + { + "epoch": 0.30432157336323645, + "grad_norm": 0.60546875, + "learning_rate": 1.8931344114496127e-05, + "loss": 1.4793, + "step": 1764 + }, + { + "epoch": 0.3044940912619684, + "grad_norm": 0.83203125, + "learning_rate": 1.8930119510111476e-05, + "loss": 1.5519, + "step": 1765 + }, + { + "epoch": 0.30466660916070043, + "grad_norm": 0.83984375, + "learning_rate": 1.8928894244127782e-05, + "loss": 1.5546, + "step": 1766 + }, + { + "epoch": 0.3048391270594324, + "grad_norm": 0.80859375, + "learning_rate": 1.8927668316635804e-05, + "loss": 1.6417, + "step": 1767 + }, + { + "epoch": 0.3050116449581644, + "grad_norm": 0.87109375, + "learning_rate": 1.892644172772638e-05, + "loss": 1.5674, + "step": 1768 + }, + { + "epoch": 0.3051841628568964, + "grad_norm": 0.77734375, + "learning_rate": 1.8925214477490373e-05, + "loss": 1.5066, + "step": 1769 + }, + { + "epoch": 0.3053566807556284, + "grad_norm": 0.80078125, + "learning_rate": 1.8923986566018717e-05, + "loss": 1.571, + "step": 1770 + }, + { + "epoch": 0.3055291986543604, + "grad_norm": 0.75, + "learning_rate": 1.892275799340237e-05, + "loss": 1.5209, + "step": 1771 + }, + { + "epoch": 0.30570171655309236, + "grad_norm": 0.66015625, + "learning_rate": 1.8921528759732363e-05, + "loss": 1.388, + "step": 1772 + }, + { + "epoch": 0.3058742344518244, + "grad_norm": 0.77734375, + "learning_rate": 1.892029886509976e-05, + "loss": 1.5195, + "step": 1773 + }, + { + "epoch": 0.3060467523505564, + "grad_norm": 0.7109375, + "learning_rate": 1.891906830959568e-05, + "loss": 1.5106, + "step": 1774 + }, + { + "epoch": 0.30621927024928836, + "grad_norm": 0.59765625, + "learning_rate": 1.891783709331129e-05, + "loss": 1.3891, + "step": 1775 + }, + { + "epoch": 0.30639178814802037, + "grad_norm": 0.66015625, + "learning_rate": 1.8916605216337807e-05, + "loss": 1.5229, + "step": 1776 + }, + { + "epoch": 0.30656430604675233, + "grad_norm": 0.78515625, + "learning_rate": 1.8915372678766497e-05, + "loss": 1.4953, + "step": 1777 + }, + { + "epoch": 0.30673682394548435, + "grad_norm": 0.6015625, + "learning_rate": 1.8914139480688672e-05, + "loss": 1.5302, + "step": 1778 + }, + { + "epoch": 0.3069093418442163, + "grad_norm": 0.59765625, + "learning_rate": 1.89129056221957e-05, + "loss": 1.4382, + "step": 1779 + }, + { + "epoch": 0.3070818597429483, + "grad_norm": 0.69921875, + "learning_rate": 1.8911671103378983e-05, + "loss": 1.4877, + "step": 1780 + }, + { + "epoch": 0.30725437764168034, + "grad_norm": 0.6484375, + "learning_rate": 1.8910435924329993e-05, + "loss": 1.4971, + "step": 1781 + }, + { + "epoch": 0.3074268955404123, + "grad_norm": 0.640625, + "learning_rate": 1.8909200085140233e-05, + "loss": 1.6221, + "step": 1782 + }, + { + "epoch": 0.3075994134391443, + "grad_norm": 0.609375, + "learning_rate": 1.890796358590126e-05, + "loss": 1.5684, + "step": 1783 + }, + { + "epoch": 0.3077719313378763, + "grad_norm": 0.6328125, + "learning_rate": 1.890672642670469e-05, + "loss": 1.4768, + "step": 1784 + }, + { + "epoch": 0.3079444492366083, + "grad_norm": 0.6328125, + "learning_rate": 1.8905488607642172e-05, + "loss": 1.5532, + "step": 1785 + }, + { + "epoch": 0.3081169671353403, + "grad_norm": 0.69921875, + "learning_rate": 1.8904250128805418e-05, + "loss": 1.5317, + "step": 1786 + }, + { + "epoch": 0.3082894850340723, + "grad_norm": 0.69140625, + "learning_rate": 1.8903010990286174e-05, + "loss": 1.5159, + "step": 1787 + }, + { + "epoch": 0.3084620029328043, + "grad_norm": 0.67578125, + "learning_rate": 1.8901771192176248e-05, + "loss": 1.4687, + "step": 1788 + }, + { + "epoch": 0.30863452083153625, + "grad_norm": 0.70703125, + "learning_rate": 1.8900530734567492e-05, + "loss": 1.5036, + "step": 1789 + }, + { + "epoch": 0.30880703873026827, + "grad_norm": 1.1484375, + "learning_rate": 1.8899289617551803e-05, + "loss": 1.5833, + "step": 1790 + }, + { + "epoch": 0.3089795566290003, + "grad_norm": 0.68359375, + "learning_rate": 1.889804784122114e-05, + "loss": 1.442, + "step": 1791 + }, + { + "epoch": 0.30915207452773225, + "grad_norm": 0.76953125, + "learning_rate": 1.889680540566749e-05, + "loss": 1.468, + "step": 1792 + }, + { + "epoch": 0.30932459242646426, + "grad_norm": 1.0078125, + "learning_rate": 1.8895562310982907e-05, + "loss": 1.4224, + "step": 1793 + }, + { + "epoch": 0.3094971103251962, + "grad_norm": 0.62890625, + "learning_rate": 1.8894318557259485e-05, + "loss": 1.4192, + "step": 1794 + }, + { + "epoch": 0.30966962822392824, + "grad_norm": 0.81640625, + "learning_rate": 1.889307414458937e-05, + "loss": 1.4496, + "step": 1795 + }, + { + "epoch": 0.3098421461226602, + "grad_norm": 0.6640625, + "learning_rate": 1.8891829073064757e-05, + "loss": 1.5036, + "step": 1796 + }, + { + "epoch": 0.3100146640213922, + "grad_norm": 0.6875, + "learning_rate": 1.889058334277789e-05, + "loss": 1.5235, + "step": 1797 + }, + { + "epoch": 0.31018718192012423, + "grad_norm": 0.6796875, + "learning_rate": 1.8889336953821055e-05, + "loss": 1.6028, + "step": 1798 + }, + { + "epoch": 0.3103596998188562, + "grad_norm": 0.6015625, + "learning_rate": 1.8888089906286598e-05, + "loss": 1.3856, + "step": 1799 + }, + { + "epoch": 0.3105322177175882, + "grad_norm": 0.80078125, + "learning_rate": 1.8886842200266905e-05, + "loss": 1.473, + "step": 1800 + }, + { + "epoch": 0.3105322177175882, + "eval_loss": 1.4874216318130493, + "eval_runtime": 10.8464, + "eval_samples_per_second": 94.409, + "eval_steps_per_second": 23.602, + "step": 1800 + }, + { + "epoch": 0.31070473561632017, + "grad_norm": 0.80078125, + "learning_rate": 1.888559383585441e-05, + "loss": 1.6011, + "step": 1801 + }, + { + "epoch": 0.3108772535150522, + "grad_norm": 3.390625, + "learning_rate": 1.888434481314161e-05, + "loss": 1.4107, + "step": 1802 + }, + { + "epoch": 0.3110497714137842, + "grad_norm": 0.70703125, + "learning_rate": 1.888309513222103e-05, + "loss": 1.5343, + "step": 1803 + }, + { + "epoch": 0.31122228931251616, + "grad_norm": 0.70703125, + "learning_rate": 1.8881844793185257e-05, + "loss": 1.5289, + "step": 1804 + }, + { + "epoch": 0.3113948072112482, + "grad_norm": 0.62890625, + "learning_rate": 1.8880593796126925e-05, + "loss": 1.436, + "step": 1805 + }, + { + "epoch": 0.31156732510998014, + "grad_norm": 0.76171875, + "learning_rate": 1.887934214113872e-05, + "loss": 1.5412, + "step": 1806 + }, + { + "epoch": 0.31173984300871216, + "grad_norm": 0.84375, + "learning_rate": 1.887808982831337e-05, + "loss": 1.5912, + "step": 1807 + }, + { + "epoch": 0.3119123609074442, + "grad_norm": 0.7265625, + "learning_rate": 1.887683685774365e-05, + "loss": 1.4521, + "step": 1808 + }, + { + "epoch": 0.31208487880617614, + "grad_norm": 0.7734375, + "learning_rate": 1.887558322952239e-05, + "loss": 1.5337, + "step": 1809 + }, + { + "epoch": 0.31225739670490815, + "grad_norm": 0.9609375, + "learning_rate": 1.887432894374247e-05, + "loss": 1.6269, + "step": 1810 + }, + { + "epoch": 0.3124299146036401, + "grad_norm": 0.6328125, + "learning_rate": 1.8873074000496808e-05, + "loss": 1.469, + "step": 1811 + }, + { + "epoch": 0.31260243250237213, + "grad_norm": 0.84375, + "learning_rate": 1.8871818399878387e-05, + "loss": 1.495, + "step": 1812 + }, + { + "epoch": 0.3127749504011041, + "grad_norm": 0.8125, + "learning_rate": 1.887056214198022e-05, + "loss": 1.5321, + "step": 1813 + }, + { + "epoch": 0.3129474682998361, + "grad_norm": 0.62109375, + "learning_rate": 1.8869305226895386e-05, + "loss": 1.5165, + "step": 1814 + }, + { + "epoch": 0.3131199861985681, + "grad_norm": 0.640625, + "learning_rate": 1.8868047654717005e-05, + "loss": 1.5412, + "step": 1815 + }, + { + "epoch": 0.3132925040973001, + "grad_norm": 0.73046875, + "learning_rate": 1.886678942553824e-05, + "loss": 1.4872, + "step": 1816 + }, + { + "epoch": 0.3134650219960321, + "grad_norm": 0.64453125, + "learning_rate": 1.8865530539452316e-05, + "loss": 1.5929, + "step": 1817 + }, + { + "epoch": 0.31363753989476406, + "grad_norm": 0.68359375, + "learning_rate": 1.8864270996552494e-05, + "loss": 1.5027, + "step": 1818 + }, + { + "epoch": 0.3138100577934961, + "grad_norm": 0.74609375, + "learning_rate": 1.886301079693209e-05, + "loss": 1.5103, + "step": 1819 + }, + { + "epoch": 0.3139825756922281, + "grad_norm": 0.6484375, + "learning_rate": 1.8861749940684464e-05, + "loss": 1.5923, + "step": 1820 + }, + { + "epoch": 0.31415509359096006, + "grad_norm": 0.6640625, + "learning_rate": 1.8860488427903038e-05, + "loss": 1.4984, + "step": 1821 + }, + { + "epoch": 0.31432761148969207, + "grad_norm": 0.65234375, + "learning_rate": 1.8859226258681262e-05, + "loss": 1.5425, + "step": 1822 + }, + { + "epoch": 0.31450012938842403, + "grad_norm": 0.7421875, + "learning_rate": 1.885796343311265e-05, + "loss": 1.425, + "step": 1823 + }, + { + "epoch": 0.31467264728715605, + "grad_norm": 0.62109375, + "learning_rate": 1.885669995129076e-05, + "loss": 1.4554, + "step": 1824 + }, + { + "epoch": 0.314845165185888, + "grad_norm": 0.59375, + "learning_rate": 1.8855435813309196e-05, + "loss": 1.4866, + "step": 1825 + }, + { + "epoch": 0.31501768308462, + "grad_norm": 0.81640625, + "learning_rate": 1.885417101926162e-05, + "loss": 1.5125, + "step": 1826 + }, + { + "epoch": 0.31519020098335204, + "grad_norm": 0.69140625, + "learning_rate": 1.885290556924173e-05, + "loss": 1.4906, + "step": 1827 + }, + { + "epoch": 0.315362718882084, + "grad_norm": 0.63671875, + "learning_rate": 1.885163946334328e-05, + "loss": 1.5114, + "step": 1828 + }, + { + "epoch": 0.315535236780816, + "grad_norm": 0.69921875, + "learning_rate": 1.8850372701660072e-05, + "loss": 1.4366, + "step": 1829 + }, + { + "epoch": 0.315707754679548, + "grad_norm": 0.84375, + "learning_rate": 1.8849105284285954e-05, + "loss": 1.4678, + "step": 1830 + }, + { + "epoch": 0.31588027257828, + "grad_norm": 0.67578125, + "learning_rate": 1.8847837211314822e-05, + "loss": 1.5652, + "step": 1831 + }, + { + "epoch": 0.316052790477012, + "grad_norm": 0.76171875, + "learning_rate": 1.8846568482840628e-05, + "loss": 1.5195, + "step": 1832 + }, + { + "epoch": 0.316225308375744, + "grad_norm": 0.6484375, + "learning_rate": 1.8845299098957366e-05, + "loss": 1.5487, + "step": 1833 + }, + { + "epoch": 0.316397826274476, + "grad_norm": 0.81640625, + "learning_rate": 1.8844029059759076e-05, + "loss": 1.4679, + "step": 1834 + }, + { + "epoch": 0.31657034417320795, + "grad_norm": 0.75, + "learning_rate": 1.8842758365339856e-05, + "loss": 1.4689, + "step": 1835 + }, + { + "epoch": 0.31674286207193997, + "grad_norm": 0.6171875, + "learning_rate": 1.884148701579384e-05, + "loss": 1.5574, + "step": 1836 + }, + { + "epoch": 0.316915379970672, + "grad_norm": 1.375, + "learning_rate": 1.884021501121523e-05, + "loss": 1.5667, + "step": 1837 + }, + { + "epoch": 0.31708789786940395, + "grad_norm": 0.62109375, + "learning_rate": 1.883894235169825e-05, + "loss": 1.4987, + "step": 1838 + }, + { + "epoch": 0.31726041576813596, + "grad_norm": 0.65234375, + "learning_rate": 1.8837669037337188e-05, + "loss": 1.5074, + "step": 1839 + }, + { + "epoch": 0.3174329336668679, + "grad_norm": 0.68359375, + "learning_rate": 1.883639506822639e-05, + "loss": 1.4288, + "step": 1840 + }, + { + "epoch": 0.31760545156559994, + "grad_norm": 0.6328125, + "learning_rate": 1.883512044446023e-05, + "loss": 1.5972, + "step": 1841 + }, + { + "epoch": 0.3177779694643319, + "grad_norm": 0.57421875, + "learning_rate": 1.8833845166133145e-05, + "loss": 1.3981, + "step": 1842 + }, + { + "epoch": 0.3179504873630639, + "grad_norm": 0.65234375, + "learning_rate": 1.883256923333961e-05, + "loss": 1.5039, + "step": 1843 + }, + { + "epoch": 0.31812300526179593, + "grad_norm": 0.734375, + "learning_rate": 1.8831292646174163e-05, + "loss": 1.4579, + "step": 1844 + }, + { + "epoch": 0.3182955231605279, + "grad_norm": 0.60546875, + "learning_rate": 1.8830015404731375e-05, + "loss": 1.5289, + "step": 1845 + }, + { + "epoch": 0.3184680410592599, + "grad_norm": 0.58984375, + "learning_rate": 1.8828737509105873e-05, + "loss": 1.5776, + "step": 1846 + }, + { + "epoch": 0.31864055895799187, + "grad_norm": 0.6015625, + "learning_rate": 1.882745895939233e-05, + "loss": 1.4838, + "step": 1847 + }, + { + "epoch": 0.3188130768567239, + "grad_norm": 0.63671875, + "learning_rate": 1.882617975568547e-05, + "loss": 1.4642, + "step": 1848 + }, + { + "epoch": 0.3189855947554559, + "grad_norm": 0.69921875, + "learning_rate": 1.882489989808007e-05, + "loss": 1.4736, + "step": 1849 + }, + { + "epoch": 0.31915811265418786, + "grad_norm": 1.9609375, + "learning_rate": 1.882361938667094e-05, + "loss": 1.5478, + "step": 1850 + }, + { + "epoch": 0.3193306305529199, + "grad_norm": 0.70703125, + "learning_rate": 1.8822338221552955e-05, + "loss": 1.5369, + "step": 1851 + }, + { + "epoch": 0.31950314845165184, + "grad_norm": 0.72265625, + "learning_rate": 1.882105640282103e-05, + "loss": 1.4478, + "step": 1852 + }, + { + "epoch": 0.31967566635038386, + "grad_norm": 0.671875, + "learning_rate": 1.881977393057013e-05, + "loss": 1.5034, + "step": 1853 + }, + { + "epoch": 0.3198481842491158, + "grad_norm": 0.62890625, + "learning_rate": 1.881849080489527e-05, + "loss": 1.5675, + "step": 1854 + }, + { + "epoch": 0.32002070214784784, + "grad_norm": 0.63671875, + "learning_rate": 1.881720702589151e-05, + "loss": 1.5108, + "step": 1855 + }, + { + "epoch": 0.32019322004657985, + "grad_norm": 0.6328125, + "learning_rate": 1.881592259365396e-05, + "loss": 1.5053, + "step": 1856 + }, + { + "epoch": 0.3203657379453118, + "grad_norm": 0.61328125, + "learning_rate": 1.881463750827778e-05, + "loss": 1.5251, + "step": 1857 + }, + { + "epoch": 0.32053825584404383, + "grad_norm": 0.65234375, + "learning_rate": 1.881335176985818e-05, + "loss": 1.4477, + "step": 1858 + }, + { + "epoch": 0.3207107737427758, + "grad_norm": 0.703125, + "learning_rate": 1.881206537849041e-05, + "loss": 1.5098, + "step": 1859 + }, + { + "epoch": 0.3208832916415078, + "grad_norm": 0.671875, + "learning_rate": 1.8810778334269778e-05, + "loss": 1.4897, + "step": 1860 + }, + { + "epoch": 0.3210558095402398, + "grad_norm": 0.640625, + "learning_rate": 1.880949063729163e-05, + "loss": 1.4835, + "step": 1861 + }, + { + "epoch": 0.3212283274389718, + "grad_norm": 0.609375, + "learning_rate": 1.8808202287651375e-05, + "loss": 1.5093, + "step": 1862 + }, + { + "epoch": 0.3214008453377038, + "grad_norm": 0.84375, + "learning_rate": 1.880691328544446e-05, + "loss": 1.5226, + "step": 1863 + }, + { + "epoch": 0.32157336323643576, + "grad_norm": 0.734375, + "learning_rate": 1.880562363076638e-05, + "loss": 1.498, + "step": 1864 + }, + { + "epoch": 0.3217458811351678, + "grad_norm": 0.82421875, + "learning_rate": 1.880433332371268e-05, + "loss": 1.543, + "step": 1865 + }, + { + "epoch": 0.3219183990338998, + "grad_norm": 0.6796875, + "learning_rate": 1.8803042364378955e-05, + "loss": 1.5232, + "step": 1866 + }, + { + "epoch": 0.32209091693263175, + "grad_norm": 0.640625, + "learning_rate": 1.8801750752860847e-05, + "loss": 1.574, + "step": 1867 + }, + { + "epoch": 0.32226343483136377, + "grad_norm": 0.875, + "learning_rate": 1.880045848925405e-05, + "loss": 1.5205, + "step": 1868 + }, + { + "epoch": 0.32243595273009573, + "grad_norm": 0.93359375, + "learning_rate": 1.87991655736543e-05, + "loss": 1.5055, + "step": 1869 + }, + { + "epoch": 0.32260847062882775, + "grad_norm": 0.6328125, + "learning_rate": 1.8797872006157382e-05, + "loss": 1.5527, + "step": 1870 + }, + { + "epoch": 0.3227809885275597, + "grad_norm": 1.1796875, + "learning_rate": 1.8796577786859136e-05, + "loss": 1.5301, + "step": 1871 + }, + { + "epoch": 0.3229535064262917, + "grad_norm": 0.890625, + "learning_rate": 1.8795282915855445e-05, + "loss": 1.5342, + "step": 1872 + }, + { + "epoch": 0.32312602432502374, + "grad_norm": 0.8515625, + "learning_rate": 1.8793987393242236e-05, + "loss": 1.5328, + "step": 1873 + }, + { + "epoch": 0.3232985422237557, + "grad_norm": 1.1796875, + "learning_rate": 1.8792691219115496e-05, + "loss": 1.4506, + "step": 1874 + }, + { + "epoch": 0.3234710601224877, + "grad_norm": 0.609375, + "learning_rate": 1.8791394393571255e-05, + "loss": 1.5116, + "step": 1875 + }, + { + "epoch": 0.3236435780212197, + "grad_norm": 0.65234375, + "learning_rate": 1.879009691670558e-05, + "loss": 1.5311, + "step": 1876 + }, + { + "epoch": 0.3238160959199517, + "grad_norm": 0.75, + "learning_rate": 1.8788798788614606e-05, + "loss": 1.4899, + "step": 1877 + }, + { + "epoch": 0.3239886138186837, + "grad_norm": 1.015625, + "learning_rate": 1.8787500009394503e-05, + "loss": 1.5621, + "step": 1878 + }, + { + "epoch": 0.3241611317174157, + "grad_norm": 0.66015625, + "learning_rate": 1.8786200579141488e-05, + "loss": 1.5273, + "step": 1879 + }, + { + "epoch": 0.3243336496161477, + "grad_norm": 0.6328125, + "learning_rate": 1.878490049795184e-05, + "loss": 1.4859, + "step": 1880 + }, + { + "epoch": 0.32450616751487965, + "grad_norm": 0.76953125, + "learning_rate": 1.878359976592187e-05, + "loss": 1.4424, + "step": 1881 + }, + { + "epoch": 0.32467868541361167, + "grad_norm": 0.8359375, + "learning_rate": 1.8782298383147946e-05, + "loss": 1.4807, + "step": 1882 + }, + { + "epoch": 0.32485120331234363, + "grad_norm": 0.6640625, + "learning_rate": 1.8780996349726488e-05, + "loss": 1.5346, + "step": 1883 + }, + { + "epoch": 0.32502372121107564, + "grad_norm": 0.72265625, + "learning_rate": 1.8779693665753954e-05, + "loss": 1.5185, + "step": 1884 + }, + { + "epoch": 0.32519623910980766, + "grad_norm": 0.8984375, + "learning_rate": 1.877839033132685e-05, + "loss": 1.5993, + "step": 1885 + }, + { + "epoch": 0.3253687570085396, + "grad_norm": 0.765625, + "learning_rate": 1.8777086346541743e-05, + "loss": 1.5468, + "step": 1886 + }, + { + "epoch": 0.32554127490727164, + "grad_norm": 0.609375, + "learning_rate": 1.8775781711495237e-05, + "loss": 1.5053, + "step": 1887 + }, + { + "epoch": 0.3257137928060036, + "grad_norm": 0.92578125, + "learning_rate": 1.8774476426283993e-05, + "loss": 1.4857, + "step": 1888 + }, + { + "epoch": 0.3258863107047356, + "grad_norm": 0.96875, + "learning_rate": 1.8773170491004704e-05, + "loss": 1.52, + "step": 1889 + }, + { + "epoch": 0.32605882860346763, + "grad_norm": 0.82421875, + "learning_rate": 1.877186390575413e-05, + "loss": 1.5068, + "step": 1890 + }, + { + "epoch": 0.3262313465021996, + "grad_norm": 0.94140625, + "learning_rate": 1.877055667062907e-05, + "loss": 1.4402, + "step": 1891 + }, + { + "epoch": 0.3264038644009316, + "grad_norm": 0.63671875, + "learning_rate": 1.8769248785726367e-05, + "loss": 1.4986, + "step": 1892 + }, + { + "epoch": 0.32657638229966357, + "grad_norm": 1.2734375, + "learning_rate": 1.8767940251142924e-05, + "loss": 1.4641, + "step": 1893 + }, + { + "epoch": 0.3267489001983956, + "grad_norm": 1.015625, + "learning_rate": 1.8766631066975684e-05, + "loss": 1.5435, + "step": 1894 + }, + { + "epoch": 0.3269214180971276, + "grad_norm": 0.8046875, + "learning_rate": 1.8765321233321634e-05, + "loss": 1.5584, + "step": 1895 + }, + { + "epoch": 0.32709393599585956, + "grad_norm": 0.7734375, + "learning_rate": 1.876401075027782e-05, + "loss": 1.5136, + "step": 1896 + }, + { + "epoch": 0.3272664538945916, + "grad_norm": 1.234375, + "learning_rate": 1.8762699617941333e-05, + "loss": 1.5428, + "step": 1897 + }, + { + "epoch": 0.32743897179332354, + "grad_norm": 1.1875, + "learning_rate": 1.87613878364093e-05, + "loss": 1.5206, + "step": 1898 + }, + { + "epoch": 0.32761148969205556, + "grad_norm": 0.75, + "learning_rate": 1.8760075405778918e-05, + "loss": 1.5195, + "step": 1899 + }, + { + "epoch": 0.3277840075907875, + "grad_norm": 0.87890625, + "learning_rate": 1.8758762326147414e-05, + "loss": 1.4737, + "step": 1900 + }, + { + "epoch": 0.3277840075907875, + "eval_loss": 1.4825938940048218, + "eval_runtime": 10.8069, + "eval_samples_per_second": 94.755, + "eval_steps_per_second": 23.689, + "step": 1900 + }, + { + "epoch": 0.32795652548951953, + "grad_norm": 1.09375, + "learning_rate": 1.875744859761207e-05, + "loss": 1.6023, + "step": 1901 + }, + { + "epoch": 0.32812904338825155, + "grad_norm": 0.8828125, + "learning_rate": 1.875613422027021e-05, + "loss": 1.5445, + "step": 1902 + }, + { + "epoch": 0.3283015612869835, + "grad_norm": 0.65234375, + "learning_rate": 1.875481919421922e-05, + "loss": 1.4026, + "step": 1903 + }, + { + "epoch": 0.32847407918571553, + "grad_norm": 1.0859375, + "learning_rate": 1.875350351955652e-05, + "loss": 1.5081, + "step": 1904 + }, + { + "epoch": 0.3286465970844475, + "grad_norm": 1.046875, + "learning_rate": 1.8752187196379585e-05, + "loss": 1.5393, + "step": 1905 + }, + { + "epoch": 0.3288191149831795, + "grad_norm": 0.6328125, + "learning_rate": 1.875087022478594e-05, + "loss": 1.5503, + "step": 1906 + }, + { + "epoch": 0.3289916328819115, + "grad_norm": 0.78125, + "learning_rate": 1.8749552604873146e-05, + "loss": 1.4422, + "step": 1907 + }, + { + "epoch": 0.3291641507806435, + "grad_norm": 0.98046875, + "learning_rate": 1.874823433673883e-05, + "loss": 1.4446, + "step": 1908 + }, + { + "epoch": 0.3293366686793755, + "grad_norm": 0.73828125, + "learning_rate": 1.8746915420480646e-05, + "loss": 1.6119, + "step": 1909 + }, + { + "epoch": 0.32950918657810746, + "grad_norm": 0.83984375, + "learning_rate": 1.8745595856196318e-05, + "loss": 1.4752, + "step": 1910 + }, + { + "epoch": 0.3296817044768395, + "grad_norm": 0.64453125, + "learning_rate": 1.8744275643983606e-05, + "loss": 1.4901, + "step": 1911 + }, + { + "epoch": 0.3298542223755715, + "grad_norm": 0.87890625, + "learning_rate": 1.8742954783940313e-05, + "loss": 1.4867, + "step": 1912 + }, + { + "epoch": 0.33002674027430345, + "grad_norm": 0.96484375, + "learning_rate": 1.8741633276164305e-05, + "loss": 1.446, + "step": 1913 + }, + { + "epoch": 0.33019925817303547, + "grad_norm": 1.359375, + "learning_rate": 1.8740311120753482e-05, + "loss": 1.5655, + "step": 1914 + }, + { + "epoch": 0.33037177607176743, + "grad_norm": 0.78515625, + "learning_rate": 1.87389883178058e-05, + "loss": 1.4538, + "step": 1915 + }, + { + "epoch": 0.33054429397049945, + "grad_norm": 0.859375, + "learning_rate": 1.8737664867419262e-05, + "loss": 1.4357, + "step": 1916 + }, + { + "epoch": 0.3307168118692314, + "grad_norm": 0.7421875, + "learning_rate": 1.8736340769691912e-05, + "loss": 1.5557, + "step": 1917 + }, + { + "epoch": 0.3308893297679634, + "grad_norm": 0.62890625, + "learning_rate": 1.873501602472185e-05, + "loss": 1.5924, + "step": 1918 + }, + { + "epoch": 0.33106184766669544, + "grad_norm": 0.71875, + "learning_rate": 1.8733690632607223e-05, + "loss": 1.5446, + "step": 1919 + }, + { + "epoch": 0.3312343655654274, + "grad_norm": 0.69140625, + "learning_rate": 1.8732364593446223e-05, + "loss": 1.4492, + "step": 1920 + }, + { + "epoch": 0.3314068834641594, + "grad_norm": 0.71875, + "learning_rate": 1.8731037907337094e-05, + "loss": 1.4965, + "step": 1921 + }, + { + "epoch": 0.3315794013628914, + "grad_norm": 0.58203125, + "learning_rate": 1.872971057437812e-05, + "loss": 1.49, + "step": 1922 + }, + { + "epoch": 0.3317519192616234, + "grad_norm": 0.59765625, + "learning_rate": 1.872838259466764e-05, + "loss": 1.4051, + "step": 1923 + }, + { + "epoch": 0.3319244371603554, + "grad_norm": 0.734375, + "learning_rate": 1.8727053968304044e-05, + "loss": 1.5263, + "step": 1924 + }, + { + "epoch": 0.3320969550590874, + "grad_norm": 0.73046875, + "learning_rate": 1.8725724695385757e-05, + "loss": 1.5625, + "step": 1925 + }, + { + "epoch": 0.3322694729578194, + "grad_norm": 0.6640625, + "learning_rate": 1.8724394776011264e-05, + "loss": 1.461, + "step": 1926 + }, + { + "epoch": 0.33244199085655135, + "grad_norm": 0.68359375, + "learning_rate": 1.8723064210279096e-05, + "loss": 1.4699, + "step": 1927 + }, + { + "epoch": 0.33261450875528337, + "grad_norm": 0.66796875, + "learning_rate": 1.8721732998287825e-05, + "loss": 1.4403, + "step": 1928 + }, + { + "epoch": 0.3327870266540153, + "grad_norm": 0.6640625, + "learning_rate": 1.872040114013608e-05, + "loss": 1.5245, + "step": 1929 + }, + { + "epoch": 0.33295954455274734, + "grad_norm": 0.6875, + "learning_rate": 1.871906863592253e-05, + "loss": 1.5167, + "step": 1930 + }, + { + "epoch": 0.33313206245147936, + "grad_norm": 0.66015625, + "learning_rate": 1.8717735485745895e-05, + "loss": 1.5321, + "step": 1931 + }, + { + "epoch": 0.3333045803502113, + "grad_norm": 0.6328125, + "learning_rate": 1.8716401689704945e-05, + "loss": 1.538, + "step": 1932 + }, + { + "epoch": 0.33347709824894334, + "grad_norm": 0.7890625, + "learning_rate": 1.8715067247898493e-05, + "loss": 1.4214, + "step": 1933 + }, + { + "epoch": 0.3336496161476753, + "grad_norm": 0.58984375, + "learning_rate": 1.871373216042541e-05, + "loss": 1.491, + "step": 1934 + }, + { + "epoch": 0.3338221340464073, + "grad_norm": 0.65234375, + "learning_rate": 1.8712396427384595e-05, + "loss": 1.5567, + "step": 1935 + }, + { + "epoch": 0.33399465194513933, + "grad_norm": 0.7421875, + "learning_rate": 1.8711060048875023e-05, + "loss": 1.526, + "step": 1936 + }, + { + "epoch": 0.3341671698438713, + "grad_norm": 0.7578125, + "learning_rate": 1.870972302499569e-05, + "loss": 1.4795, + "step": 1937 + }, + { + "epoch": 0.3343396877426033, + "grad_norm": 0.82421875, + "learning_rate": 1.8708385355845654e-05, + "loss": 1.3779, + "step": 1938 + }, + { + "epoch": 0.33451220564133527, + "grad_norm": 0.671875, + "learning_rate": 1.870704704152402e-05, + "loss": 1.3989, + "step": 1939 + }, + { + "epoch": 0.3346847235400673, + "grad_norm": 0.6875, + "learning_rate": 1.8705708082129935e-05, + "loss": 1.4858, + "step": 1940 + }, + { + "epoch": 0.3348572414387993, + "grad_norm": 0.7265625, + "learning_rate": 1.87043684777626e-05, + "loss": 1.5937, + "step": 1941 + }, + { + "epoch": 0.33502975933753126, + "grad_norm": 0.65234375, + "learning_rate": 1.8703028228521263e-05, + "loss": 1.5191, + "step": 1942 + }, + { + "epoch": 0.3352022772362633, + "grad_norm": 0.6953125, + "learning_rate": 1.8701687334505215e-05, + "loss": 1.4377, + "step": 1943 + }, + { + "epoch": 0.33537479513499524, + "grad_norm": 0.70703125, + "learning_rate": 1.8700345795813794e-05, + "loss": 1.4953, + "step": 1944 + }, + { + "epoch": 0.33554731303372726, + "grad_norm": 0.5859375, + "learning_rate": 1.8699003612546397e-05, + "loss": 1.5811, + "step": 1945 + }, + { + "epoch": 0.3357198309324592, + "grad_norm": 0.64453125, + "learning_rate": 1.8697660784802463e-05, + "loss": 1.577, + "step": 1946 + }, + { + "epoch": 0.33589234883119123, + "grad_norm": 0.6953125, + "learning_rate": 1.8696317312681466e-05, + "loss": 1.3651, + "step": 1947 + }, + { + "epoch": 0.33606486672992325, + "grad_norm": 0.71875, + "learning_rate": 1.869497319628295e-05, + "loss": 1.5111, + "step": 1948 + }, + { + "epoch": 0.3362373846286552, + "grad_norm": 0.61328125, + "learning_rate": 1.8693628435706487e-05, + "loss": 1.4774, + "step": 1949 + }, + { + "epoch": 0.33640990252738723, + "grad_norm": 0.921875, + "learning_rate": 1.8692283031051714e-05, + "loss": 1.5871, + "step": 1950 + }, + { + "epoch": 0.3365824204261192, + "grad_norm": 0.578125, + "learning_rate": 1.86909369824183e-05, + "loss": 1.4237, + "step": 1951 + }, + { + "epoch": 0.3367549383248512, + "grad_norm": 0.67578125, + "learning_rate": 1.868959028990597e-05, + "loss": 1.4145, + "step": 1952 + }, + { + "epoch": 0.3369274562235832, + "grad_norm": 0.6796875, + "learning_rate": 1.8688242953614496e-05, + "loss": 1.4504, + "step": 1953 + }, + { + "epoch": 0.3370999741223152, + "grad_norm": 0.65625, + "learning_rate": 1.8686894973643698e-05, + "loss": 1.5252, + "step": 1954 + }, + { + "epoch": 0.3372724920210472, + "grad_norm": 0.6484375, + "learning_rate": 1.868554635009344e-05, + "loss": 1.4336, + "step": 1955 + }, + { + "epoch": 0.33744500991977916, + "grad_norm": 0.61328125, + "learning_rate": 1.868419708306364e-05, + "loss": 1.4343, + "step": 1956 + }, + { + "epoch": 0.3376175278185112, + "grad_norm": 0.64453125, + "learning_rate": 1.868284717265426e-05, + "loss": 1.4825, + "step": 1957 + }, + { + "epoch": 0.33779004571724314, + "grad_norm": 0.6328125, + "learning_rate": 1.8681496618965308e-05, + "loss": 1.5555, + "step": 1958 + }, + { + "epoch": 0.33796256361597515, + "grad_norm": 0.5703125, + "learning_rate": 1.8680145422096844e-05, + "loss": 1.5049, + "step": 1959 + }, + { + "epoch": 0.33813508151470717, + "grad_norm": 0.57421875, + "learning_rate": 1.867879358214897e-05, + "loss": 1.3421, + "step": 1960 + }, + { + "epoch": 0.33830759941343913, + "grad_norm": 0.625, + "learning_rate": 1.8677441099221836e-05, + "loss": 1.4644, + "step": 1961 + }, + { + "epoch": 0.33848011731217115, + "grad_norm": 0.62109375, + "learning_rate": 1.867608797341565e-05, + "loss": 1.6562, + "step": 1962 + }, + { + "epoch": 0.3386526352109031, + "grad_norm": 0.671875, + "learning_rate": 1.8674734204830655e-05, + "loss": 1.4937, + "step": 1963 + }, + { + "epoch": 0.3388251531096351, + "grad_norm": 0.78515625, + "learning_rate": 1.867337979356715e-05, + "loss": 1.5014, + "step": 1964 + }, + { + "epoch": 0.33899767100836714, + "grad_norm": 0.6484375, + "learning_rate": 1.8672024739725473e-05, + "loss": 1.487, + "step": 1965 + }, + { + "epoch": 0.3391701889070991, + "grad_norm": 0.77734375, + "learning_rate": 1.867066904340602e-05, + "loss": 1.5148, + "step": 1966 + }, + { + "epoch": 0.3393427068058311, + "grad_norm": 0.60546875, + "learning_rate": 1.8669312704709224e-05, + "loss": 1.4718, + "step": 1967 + }, + { + "epoch": 0.3395152247045631, + "grad_norm": 0.68359375, + "learning_rate": 1.8667955723735577e-05, + "loss": 1.5579, + "step": 1968 + }, + { + "epoch": 0.3396877426032951, + "grad_norm": 0.6015625, + "learning_rate": 1.8666598100585613e-05, + "loss": 1.5187, + "step": 1969 + }, + { + "epoch": 0.3398602605020271, + "grad_norm": 0.65234375, + "learning_rate": 1.8665239835359904e-05, + "loss": 1.4936, + "step": 1970 + }, + { + "epoch": 0.3400327784007591, + "grad_norm": 0.578125, + "learning_rate": 1.866388092815909e-05, + "loss": 1.5325, + "step": 1971 + }, + { + "epoch": 0.3402052962994911, + "grad_norm": 0.625, + "learning_rate": 1.8662521379083843e-05, + "loss": 1.439, + "step": 1972 + }, + { + "epoch": 0.34037781419822305, + "grad_norm": 0.640625, + "learning_rate": 1.8661161188234882e-05, + "loss": 1.4927, + "step": 1973 + }, + { + "epoch": 0.34055033209695507, + "grad_norm": 0.61328125, + "learning_rate": 1.8659800355712984e-05, + "loss": 1.556, + "step": 1974 + }, + { + "epoch": 0.340722849995687, + "grad_norm": 0.77734375, + "learning_rate": 1.865843888161897e-05, + "loss": 1.4048, + "step": 1975 + }, + { + "epoch": 0.34089536789441904, + "grad_norm": 0.58203125, + "learning_rate": 1.86570767660537e-05, + "loss": 1.3183, + "step": 1976 + }, + { + "epoch": 0.34106788579315106, + "grad_norm": 0.61328125, + "learning_rate": 1.865571400911809e-05, + "loss": 1.4521, + "step": 1977 + }, + { + "epoch": 0.341240403691883, + "grad_norm": 0.5546875, + "learning_rate": 1.8654350610913106e-05, + "loss": 1.5282, + "step": 1978 + }, + { + "epoch": 0.34141292159061504, + "grad_norm": 0.640625, + "learning_rate": 1.8652986571539754e-05, + "loss": 1.467, + "step": 1979 + }, + { + "epoch": 0.341585439489347, + "grad_norm": 0.6484375, + "learning_rate": 1.865162189109909e-05, + "loss": 1.4525, + "step": 1980 + }, + { + "epoch": 0.341757957388079, + "grad_norm": 0.98828125, + "learning_rate": 1.8650256569692215e-05, + "loss": 1.5296, + "step": 1981 + }, + { + "epoch": 0.34193047528681103, + "grad_norm": 0.71875, + "learning_rate": 1.864889060742029e-05, + "loss": 1.5094, + "step": 1982 + }, + { + "epoch": 0.342102993185543, + "grad_norm": 0.87890625, + "learning_rate": 1.8647524004384503e-05, + "loss": 1.5443, + "step": 1983 + }, + { + "epoch": 0.342275511084275, + "grad_norm": 0.81640625, + "learning_rate": 1.8646156760686108e-05, + "loss": 1.5589, + "step": 1984 + }, + { + "epoch": 0.34244802898300697, + "grad_norm": 0.76171875, + "learning_rate": 1.8644788876426395e-05, + "loss": 1.5559, + "step": 1985 + }, + { + "epoch": 0.342620546881739, + "grad_norm": 0.69921875, + "learning_rate": 1.8643420351706707e-05, + "loss": 1.4188, + "step": 1986 + }, + { + "epoch": 0.34279306478047095, + "grad_norm": 0.6328125, + "learning_rate": 1.8642051186628434e-05, + "loss": 1.499, + "step": 1987 + }, + { + "epoch": 0.34296558267920296, + "grad_norm": 0.63671875, + "learning_rate": 1.864068138129301e-05, + "loss": 1.5922, + "step": 1988 + }, + { + "epoch": 0.343138100577935, + "grad_norm": 0.765625, + "learning_rate": 1.8639310935801922e-05, + "loss": 1.4714, + "step": 1989 + }, + { + "epoch": 0.34331061847666694, + "grad_norm": 0.65625, + "learning_rate": 1.8637939850256697e-05, + "loss": 1.5472, + "step": 1990 + }, + { + "epoch": 0.34348313637539896, + "grad_norm": 0.7890625, + "learning_rate": 1.8636568124758917e-05, + "loss": 1.4931, + "step": 1991 + }, + { + "epoch": 0.3436556542741309, + "grad_norm": 0.71484375, + "learning_rate": 1.8635195759410205e-05, + "loss": 1.4338, + "step": 1992 + }, + { + "epoch": 0.34382817217286293, + "grad_norm": 0.625, + "learning_rate": 1.8633822754312233e-05, + "loss": 1.4708, + "step": 1993 + }, + { + "epoch": 0.34400069007159495, + "grad_norm": 0.6484375, + "learning_rate": 1.863244910956673e-05, + "loss": 1.4769, + "step": 1994 + }, + { + "epoch": 0.3441732079703269, + "grad_norm": 0.60546875, + "learning_rate": 1.8631074825275457e-05, + "loss": 1.4068, + "step": 1995 + }, + { + "epoch": 0.3443457258690589, + "grad_norm": 0.76953125, + "learning_rate": 1.8629699901540232e-05, + "loss": 1.4729, + "step": 1996 + }, + { + "epoch": 0.3445182437677909, + "grad_norm": 0.63671875, + "learning_rate": 1.8628324338462915e-05, + "loss": 1.513, + "step": 1997 + }, + { + "epoch": 0.3446907616665229, + "grad_norm": 0.609375, + "learning_rate": 1.8626948136145422e-05, + "loss": 1.5212, + "step": 1998 + }, + { + "epoch": 0.3448632795652549, + "grad_norm": 0.69140625, + "learning_rate": 1.8625571294689705e-05, + "loss": 1.5238, + "step": 1999 + }, + { + "epoch": 0.3450357974639869, + "grad_norm": 0.70703125, + "learning_rate": 1.8624193814197774e-05, + "loss": 1.542, + "step": 2000 + }, + { + "epoch": 0.3450357974639869, + "eval_loss": 1.4780633449554443, + "eval_runtime": 11.8304, + "eval_samples_per_second": 86.556, + "eval_steps_per_second": 21.639, + "step": 2000 + }, + { + "epoch": 0.3452083153627189, + "grad_norm": 0.6328125, + "learning_rate": 1.8622815694771675e-05, + "loss": 1.5014, + "step": 2001 + }, + { + "epoch": 0.34538083326145086, + "grad_norm": 0.703125, + "learning_rate": 1.8621436936513517e-05, + "loss": 1.5214, + "step": 2002 + }, + { + "epoch": 0.3455533511601829, + "grad_norm": 0.59765625, + "learning_rate": 1.862005753952544e-05, + "loss": 1.4511, + "step": 2003 + }, + { + "epoch": 0.34572586905891484, + "grad_norm": 0.625, + "learning_rate": 1.8618677503909637e-05, + "loss": 1.5055, + "step": 2004 + }, + { + "epoch": 0.34589838695764685, + "grad_norm": 0.69921875, + "learning_rate": 1.8617296829768354e-05, + "loss": 1.4664, + "step": 2005 + }, + { + "epoch": 0.34607090485637887, + "grad_norm": 0.828125, + "learning_rate": 1.8615915517203878e-05, + "loss": 1.5586, + "step": 2006 + }, + { + "epoch": 0.34624342275511083, + "grad_norm": 0.63671875, + "learning_rate": 1.8614533566318544e-05, + "loss": 1.5295, + "step": 2007 + }, + { + "epoch": 0.34641594065384285, + "grad_norm": 0.59765625, + "learning_rate": 1.861315097721474e-05, + "loss": 1.4864, + "step": 2008 + }, + { + "epoch": 0.3465884585525748, + "grad_norm": 0.796875, + "learning_rate": 1.8611767749994894e-05, + "loss": 1.4829, + "step": 2009 + }, + { + "epoch": 0.3467609764513068, + "grad_norm": 0.671875, + "learning_rate": 1.8610383884761486e-05, + "loss": 1.4033, + "step": 2010 + }, + { + "epoch": 0.34693349435003884, + "grad_norm": 0.72265625, + "learning_rate": 1.8608999381617035e-05, + "loss": 1.5237, + "step": 2011 + }, + { + "epoch": 0.3471060122487708, + "grad_norm": 0.62109375, + "learning_rate": 1.860761424066412e-05, + "loss": 1.5261, + "step": 2012 + }, + { + "epoch": 0.3472785301475028, + "grad_norm": 0.65234375, + "learning_rate": 1.860622846200536e-05, + "loss": 1.5967, + "step": 2013 + }, + { + "epoch": 0.3474510480462348, + "grad_norm": 0.7890625, + "learning_rate": 1.860484204574342e-05, + "loss": 1.4249, + "step": 2014 + }, + { + "epoch": 0.3476235659449668, + "grad_norm": 0.59375, + "learning_rate": 1.860345499198102e-05, + "loss": 1.5006, + "step": 2015 + }, + { + "epoch": 0.3477960838436988, + "grad_norm": 0.68359375, + "learning_rate": 1.860206730082091e-05, + "loss": 1.4401, + "step": 2016 + }, + { + "epoch": 0.34796860174243077, + "grad_norm": 0.66015625, + "learning_rate": 1.860067897236591e-05, + "loss": 1.5685, + "step": 2017 + }, + { + "epoch": 0.3481411196411628, + "grad_norm": 0.671875, + "learning_rate": 1.8599290006718878e-05, + "loss": 1.528, + "step": 2018 + }, + { + "epoch": 0.34831363753989475, + "grad_norm": 0.65625, + "learning_rate": 1.8597900403982705e-05, + "loss": 1.3706, + "step": 2019 + }, + { + "epoch": 0.34848615543862677, + "grad_norm": 0.6171875, + "learning_rate": 1.8596510164260347e-05, + "loss": 1.4227, + "step": 2020 + }, + { + "epoch": 0.3486586733373587, + "grad_norm": 0.64453125, + "learning_rate": 1.8595119287654808e-05, + "loss": 1.4319, + "step": 2021 + }, + { + "epoch": 0.34883119123609074, + "grad_norm": 0.61328125, + "learning_rate": 1.8593727774269122e-05, + "loss": 1.5133, + "step": 2022 + }, + { + "epoch": 0.34900370913482276, + "grad_norm": 0.65625, + "learning_rate": 1.859233562420639e-05, + "loss": 1.553, + "step": 2023 + }, + { + "epoch": 0.3491762270335547, + "grad_norm": 0.63671875, + "learning_rate": 1.8590942837569743e-05, + "loss": 1.4991, + "step": 2024 + }, + { + "epoch": 0.34934874493228674, + "grad_norm": 0.66015625, + "learning_rate": 1.8589549414462375e-05, + "loss": 1.5245, + "step": 2025 + }, + { + "epoch": 0.3495212628310187, + "grad_norm": 0.7578125, + "learning_rate": 1.8588155354987517e-05, + "loss": 1.5654, + "step": 2026 + }, + { + "epoch": 0.3496937807297507, + "grad_norm": 0.67578125, + "learning_rate": 1.8586760659248447e-05, + "loss": 1.4404, + "step": 2027 + }, + { + "epoch": 0.34986629862848273, + "grad_norm": 0.6015625, + "learning_rate": 1.8585365327348497e-05, + "loss": 1.502, + "step": 2028 + }, + { + "epoch": 0.3500388165272147, + "grad_norm": 0.7265625, + "learning_rate": 1.858396935939104e-05, + "loss": 1.4986, + "step": 2029 + }, + { + "epoch": 0.3502113344259467, + "grad_norm": 0.703125, + "learning_rate": 1.8582572755479494e-05, + "loss": 1.49, + "step": 2030 + }, + { + "epoch": 0.35038385232467867, + "grad_norm": 1.0703125, + "learning_rate": 1.8581175515717337e-05, + "loss": 1.5376, + "step": 2031 + }, + { + "epoch": 0.3505563702234107, + "grad_norm": 0.7578125, + "learning_rate": 1.8579777640208076e-05, + "loss": 1.6789, + "step": 2032 + }, + { + "epoch": 0.35072888812214265, + "grad_norm": 0.63671875, + "learning_rate": 1.857837912905528e-05, + "loss": 1.5079, + "step": 2033 + }, + { + "epoch": 0.35090140602087466, + "grad_norm": 0.8515625, + "learning_rate": 1.857697998236256e-05, + "loss": 1.4556, + "step": 2034 + }, + { + "epoch": 0.3510739239196067, + "grad_norm": 0.61328125, + "learning_rate": 1.857558020023357e-05, + "loss": 1.4302, + "step": 2035 + }, + { + "epoch": 0.35124644181833864, + "grad_norm": 0.6484375, + "learning_rate": 1.8574179782772012e-05, + "loss": 1.4337, + "step": 2036 + }, + { + "epoch": 0.35141895971707066, + "grad_norm": 0.6953125, + "learning_rate": 1.8572778730081644e-05, + "loss": 1.543, + "step": 2037 + }, + { + "epoch": 0.3515914776158026, + "grad_norm": 0.609375, + "learning_rate": 1.8571377042266267e-05, + "loss": 1.5619, + "step": 2038 + }, + { + "epoch": 0.35176399551453463, + "grad_norm": 0.62890625, + "learning_rate": 1.8569974719429716e-05, + "loss": 1.4384, + "step": 2039 + }, + { + "epoch": 0.35193651341326665, + "grad_norm": 0.6953125, + "learning_rate": 1.8568571761675893e-05, + "loss": 1.5401, + "step": 2040 + }, + { + "epoch": 0.3521090313119986, + "grad_norm": 0.68359375, + "learning_rate": 1.8567168169108735e-05, + "loss": 1.4847, + "step": 2041 + }, + { + "epoch": 0.3522815492107306, + "grad_norm": 0.65234375, + "learning_rate": 1.856576394183223e-05, + "loss": 1.4928, + "step": 2042 + }, + { + "epoch": 0.3524540671094626, + "grad_norm": 0.6640625, + "learning_rate": 1.856435907995041e-05, + "loss": 1.5223, + "step": 2043 + }, + { + "epoch": 0.3526265850081946, + "grad_norm": 0.75390625, + "learning_rate": 1.8562953583567357e-05, + "loss": 1.4796, + "step": 2044 + }, + { + "epoch": 0.3527991029069266, + "grad_norm": 0.66796875, + "learning_rate": 1.85615474527872e-05, + "loss": 1.4604, + "step": 2045 + }, + { + "epoch": 0.3529716208056586, + "grad_norm": 0.703125, + "learning_rate": 1.856014068771411e-05, + "loss": 1.5434, + "step": 2046 + }, + { + "epoch": 0.3531441387043906, + "grad_norm": 0.71875, + "learning_rate": 1.855873328845231e-05, + "loss": 1.6396, + "step": 2047 + }, + { + "epoch": 0.35331665660312256, + "grad_norm": 0.8515625, + "learning_rate": 1.8557325255106074e-05, + "loss": 1.498, + "step": 2048 + }, + { + "epoch": 0.3534891745018546, + "grad_norm": 0.7578125, + "learning_rate": 1.8555916587779713e-05, + "loss": 1.4599, + "step": 2049 + }, + { + "epoch": 0.35366169240058654, + "grad_norm": 0.61328125, + "learning_rate": 1.855450728657759e-05, + "loss": 1.5217, + "step": 2050 + }, + { + "epoch": 0.35383421029931855, + "grad_norm": 0.58203125, + "learning_rate": 1.855309735160412e-05, + "loss": 1.4141, + "step": 2051 + }, + { + "epoch": 0.35400672819805057, + "grad_norm": 0.69140625, + "learning_rate": 1.8551686782963757e-05, + "loss": 1.4475, + "step": 2052 + }, + { + "epoch": 0.35417924609678253, + "grad_norm": 0.68359375, + "learning_rate": 1.8550275580761e-05, + "loss": 1.5216, + "step": 2053 + }, + { + "epoch": 0.35435176399551455, + "grad_norm": 0.62890625, + "learning_rate": 1.8548863745100403e-05, + "loss": 1.5242, + "step": 2054 + }, + { + "epoch": 0.3545242818942465, + "grad_norm": 0.64453125, + "learning_rate": 1.8547451276086565e-05, + "loss": 1.5738, + "step": 2055 + }, + { + "epoch": 0.3546967997929785, + "grad_norm": 0.61328125, + "learning_rate": 1.854603817382413e-05, + "loss": 1.5531, + "step": 2056 + }, + { + "epoch": 0.35486931769171054, + "grad_norm": 0.61328125, + "learning_rate": 1.854462443841779e-05, + "loss": 1.4921, + "step": 2057 + }, + { + "epoch": 0.3550418355904425, + "grad_norm": 0.875, + "learning_rate": 1.854321006997228e-05, + "loss": 1.4861, + "step": 2058 + }, + { + "epoch": 0.3552143534891745, + "grad_norm": 0.6015625, + "learning_rate": 1.8541795068592388e-05, + "loss": 1.5131, + "step": 2059 + }, + { + "epoch": 0.3553868713879065, + "grad_norm": 0.6875, + "learning_rate": 1.8540379434382946e-05, + "loss": 1.4874, + "step": 2060 + }, + { + "epoch": 0.3555593892866385, + "grad_norm": 0.7265625, + "learning_rate": 1.853896316744883e-05, + "loss": 1.4067, + "step": 2061 + }, + { + "epoch": 0.35573190718537046, + "grad_norm": 0.68359375, + "learning_rate": 1.8537546267894975e-05, + "loss": 1.5423, + "step": 2062 + }, + { + "epoch": 0.35590442508410247, + "grad_norm": 0.64453125, + "learning_rate": 1.8536128735826344e-05, + "loss": 1.5336, + "step": 2063 + }, + { + "epoch": 0.3560769429828345, + "grad_norm": 0.73046875, + "learning_rate": 1.853471057134796e-05, + "loss": 1.5081, + "step": 2064 + }, + { + "epoch": 0.35624946088156645, + "grad_norm": 0.6171875, + "learning_rate": 1.8533291774564887e-05, + "loss": 1.4295, + "step": 2065 + }, + { + "epoch": 0.35642197878029847, + "grad_norm": 0.59375, + "learning_rate": 1.8531872345582247e-05, + "loss": 1.5259, + "step": 2066 + }, + { + "epoch": 0.3565944966790304, + "grad_norm": 1.5859375, + "learning_rate": 1.853045228450519e-05, + "loss": 1.4738, + "step": 2067 + }, + { + "epoch": 0.35676701457776244, + "grad_norm": 0.64453125, + "learning_rate": 1.8529031591438926e-05, + "loss": 1.484, + "step": 2068 + }, + { + "epoch": 0.35693953247649446, + "grad_norm": 1.203125, + "learning_rate": 1.8527610266488714e-05, + "loss": 1.4609, + "step": 2069 + }, + { + "epoch": 0.3571120503752264, + "grad_norm": 2.515625, + "learning_rate": 1.8526188309759847e-05, + "loss": 1.5005, + "step": 2070 + }, + { + "epoch": 0.35728456827395844, + "grad_norm": 0.703125, + "learning_rate": 1.8524765721357676e-05, + "loss": 1.495, + "step": 2071 + }, + { + "epoch": 0.3574570861726904, + "grad_norm": 0.64453125, + "learning_rate": 1.8523342501387595e-05, + "loss": 1.48, + "step": 2072 + }, + { + "epoch": 0.3576296040714224, + "grad_norm": 0.73046875, + "learning_rate": 1.8521918649955047e-05, + "loss": 1.3926, + "step": 2073 + }, + { + "epoch": 0.35780212197015443, + "grad_norm": 0.6640625, + "learning_rate": 1.8520494167165517e-05, + "loss": 1.5589, + "step": 2074 + }, + { + "epoch": 0.3579746398688864, + "grad_norm": 0.609375, + "learning_rate": 1.851906905312454e-05, + "loss": 1.51, + "step": 2075 + }, + { + "epoch": 0.3581471577676184, + "grad_norm": 1.359375, + "learning_rate": 1.85176433079377e-05, + "loss": 1.6039, + "step": 2076 + }, + { + "epoch": 0.35831967566635037, + "grad_norm": 0.77734375, + "learning_rate": 1.8516216931710622e-05, + "loss": 1.4755, + "step": 2077 + }, + { + "epoch": 0.3584921935650824, + "grad_norm": 0.640625, + "learning_rate": 1.8514789924548982e-05, + "loss": 1.46, + "step": 2078 + }, + { + "epoch": 0.35866471146381435, + "grad_norm": 0.7109375, + "learning_rate": 1.85133622865585e-05, + "loss": 1.4687, + "step": 2079 + }, + { + "epoch": 0.35883722936254636, + "grad_norm": 0.8203125, + "learning_rate": 1.851193401784495e-05, + "loss": 1.4535, + "step": 2080 + }, + { + "epoch": 0.3590097472612784, + "grad_norm": 0.6171875, + "learning_rate": 1.8510505118514138e-05, + "loss": 1.5059, + "step": 2081 + }, + { + "epoch": 0.35918226516001034, + "grad_norm": 0.69921875, + "learning_rate": 1.8509075588671934e-05, + "loss": 1.5184, + "step": 2082 + }, + { + "epoch": 0.35935478305874236, + "grad_norm": 0.76171875, + "learning_rate": 1.850764542842424e-05, + "loss": 1.5078, + "step": 2083 + }, + { + "epoch": 0.3595273009574743, + "grad_norm": 0.76953125, + "learning_rate": 1.8506214637877017e-05, + "loss": 1.4467, + "step": 2084 + }, + { + "epoch": 0.35969981885620633, + "grad_norm": 0.9453125, + "learning_rate": 1.8504783217136265e-05, + "loss": 1.5681, + "step": 2085 + }, + { + "epoch": 0.35987233675493835, + "grad_norm": 0.6015625, + "learning_rate": 1.8503351166308027e-05, + "loss": 1.5403, + "step": 2086 + }, + { + "epoch": 0.3600448546536703, + "grad_norm": 0.73046875, + "learning_rate": 1.850191848549841e-05, + "loss": 1.5065, + "step": 2087 + }, + { + "epoch": 0.3602173725524023, + "grad_norm": 0.8125, + "learning_rate": 1.8500485174813545e-05, + "loss": 1.5095, + "step": 2088 + }, + { + "epoch": 0.3603898904511343, + "grad_norm": 0.640625, + "learning_rate": 1.8499051234359627e-05, + "loss": 1.4728, + "step": 2089 + }, + { + "epoch": 0.3605624083498663, + "grad_norm": 0.70703125, + "learning_rate": 1.849761666424289e-05, + "loss": 1.4538, + "step": 2090 + }, + { + "epoch": 0.36073492624859826, + "grad_norm": 0.62109375, + "learning_rate": 1.8496181464569608e-05, + "loss": 1.5757, + "step": 2091 + }, + { + "epoch": 0.3609074441473303, + "grad_norm": 0.71484375, + "learning_rate": 1.8494745635446124e-05, + "loss": 1.4783, + "step": 2092 + }, + { + "epoch": 0.3610799620460623, + "grad_norm": 0.625, + "learning_rate": 1.8493309176978802e-05, + "loss": 1.4694, + "step": 2093 + }, + { + "epoch": 0.36125247994479426, + "grad_norm": 0.609375, + "learning_rate": 1.849187208927407e-05, + "loss": 1.4574, + "step": 2094 + }, + { + "epoch": 0.3614249978435263, + "grad_norm": 0.6796875, + "learning_rate": 1.849043437243839e-05, + "loss": 1.5345, + "step": 2095 + }, + { + "epoch": 0.36159751574225824, + "grad_norm": 0.58984375, + "learning_rate": 1.8488996026578286e-05, + "loss": 1.4606, + "step": 2096 + }, + { + "epoch": 0.36177003364099025, + "grad_norm": 0.66796875, + "learning_rate": 1.848755705180031e-05, + "loss": 1.4965, + "step": 2097 + }, + { + "epoch": 0.36194255153972227, + "grad_norm": 0.65625, + "learning_rate": 1.8486117448211078e-05, + "loss": 1.5199, + "step": 2098 + }, + { + "epoch": 0.36211506943845423, + "grad_norm": 0.703125, + "learning_rate": 1.8484677215917243e-05, + "loss": 1.4694, + "step": 2099 + }, + { + "epoch": 0.36228758733718625, + "grad_norm": 0.65234375, + "learning_rate": 1.84832363550255e-05, + "loss": 1.6452, + "step": 2100 + }, + { + "epoch": 0.36228758733718625, + "eval_loss": 1.473473072052002, + "eval_runtime": 10.9071, + "eval_samples_per_second": 93.884, + "eval_steps_per_second": 23.471, + "step": 2100 + }, + { + "epoch": 0.3624601052359182, + "grad_norm": 0.7109375, + "learning_rate": 1.8481794865642607e-05, + "loss": 1.6222, + "step": 2101 + }, + { + "epoch": 0.3626326231346502, + "grad_norm": 0.859375, + "learning_rate": 1.848035274787535e-05, + "loss": 1.4544, + "step": 2102 + }, + { + "epoch": 0.36280514103338224, + "grad_norm": 0.6328125, + "learning_rate": 1.8478910001830578e-05, + "loss": 1.4132, + "step": 2103 + }, + { + "epoch": 0.3629776589321142, + "grad_norm": 0.796875, + "learning_rate": 1.8477466627615172e-05, + "loss": 1.468, + "step": 2104 + }, + { + "epoch": 0.3631501768308462, + "grad_norm": 0.59765625, + "learning_rate": 1.8476022625336067e-05, + "loss": 1.3662, + "step": 2105 + }, + { + "epoch": 0.3633226947295782, + "grad_norm": 0.61328125, + "learning_rate": 1.8474577995100247e-05, + "loss": 1.5097, + "step": 2106 + }, + { + "epoch": 0.3634952126283102, + "grad_norm": 0.62890625, + "learning_rate": 1.847313273701474e-05, + "loss": 1.3676, + "step": 2107 + }, + { + "epoch": 0.36366773052704215, + "grad_norm": 0.73828125, + "learning_rate": 1.847168685118661e-05, + "loss": 1.5271, + "step": 2108 + }, + { + "epoch": 0.36384024842577417, + "grad_norm": 0.6484375, + "learning_rate": 1.847024033772299e-05, + "loss": 1.4227, + "step": 2109 + }, + { + "epoch": 0.3640127663245062, + "grad_norm": 0.68359375, + "learning_rate": 1.846879319673104e-05, + "loss": 1.3847, + "step": 2110 + }, + { + "epoch": 0.36418528422323815, + "grad_norm": 0.640625, + "learning_rate": 1.8467345428317976e-05, + "loss": 1.4954, + "step": 2111 + }, + { + "epoch": 0.36435780212197016, + "grad_norm": 0.67578125, + "learning_rate": 1.8465897032591057e-05, + "loss": 1.5108, + "step": 2112 + }, + { + "epoch": 0.3645303200207021, + "grad_norm": 0.66796875, + "learning_rate": 1.8464448009657582e-05, + "loss": 1.4386, + "step": 2113 + }, + { + "epoch": 0.36470283791943414, + "grad_norm": 0.74609375, + "learning_rate": 1.8462998359624914e-05, + "loss": 1.4996, + "step": 2114 + }, + { + "epoch": 0.36487535581816616, + "grad_norm": 0.73046875, + "learning_rate": 1.846154808260045e-05, + "loss": 1.5164, + "step": 2115 + }, + { + "epoch": 0.3650478737168981, + "grad_norm": 0.6953125, + "learning_rate": 1.8460097178691634e-05, + "loss": 1.5406, + "step": 2116 + }, + { + "epoch": 0.36522039161563014, + "grad_norm": 0.640625, + "learning_rate": 1.8458645648005957e-05, + "loss": 1.4248, + "step": 2117 + }, + { + "epoch": 0.3653929095143621, + "grad_norm": 0.77734375, + "learning_rate": 1.8457193490650957e-05, + "loss": 1.4584, + "step": 2118 + }, + { + "epoch": 0.3655654274130941, + "grad_norm": 0.66015625, + "learning_rate": 1.845574070673422e-05, + "loss": 1.4591, + "step": 2119 + }, + { + "epoch": 0.36573794531182613, + "grad_norm": 0.875, + "learning_rate": 1.8454287296363382e-05, + "loss": 1.3891, + "step": 2120 + }, + { + "epoch": 0.3659104632105581, + "grad_norm": 0.671875, + "learning_rate": 1.8452833259646113e-05, + "loss": 1.4364, + "step": 2121 + }, + { + "epoch": 0.3660829811092901, + "grad_norm": 0.72265625, + "learning_rate": 1.8451378596690138e-05, + "loss": 1.3792, + "step": 2122 + }, + { + "epoch": 0.36625549900802207, + "grad_norm": 1.046875, + "learning_rate": 1.8449923307603234e-05, + "loss": 1.6282, + "step": 2123 + }, + { + "epoch": 0.3664280169067541, + "grad_norm": 0.671875, + "learning_rate": 1.844846739249321e-05, + "loss": 1.5017, + "step": 2124 + }, + { + "epoch": 0.36660053480548604, + "grad_norm": 0.7109375, + "learning_rate": 1.8447010851467936e-05, + "loss": 1.52, + "step": 2125 + }, + { + "epoch": 0.36677305270421806, + "grad_norm": 1.0546875, + "learning_rate": 1.844555368463532e-05, + "loss": 1.5371, + "step": 2126 + }, + { + "epoch": 0.3669455706029501, + "grad_norm": 0.6015625, + "learning_rate": 1.8444095892103314e-05, + "loss": 1.4878, + "step": 2127 + }, + { + "epoch": 0.36711808850168204, + "grad_norm": 0.73046875, + "learning_rate": 1.844263747397992e-05, + "loss": 1.4632, + "step": 2128 + }, + { + "epoch": 0.36729060640041405, + "grad_norm": 0.77734375, + "learning_rate": 1.8441178430373197e-05, + "loss": 1.4249, + "step": 2129 + }, + { + "epoch": 0.367463124299146, + "grad_norm": 0.67578125, + "learning_rate": 1.843971876139123e-05, + "loss": 1.4135, + "step": 2130 + }, + { + "epoch": 0.36763564219787803, + "grad_norm": 0.66015625, + "learning_rate": 1.843825846714216e-05, + "loss": 1.3984, + "step": 2131 + }, + { + "epoch": 0.36780816009661005, + "grad_norm": 0.890625, + "learning_rate": 1.8436797547734185e-05, + "loss": 1.4832, + "step": 2132 + }, + { + "epoch": 0.367980677995342, + "grad_norm": 0.68359375, + "learning_rate": 1.8435336003275525e-05, + "loss": 1.476, + "step": 2133 + }, + { + "epoch": 0.368153195894074, + "grad_norm": 0.59765625, + "learning_rate": 1.8433873833874473e-05, + "loss": 1.4526, + "step": 2134 + }, + { + "epoch": 0.368325713792806, + "grad_norm": 0.78125, + "learning_rate": 1.843241103963935e-05, + "loss": 1.4606, + "step": 2135 + }, + { + "epoch": 0.368498231691538, + "grad_norm": 0.9140625, + "learning_rate": 1.8430947620678522e-05, + "loss": 1.463, + "step": 2136 + }, + { + "epoch": 0.36867074959026996, + "grad_norm": 0.68359375, + "learning_rate": 1.8429483577100424e-05, + "loss": 1.5067, + "step": 2137 + }, + { + "epoch": 0.368843267489002, + "grad_norm": 0.91796875, + "learning_rate": 1.842801890901351e-05, + "loss": 1.4845, + "step": 2138 + }, + { + "epoch": 0.369015785387734, + "grad_norm": 0.9375, + "learning_rate": 1.842655361652629e-05, + "loss": 1.5341, + "step": 2139 + }, + { + "epoch": 0.36918830328646596, + "grad_norm": 0.69921875, + "learning_rate": 1.842508769974733e-05, + "loss": 1.5092, + "step": 2140 + }, + { + "epoch": 0.369360821185198, + "grad_norm": 1.03125, + "learning_rate": 1.8423621158785232e-05, + "loss": 1.4552, + "step": 2141 + }, + { + "epoch": 0.36953333908392993, + "grad_norm": 0.98046875, + "learning_rate": 1.8422153993748645e-05, + "loss": 1.5087, + "step": 2142 + }, + { + "epoch": 0.36970585698266195, + "grad_norm": 0.6328125, + "learning_rate": 1.8420686204746264e-05, + "loss": 1.6148, + "step": 2143 + }, + { + "epoch": 0.36987837488139397, + "grad_norm": 0.703125, + "learning_rate": 1.8419217791886838e-05, + "loss": 1.3866, + "step": 2144 + }, + { + "epoch": 0.37005089278012593, + "grad_norm": 0.8515625, + "learning_rate": 1.8417748755279146e-05, + "loss": 1.5025, + "step": 2145 + }, + { + "epoch": 0.37022341067885794, + "grad_norm": 0.63671875, + "learning_rate": 1.8416279095032036e-05, + "loss": 1.4978, + "step": 2146 + }, + { + "epoch": 0.3703959285775899, + "grad_norm": 0.703125, + "learning_rate": 1.8414808811254378e-05, + "loss": 1.4778, + "step": 2147 + }, + { + "epoch": 0.3705684464763219, + "grad_norm": 0.81640625, + "learning_rate": 1.841333790405511e-05, + "loss": 1.4975, + "step": 2148 + }, + { + "epoch": 0.37074096437505394, + "grad_norm": 0.90234375, + "learning_rate": 1.8411866373543196e-05, + "loss": 1.5895, + "step": 2149 + }, + { + "epoch": 0.3709134822737859, + "grad_norm": 0.6015625, + "learning_rate": 1.8410394219827665e-05, + "loss": 1.4017, + "step": 2150 + }, + { + "epoch": 0.3710860001725179, + "grad_norm": 0.7890625, + "learning_rate": 1.840892144301758e-05, + "loss": 1.5247, + "step": 2151 + }, + { + "epoch": 0.3712585180712499, + "grad_norm": 0.64453125, + "learning_rate": 1.8407448043222052e-05, + "loss": 1.5868, + "step": 2152 + }, + { + "epoch": 0.3714310359699819, + "grad_norm": 0.796875, + "learning_rate": 1.8405974020550238e-05, + "loss": 1.4678, + "step": 2153 + }, + { + "epoch": 0.37160355386871385, + "grad_norm": 0.63671875, + "learning_rate": 1.840449937511135e-05, + "loss": 1.4668, + "step": 2154 + }, + { + "epoch": 0.37177607176744587, + "grad_norm": 0.60546875, + "learning_rate": 1.8403024107014633e-05, + "loss": 1.4756, + "step": 2155 + }, + { + "epoch": 0.3719485896661779, + "grad_norm": 0.60546875, + "learning_rate": 1.8401548216369387e-05, + "loss": 1.4489, + "step": 2156 + }, + { + "epoch": 0.37212110756490985, + "grad_norm": 0.9296875, + "learning_rate": 1.8400071703284952e-05, + "loss": 1.5294, + "step": 2157 + }, + { + "epoch": 0.37229362546364186, + "grad_norm": 0.58203125, + "learning_rate": 1.839859456787072e-05, + "loss": 1.4748, + "step": 2158 + }, + { + "epoch": 0.3724661433623738, + "grad_norm": 0.63671875, + "learning_rate": 1.8397116810236123e-05, + "loss": 1.5687, + "step": 2159 + }, + { + "epoch": 0.37263866126110584, + "grad_norm": 0.60546875, + "learning_rate": 1.839563843049065e-05, + "loss": 1.4819, + "step": 2160 + }, + { + "epoch": 0.37281117915983786, + "grad_norm": 0.67578125, + "learning_rate": 1.8394159428743818e-05, + "loss": 1.507, + "step": 2161 + }, + { + "epoch": 0.3729836970585698, + "grad_norm": 0.69140625, + "learning_rate": 1.839267980510521e-05, + "loss": 1.4811, + "step": 2162 + }, + { + "epoch": 0.37315621495730183, + "grad_norm": 0.58984375, + "learning_rate": 1.839119955968444e-05, + "loss": 1.4537, + "step": 2163 + }, + { + "epoch": 0.3733287328560338, + "grad_norm": 0.7734375, + "learning_rate": 1.8389718692591177e-05, + "loss": 1.5256, + "step": 2164 + }, + { + "epoch": 0.3735012507547658, + "grad_norm": 0.59765625, + "learning_rate": 1.8388237203935134e-05, + "loss": 1.3834, + "step": 2165 + }, + { + "epoch": 0.3736737686534978, + "grad_norm": 0.640625, + "learning_rate": 1.838675509382606e-05, + "loss": 1.5364, + "step": 2166 + }, + { + "epoch": 0.3738462865522298, + "grad_norm": 0.65234375, + "learning_rate": 1.8385272362373775e-05, + "loss": 1.5554, + "step": 2167 + }, + { + "epoch": 0.3740188044509618, + "grad_norm": 0.5625, + "learning_rate": 1.8383789009688117e-05, + "loss": 1.4733, + "step": 2168 + }, + { + "epoch": 0.37419132234969377, + "grad_norm": 0.6953125, + "learning_rate": 1.8382305035878983e-05, + "loss": 1.4997, + "step": 2169 + }, + { + "epoch": 0.3743638402484258, + "grad_norm": 0.84375, + "learning_rate": 1.8380820441056317e-05, + "loss": 1.4882, + "step": 2170 + }, + { + "epoch": 0.37453635814715774, + "grad_norm": 0.66015625, + "learning_rate": 1.8379335225330108e-05, + "loss": 1.5061, + "step": 2171 + }, + { + "epoch": 0.37470887604588976, + "grad_norm": 0.6328125, + "learning_rate": 1.8377849388810386e-05, + "loss": 1.5237, + "step": 2172 + }, + { + "epoch": 0.3748813939446218, + "grad_norm": 0.76953125, + "learning_rate": 1.8376362931607237e-05, + "loss": 1.485, + "step": 2173 + }, + { + "epoch": 0.37505391184335374, + "grad_norm": 0.7734375, + "learning_rate": 1.8374875853830784e-05, + "loss": 1.437, + "step": 2174 + }, + { + "epoch": 0.37522642974208575, + "grad_norm": 0.921875, + "learning_rate": 1.8373388155591197e-05, + "loss": 1.4549, + "step": 2175 + }, + { + "epoch": 0.3753989476408177, + "grad_norm": 0.68359375, + "learning_rate": 1.8371899836998697e-05, + "loss": 1.464, + "step": 2176 + }, + { + "epoch": 0.37557146553954973, + "grad_norm": 0.84375, + "learning_rate": 1.8370410898163553e-05, + "loss": 1.508, + "step": 2177 + }, + { + "epoch": 0.37574398343828175, + "grad_norm": 0.62890625, + "learning_rate": 1.836892133919606e-05, + "loss": 1.5496, + "step": 2178 + }, + { + "epoch": 0.3759165013370137, + "grad_norm": 0.63671875, + "learning_rate": 1.8367431160206586e-05, + "loss": 1.4058, + "step": 2179 + }, + { + "epoch": 0.3760890192357457, + "grad_norm": 0.60546875, + "learning_rate": 1.8365940361305528e-05, + "loss": 1.4875, + "step": 2180 + }, + { + "epoch": 0.3762615371344777, + "grad_norm": 0.6640625, + "learning_rate": 1.836444894260334e-05, + "loss": 1.5064, + "step": 2181 + }, + { + "epoch": 0.3764340550332097, + "grad_norm": 0.6015625, + "learning_rate": 1.8362956904210507e-05, + "loss": 1.4744, + "step": 2182 + }, + { + "epoch": 0.37660657293194166, + "grad_norm": 0.65625, + "learning_rate": 1.8361464246237575e-05, + "loss": 1.483, + "step": 2183 + }, + { + "epoch": 0.3767790908306737, + "grad_norm": 0.6484375, + "learning_rate": 1.835997096879512e-05, + "loss": 1.4102, + "step": 2184 + }, + { + "epoch": 0.3769516087294057, + "grad_norm": 0.71484375, + "learning_rate": 1.8358477071993787e-05, + "loss": 1.5903, + "step": 2185 + }, + { + "epoch": 0.37712412662813766, + "grad_norm": 0.765625, + "learning_rate": 1.8356982555944245e-05, + "loss": 1.4794, + "step": 2186 + }, + { + "epoch": 0.3772966445268697, + "grad_norm": 0.68359375, + "learning_rate": 1.8355487420757218e-05, + "loss": 1.4062, + "step": 2187 + }, + { + "epoch": 0.37746916242560163, + "grad_norm": 0.703125, + "learning_rate": 1.8353991666543477e-05, + "loss": 1.4301, + "step": 2188 + }, + { + "epoch": 0.37764168032433365, + "grad_norm": 0.74609375, + "learning_rate": 1.8352495293413833e-05, + "loss": 1.4539, + "step": 2189 + }, + { + "epoch": 0.37781419822306567, + "grad_norm": 0.7578125, + "learning_rate": 1.8350998301479147e-05, + "loss": 1.5107, + "step": 2190 + }, + { + "epoch": 0.37798671612179763, + "grad_norm": 0.7890625, + "learning_rate": 1.834950069085033e-05, + "loss": 1.4468, + "step": 2191 + }, + { + "epoch": 0.37815923402052964, + "grad_norm": 0.58984375, + "learning_rate": 1.8348002461638333e-05, + "loss": 1.4679, + "step": 2192 + }, + { + "epoch": 0.3783317519192616, + "grad_norm": 0.59765625, + "learning_rate": 1.834650361395415e-05, + "loss": 1.4242, + "step": 2193 + }, + { + "epoch": 0.3785042698179936, + "grad_norm": 0.66796875, + "learning_rate": 1.8345004147908828e-05, + "loss": 1.4978, + "step": 2194 + }, + { + "epoch": 0.37867678771672564, + "grad_norm": 0.734375, + "learning_rate": 1.834350406361346e-05, + "loss": 1.4935, + "step": 2195 + }, + { + "epoch": 0.3788493056154576, + "grad_norm": 0.765625, + "learning_rate": 1.834200336117918e-05, + "loss": 1.5364, + "step": 2196 + }, + { + "epoch": 0.3790218235141896, + "grad_norm": 0.60546875, + "learning_rate": 1.8340502040717162e-05, + "loss": 1.5463, + "step": 2197 + }, + { + "epoch": 0.3791943414129216, + "grad_norm": 0.63671875, + "learning_rate": 1.833900010233864e-05, + "loss": 1.4713, + "step": 2198 + }, + { + "epoch": 0.3793668593116536, + "grad_norm": 0.5859375, + "learning_rate": 1.8337497546154888e-05, + "loss": 1.548, + "step": 2199 + }, + { + "epoch": 0.37953937721038555, + "grad_norm": 0.7890625, + "learning_rate": 1.833599437227722e-05, + "loss": 1.4916, + "step": 2200 + }, + { + "epoch": 0.37953937721038555, + "eval_loss": 1.4695838689804077, + "eval_runtime": 10.8414, + "eval_samples_per_second": 94.453, + "eval_steps_per_second": 23.613, + "step": 2200 + }, + { + "epoch": 0.37971189510911757, + "grad_norm": 0.67578125, + "learning_rate": 1.8334490580817005e-05, + "loss": 1.5291, + "step": 2201 + }, + { + "epoch": 0.3798844130078496, + "grad_norm": 0.74609375, + "learning_rate": 1.8332986171885652e-05, + "loss": 1.5046, + "step": 2202 + }, + { + "epoch": 0.38005693090658155, + "grad_norm": 0.6484375, + "learning_rate": 1.8331481145594617e-05, + "loss": 1.4797, + "step": 2203 + }, + { + "epoch": 0.38022944880531356, + "grad_norm": 0.7109375, + "learning_rate": 1.83299755020554e-05, + "loss": 1.5314, + "step": 2204 + }, + { + "epoch": 0.3804019667040455, + "grad_norm": 0.83984375, + "learning_rate": 1.8328469241379546e-05, + "loss": 1.5771, + "step": 2205 + }, + { + "epoch": 0.38057448460277754, + "grad_norm": 0.6875, + "learning_rate": 1.8326962363678656e-05, + "loss": 1.474, + "step": 2206 + }, + { + "epoch": 0.38074700250150956, + "grad_norm": 0.87890625, + "learning_rate": 1.8325454869064366e-05, + "loss": 1.4271, + "step": 2207 + }, + { + "epoch": 0.3809195204002415, + "grad_norm": 0.703125, + "learning_rate": 1.8323946757648357e-05, + "loss": 1.5268, + "step": 2208 + }, + { + "epoch": 0.38109203829897353, + "grad_norm": 1.0234375, + "learning_rate": 1.8322438029542364e-05, + "loss": 1.5208, + "step": 2209 + }, + { + "epoch": 0.3812645561977055, + "grad_norm": 0.9296875, + "learning_rate": 1.8320928684858162e-05, + "loss": 1.4641, + "step": 2210 + }, + { + "epoch": 0.3814370740964375, + "grad_norm": 0.77734375, + "learning_rate": 1.8319418723707568e-05, + "loss": 1.4852, + "step": 2211 + }, + { + "epoch": 0.3816095919951695, + "grad_norm": 1.03125, + "learning_rate": 1.8317908146202455e-05, + "loss": 1.4708, + "step": 2212 + }, + { + "epoch": 0.3817821098939015, + "grad_norm": 0.890625, + "learning_rate": 1.831639695245473e-05, + "loss": 1.5509, + "step": 2213 + }, + { + "epoch": 0.3819546277926335, + "grad_norm": 0.6484375, + "learning_rate": 1.831488514257636e-05, + "loss": 1.5525, + "step": 2214 + }, + { + "epoch": 0.38212714569136547, + "grad_norm": 0.828125, + "learning_rate": 1.8313372716679344e-05, + "loss": 1.4788, + "step": 2215 + }, + { + "epoch": 0.3822996635900975, + "grad_norm": 0.86328125, + "learning_rate": 1.8311859674875728e-05, + "loss": 1.5408, + "step": 2216 + }, + { + "epoch": 0.38247218148882944, + "grad_norm": 0.8828125, + "learning_rate": 1.8310346017277618e-05, + "loss": 1.4896, + "step": 2217 + }, + { + "epoch": 0.38264469938756146, + "grad_norm": 0.78515625, + "learning_rate": 1.8308831743997147e-05, + "loss": 1.4985, + "step": 2218 + }, + { + "epoch": 0.3828172172862935, + "grad_norm": 0.7734375, + "learning_rate": 1.8307316855146507e-05, + "loss": 1.5242, + "step": 2219 + }, + { + "epoch": 0.38298973518502544, + "grad_norm": 0.7578125, + "learning_rate": 1.8305801350837926e-05, + "loss": 1.5283, + "step": 2220 + }, + { + "epoch": 0.38316225308375745, + "grad_norm": 0.63671875, + "learning_rate": 1.8304285231183683e-05, + "loss": 1.4785, + "step": 2221 + }, + { + "epoch": 0.3833347709824894, + "grad_norm": 0.77734375, + "learning_rate": 1.8302768496296105e-05, + "loss": 1.4535, + "step": 2222 + }, + { + "epoch": 0.38350728888122143, + "grad_norm": 0.9296875, + "learning_rate": 1.8301251146287557e-05, + "loss": 1.5102, + "step": 2223 + }, + { + "epoch": 0.38367980677995345, + "grad_norm": 0.62109375, + "learning_rate": 1.8299733181270455e-05, + "loss": 1.4878, + "step": 2224 + }, + { + "epoch": 0.3838523246786854, + "grad_norm": 0.58984375, + "learning_rate": 1.829821460135726e-05, + "loss": 1.5581, + "step": 2225 + }, + { + "epoch": 0.3840248425774174, + "grad_norm": 0.7265625, + "learning_rate": 1.8296695406660477e-05, + "loss": 1.4887, + "step": 2226 + }, + { + "epoch": 0.3841973604761494, + "grad_norm": 0.6953125, + "learning_rate": 1.829517559729266e-05, + "loss": 1.3807, + "step": 2227 + }, + { + "epoch": 0.3843698783748814, + "grad_norm": 0.77734375, + "learning_rate": 1.8293655173366405e-05, + "loss": 1.355, + "step": 2228 + }, + { + "epoch": 0.38454239627361336, + "grad_norm": 0.61328125, + "learning_rate": 1.829213413499435e-05, + "loss": 1.516, + "step": 2229 + }, + { + "epoch": 0.3847149141723454, + "grad_norm": 0.65625, + "learning_rate": 1.829061248228919e-05, + "loss": 1.4854, + "step": 2230 + }, + { + "epoch": 0.3848874320710774, + "grad_norm": 0.640625, + "learning_rate": 1.8289090215363653e-05, + "loss": 1.4833, + "step": 2231 + }, + { + "epoch": 0.38505994996980936, + "grad_norm": 0.72265625, + "learning_rate": 1.8287567334330522e-05, + "loss": 1.6318, + "step": 2232 + }, + { + "epoch": 0.3852324678685414, + "grad_norm": 0.8046875, + "learning_rate": 1.8286043839302618e-05, + "loss": 1.5281, + "step": 2233 + }, + { + "epoch": 0.38540498576727333, + "grad_norm": 0.64453125, + "learning_rate": 1.8284519730392813e-05, + "loss": 1.5238, + "step": 2234 + }, + { + "epoch": 0.38557750366600535, + "grad_norm": 0.75390625, + "learning_rate": 1.828299500771402e-05, + "loss": 1.4661, + "step": 2235 + }, + { + "epoch": 0.38575002156473737, + "grad_norm": 0.66796875, + "learning_rate": 1.8281469671379208e-05, + "loss": 1.4864, + "step": 2236 + }, + { + "epoch": 0.3859225394634693, + "grad_norm": 0.73828125, + "learning_rate": 1.8279943721501376e-05, + "loss": 1.5738, + "step": 2237 + }, + { + "epoch": 0.38609505736220134, + "grad_norm": 0.71875, + "learning_rate": 1.8278417158193575e-05, + "loss": 1.5128, + "step": 2238 + }, + { + "epoch": 0.3862675752609333, + "grad_norm": 0.73046875, + "learning_rate": 1.827688998156891e-05, + "loss": 1.4928, + "step": 2239 + }, + { + "epoch": 0.3864400931596653, + "grad_norm": 0.59375, + "learning_rate": 1.8275362191740514e-05, + "loss": 1.4992, + "step": 2240 + }, + { + "epoch": 0.3866126110583973, + "grad_norm": 1.0390625, + "learning_rate": 1.827383378882158e-05, + "loss": 1.5278, + "step": 2241 + }, + { + "epoch": 0.3867851289571293, + "grad_norm": 0.59375, + "learning_rate": 1.8272304772925342e-05, + "loss": 1.494, + "step": 2242 + }, + { + "epoch": 0.3869576468558613, + "grad_norm": 0.7578125, + "learning_rate": 1.8270775144165082e-05, + "loss": 1.3458, + "step": 2243 + }, + { + "epoch": 0.3871301647545933, + "grad_norm": 0.6875, + "learning_rate": 1.8269244902654116e-05, + "loss": 1.4868, + "step": 2244 + }, + { + "epoch": 0.3873026826533253, + "grad_norm": 0.6171875, + "learning_rate": 1.8267714048505822e-05, + "loss": 1.5205, + "step": 2245 + }, + { + "epoch": 0.38747520055205725, + "grad_norm": 0.73828125, + "learning_rate": 1.8266182581833615e-05, + "loss": 1.4998, + "step": 2246 + }, + { + "epoch": 0.38764771845078927, + "grad_norm": 0.609375, + "learning_rate": 1.8264650502750946e-05, + "loss": 1.5161, + "step": 2247 + }, + { + "epoch": 0.3878202363495213, + "grad_norm": 0.65625, + "learning_rate": 1.8263117811371333e-05, + "loss": 1.4975, + "step": 2248 + }, + { + "epoch": 0.38799275424825325, + "grad_norm": 0.80859375, + "learning_rate": 1.8261584507808318e-05, + "loss": 1.4637, + "step": 2249 + }, + { + "epoch": 0.38816527214698526, + "grad_norm": 0.64453125, + "learning_rate": 1.8260050592175507e-05, + "loss": 1.4393, + "step": 2250 + }, + { + "epoch": 0.3883377900457172, + "grad_norm": 0.5859375, + "learning_rate": 1.8258516064586532e-05, + "loss": 1.4482, + "step": 2251 + }, + { + "epoch": 0.38851030794444924, + "grad_norm": 0.8359375, + "learning_rate": 1.825698092515509e-05, + "loss": 1.5549, + "step": 2252 + }, + { + "epoch": 0.38868282584318126, + "grad_norm": 0.6640625, + "learning_rate": 1.8255445173994907e-05, + "loss": 1.4593, + "step": 2253 + }, + { + "epoch": 0.3888553437419132, + "grad_norm": 0.828125, + "learning_rate": 1.8253908811219764e-05, + "loss": 1.5047, + "step": 2254 + }, + { + "epoch": 0.38902786164064523, + "grad_norm": 0.69921875, + "learning_rate": 1.8252371836943483e-05, + "loss": 1.5405, + "step": 2255 + }, + { + "epoch": 0.3892003795393772, + "grad_norm": 0.9296875, + "learning_rate": 1.825083425127993e-05, + "loss": 1.4635, + "step": 2256 + }, + { + "epoch": 0.3893728974381092, + "grad_norm": 0.66796875, + "learning_rate": 1.8249296054343026e-05, + "loss": 1.5867, + "step": 2257 + }, + { + "epoch": 0.38954541533684117, + "grad_norm": 0.87109375, + "learning_rate": 1.8247757246246726e-05, + "loss": 1.4289, + "step": 2258 + }, + { + "epoch": 0.3897179332355732, + "grad_norm": 0.65625, + "learning_rate": 1.824621782710503e-05, + "loss": 1.4538, + "step": 2259 + }, + { + "epoch": 0.3898904511343052, + "grad_norm": 0.72265625, + "learning_rate": 1.8244677797032e-05, + "loss": 1.4613, + "step": 2260 + }, + { + "epoch": 0.39006296903303717, + "grad_norm": 0.76171875, + "learning_rate": 1.824313715614172e-05, + "loss": 1.5135, + "step": 2261 + }, + { + "epoch": 0.3902354869317692, + "grad_norm": 0.62890625, + "learning_rate": 1.8241595904548336e-05, + "loss": 1.4293, + "step": 2262 + }, + { + "epoch": 0.39040800483050114, + "grad_norm": 0.703125, + "learning_rate": 1.8240054042366026e-05, + "loss": 1.4713, + "step": 2263 + }, + { + "epoch": 0.39058052272923316, + "grad_norm": 0.73046875, + "learning_rate": 1.8238511569709033e-05, + "loss": 1.5722, + "step": 2264 + }, + { + "epoch": 0.3907530406279652, + "grad_norm": 0.625, + "learning_rate": 1.823696848669162e-05, + "loss": 1.5584, + "step": 2265 + }, + { + "epoch": 0.39092555852669714, + "grad_norm": 0.84375, + "learning_rate": 1.823542479342812e-05, + "loss": 1.5078, + "step": 2266 + }, + { + "epoch": 0.39109807642542915, + "grad_norm": 0.7421875, + "learning_rate": 1.823388049003289e-05, + "loss": 1.5375, + "step": 2267 + }, + { + "epoch": 0.3912705943241611, + "grad_norm": 0.62109375, + "learning_rate": 1.823233557662035e-05, + "loss": 1.5285, + "step": 2268 + }, + { + "epoch": 0.39144311222289313, + "grad_norm": 0.9609375, + "learning_rate": 1.823079005330495e-05, + "loss": 1.4881, + "step": 2269 + }, + { + "epoch": 0.3916156301216251, + "grad_norm": 0.7109375, + "learning_rate": 1.8229243920201194e-05, + "loss": 1.5836, + "step": 2270 + }, + { + "epoch": 0.3917881480203571, + "grad_norm": 0.625, + "learning_rate": 1.822769717742363e-05, + "loss": 1.498, + "step": 2271 + }, + { + "epoch": 0.3919606659190891, + "grad_norm": 0.75390625, + "learning_rate": 1.822614982508685e-05, + "loss": 1.5081, + "step": 2272 + }, + { + "epoch": 0.3921331838178211, + "grad_norm": 0.66796875, + "learning_rate": 1.8224601863305495e-05, + "loss": 1.4186, + "step": 2273 + }, + { + "epoch": 0.3923057017165531, + "grad_norm": 0.62109375, + "learning_rate": 1.822305329219424e-05, + "loss": 1.4982, + "step": 2274 + }, + { + "epoch": 0.39247821961528506, + "grad_norm": 0.83984375, + "learning_rate": 1.8221504111867817e-05, + "loss": 1.4796, + "step": 2275 + }, + { + "epoch": 0.3926507375140171, + "grad_norm": 0.76171875, + "learning_rate": 1.8219954322441e-05, + "loss": 1.5749, + "step": 2276 + }, + { + "epoch": 0.3928232554127491, + "grad_norm": 0.859375, + "learning_rate": 1.8218403924028608e-05, + "loss": 1.4309, + "step": 2277 + }, + { + "epoch": 0.39299577331148106, + "grad_norm": 0.85546875, + "learning_rate": 1.82168529167455e-05, + "loss": 1.5298, + "step": 2278 + }, + { + "epoch": 0.3931682912102131, + "grad_norm": 0.65234375, + "learning_rate": 1.8215301300706584e-05, + "loss": 1.4536, + "step": 2279 + }, + { + "epoch": 0.39334080910894503, + "grad_norm": 0.71875, + "learning_rate": 1.821374907602682e-05, + "loss": 1.4692, + "step": 2280 + }, + { + "epoch": 0.39351332700767705, + "grad_norm": 0.890625, + "learning_rate": 1.8212196242821206e-05, + "loss": 1.4213, + "step": 2281 + }, + { + "epoch": 0.39368584490640907, + "grad_norm": 0.59765625, + "learning_rate": 1.8210642801204775e-05, + "loss": 1.4134, + "step": 2282 + }, + { + "epoch": 0.393858362805141, + "grad_norm": 0.66015625, + "learning_rate": 1.820908875129263e-05, + "loss": 1.4815, + "step": 2283 + }, + { + "epoch": 0.39403088070387304, + "grad_norm": 0.65234375, + "learning_rate": 1.820753409319989e-05, + "loss": 1.4424, + "step": 2284 + }, + { + "epoch": 0.394203398602605, + "grad_norm": 0.6484375, + "learning_rate": 1.8205978827041745e-05, + "loss": 1.4839, + "step": 2285 + }, + { + "epoch": 0.394375916501337, + "grad_norm": 1.171875, + "learning_rate": 1.8204422952933416e-05, + "loss": 1.4836, + "step": 2286 + }, + { + "epoch": 0.394548434400069, + "grad_norm": 0.74609375, + "learning_rate": 1.8202866470990172e-05, + "loss": 1.5224, + "step": 2287 + }, + { + "epoch": 0.394720952298801, + "grad_norm": 0.6484375, + "learning_rate": 1.8201309381327324e-05, + "loss": 1.3705, + "step": 2288 + }, + { + "epoch": 0.394893470197533, + "grad_norm": 0.67578125, + "learning_rate": 1.819975168406023e-05, + "loss": 1.4925, + "step": 2289 + }, + { + "epoch": 0.395065988096265, + "grad_norm": 0.77734375, + "learning_rate": 1.81981933793043e-05, + "loss": 1.5281, + "step": 2290 + }, + { + "epoch": 0.395238505994997, + "grad_norm": 0.85546875, + "learning_rate": 1.8196634467174982e-05, + "loss": 1.5847, + "step": 2291 + }, + { + "epoch": 0.39541102389372895, + "grad_norm": 0.859375, + "learning_rate": 1.8195074947787764e-05, + "loss": 1.5413, + "step": 2292 + }, + { + "epoch": 0.39558354179246097, + "grad_norm": 0.734375, + "learning_rate": 1.8193514821258188e-05, + "loss": 1.306, + "step": 2293 + }, + { + "epoch": 0.395756059691193, + "grad_norm": 0.5859375, + "learning_rate": 1.819195408770184e-05, + "loss": 1.4656, + "step": 2294 + }, + { + "epoch": 0.39592857758992495, + "grad_norm": 0.65234375, + "learning_rate": 1.819039274723435e-05, + "loss": 1.4589, + "step": 2295 + }, + { + "epoch": 0.39610109548865696, + "grad_norm": 0.69921875, + "learning_rate": 1.818883079997139e-05, + "loss": 1.5112, + "step": 2296 + }, + { + "epoch": 0.3962736133873889, + "grad_norm": 0.6640625, + "learning_rate": 1.8187268246028672e-05, + "loss": 1.4949, + "step": 2297 + }, + { + "epoch": 0.39644613128612094, + "grad_norm": 0.59375, + "learning_rate": 1.818570508552197e-05, + "loss": 1.4868, + "step": 2298 + }, + { + "epoch": 0.39661864918485296, + "grad_norm": 0.57421875, + "learning_rate": 1.818414131856709e-05, + "loss": 1.521, + "step": 2299 + }, + { + "epoch": 0.3967911670835849, + "grad_norm": 0.6484375, + "learning_rate": 1.818257694527988e-05, + "loss": 1.3609, + "step": 2300 + }, + { + "epoch": 0.3967911670835849, + "eval_loss": 1.4656120538711548, + "eval_runtime": 10.8637, + "eval_samples_per_second": 94.259, + "eval_steps_per_second": 23.565, + "step": 2300 + }, + { + "epoch": 0.39696368498231693, + "grad_norm": 0.6328125, + "learning_rate": 1.8181011965776244e-05, + "loss": 1.4911, + "step": 2301 + }, + { + "epoch": 0.3971362028810489, + "grad_norm": 0.60546875, + "learning_rate": 1.8179446380172127e-05, + "loss": 1.6361, + "step": 2302 + }, + { + "epoch": 0.3973087207797809, + "grad_norm": 0.625, + "learning_rate": 1.8177880188583513e-05, + "loss": 1.4886, + "step": 2303 + }, + { + "epoch": 0.39748123867851287, + "grad_norm": 0.578125, + "learning_rate": 1.8176313391126438e-05, + "loss": 1.3953, + "step": 2304 + }, + { + "epoch": 0.3976537565772449, + "grad_norm": 0.76953125, + "learning_rate": 1.817474598791698e-05, + "loss": 1.5849, + "step": 2305 + }, + { + "epoch": 0.3978262744759769, + "grad_norm": 0.59375, + "learning_rate": 1.8173177979071256e-05, + "loss": 1.434, + "step": 2306 + }, + { + "epoch": 0.39799879237470887, + "grad_norm": 0.640625, + "learning_rate": 1.8171609364705443e-05, + "loss": 1.6028, + "step": 2307 + }, + { + "epoch": 0.3981713102734409, + "grad_norm": 0.7734375, + "learning_rate": 1.8170040144935747e-05, + "loss": 1.4931, + "step": 2308 + }, + { + "epoch": 0.39834382817217284, + "grad_norm": 0.65625, + "learning_rate": 1.816847031987843e-05, + "loss": 1.4978, + "step": 2309 + }, + { + "epoch": 0.39851634607090486, + "grad_norm": 0.79296875, + "learning_rate": 1.8166899889649795e-05, + "loss": 1.4645, + "step": 2310 + }, + { + "epoch": 0.3986888639696369, + "grad_norm": 0.67578125, + "learning_rate": 1.8165328854366183e-05, + "loss": 1.5468, + "step": 2311 + }, + { + "epoch": 0.39886138186836884, + "grad_norm": 0.66796875, + "learning_rate": 1.8163757214143993e-05, + "loss": 1.5296, + "step": 2312 + }, + { + "epoch": 0.39903389976710085, + "grad_norm": 0.671875, + "learning_rate": 1.8162184969099658e-05, + "loss": 1.4244, + "step": 2313 + }, + { + "epoch": 0.3992064176658328, + "grad_norm": 0.59765625, + "learning_rate": 1.816061211934966e-05, + "loss": 1.5313, + "step": 2314 + }, + { + "epoch": 0.39937893556456483, + "grad_norm": 0.65625, + "learning_rate": 1.8159038665010528e-05, + "loss": 1.4613, + "step": 2315 + }, + { + "epoch": 0.3995514534632968, + "grad_norm": 0.65625, + "learning_rate": 1.8157464606198832e-05, + "loss": 1.4243, + "step": 2316 + }, + { + "epoch": 0.3997239713620288, + "grad_norm": 0.609375, + "learning_rate": 1.8155889943031186e-05, + "loss": 1.5109, + "step": 2317 + }, + { + "epoch": 0.3998964892607608, + "grad_norm": 0.6171875, + "learning_rate": 1.815431467562425e-05, + "loss": 1.5602, + "step": 2318 + }, + { + "epoch": 0.4000690071594928, + "grad_norm": 0.65625, + "learning_rate": 1.8152738804094737e-05, + "loss": 1.499, + "step": 2319 + }, + { + "epoch": 0.4002415250582248, + "grad_norm": 0.62890625, + "learning_rate": 1.815116232855939e-05, + "loss": 1.4352, + "step": 2320 + }, + { + "epoch": 0.40041404295695676, + "grad_norm": 0.67578125, + "learning_rate": 1.8149585249135008e-05, + "loss": 1.552, + "step": 2321 + }, + { + "epoch": 0.4005865608556888, + "grad_norm": 0.64453125, + "learning_rate": 1.814800756593843e-05, + "loss": 1.5098, + "step": 2322 + }, + { + "epoch": 0.4007590787544208, + "grad_norm": 0.95703125, + "learning_rate": 1.814642927908654e-05, + "loss": 1.5874, + "step": 2323 + }, + { + "epoch": 0.40093159665315276, + "grad_norm": 0.81640625, + "learning_rate": 1.814485038869627e-05, + "loss": 1.5504, + "step": 2324 + }, + { + "epoch": 0.40110411455188477, + "grad_norm": 0.83984375, + "learning_rate": 1.814327089488459e-05, + "loss": 1.5221, + "step": 2325 + }, + { + "epoch": 0.40127663245061673, + "grad_norm": 0.62109375, + "learning_rate": 1.814169079776852e-05, + "loss": 1.512, + "step": 2326 + }, + { + "epoch": 0.40144915034934875, + "grad_norm": 0.73046875, + "learning_rate": 1.8140110097465123e-05, + "loss": 1.5012, + "step": 2327 + }, + { + "epoch": 0.40162166824808077, + "grad_norm": 0.828125, + "learning_rate": 1.8138528794091514e-05, + "loss": 1.5045, + "step": 2328 + }, + { + "epoch": 0.4017941861468127, + "grad_norm": 1.1796875, + "learning_rate": 1.813694688776483e-05, + "loss": 1.4746, + "step": 2329 + }, + { + "epoch": 0.40196670404554474, + "grad_norm": 0.7734375, + "learning_rate": 1.8135364378602288e-05, + "loss": 1.5166, + "step": 2330 + }, + { + "epoch": 0.4021392219442767, + "grad_norm": 0.59765625, + "learning_rate": 1.8133781266721114e-05, + "loss": 1.4975, + "step": 2331 + }, + { + "epoch": 0.4023117398430087, + "grad_norm": 0.6796875, + "learning_rate": 1.8132197552238608e-05, + "loss": 1.5039, + "step": 2332 + }, + { + "epoch": 0.4024842577417407, + "grad_norm": 0.65234375, + "learning_rate": 1.813061323527209e-05, + "loss": 1.5281, + "step": 2333 + }, + { + "epoch": 0.4026567756404727, + "grad_norm": 0.67578125, + "learning_rate": 1.8129028315938944e-05, + "loss": 1.5463, + "step": 2334 + }, + { + "epoch": 0.4028292935392047, + "grad_norm": 0.59765625, + "learning_rate": 1.8127442794356585e-05, + "loss": 1.4572, + "step": 2335 + }, + { + "epoch": 0.4030018114379367, + "grad_norm": 0.8359375, + "learning_rate": 1.8125856670642485e-05, + "loss": 1.5668, + "step": 2336 + }, + { + "epoch": 0.4031743293366687, + "grad_norm": 0.703125, + "learning_rate": 1.8124269944914147e-05, + "loss": 1.4871, + "step": 2337 + }, + { + "epoch": 0.40334684723540065, + "grad_norm": 1.046875, + "learning_rate": 1.812268261728913e-05, + "loss": 1.5133, + "step": 2338 + }, + { + "epoch": 0.40351936513413267, + "grad_norm": 0.69140625, + "learning_rate": 1.812109468788503e-05, + "loss": 1.3911, + "step": 2339 + }, + { + "epoch": 0.4036918830328647, + "grad_norm": 0.90234375, + "learning_rate": 1.8119506156819495e-05, + "loss": 1.5736, + "step": 2340 + }, + { + "epoch": 0.40386440093159665, + "grad_norm": 0.8125, + "learning_rate": 1.811791702421021e-05, + "loss": 1.5105, + "step": 2341 + }, + { + "epoch": 0.40403691883032866, + "grad_norm": 0.87890625, + "learning_rate": 1.811632729017491e-05, + "loss": 1.5973, + "step": 2342 + }, + { + "epoch": 0.4042094367290606, + "grad_norm": 0.890625, + "learning_rate": 1.8114736954831367e-05, + "loss": 1.4065, + "step": 2343 + }, + { + "epoch": 0.40438195462779264, + "grad_norm": 0.90234375, + "learning_rate": 1.8113146018297413e-05, + "loss": 1.4329, + "step": 2344 + }, + { + "epoch": 0.4045544725265246, + "grad_norm": 0.6875, + "learning_rate": 1.81115544806909e-05, + "loss": 1.55, + "step": 2345 + }, + { + "epoch": 0.4047269904252566, + "grad_norm": 0.75390625, + "learning_rate": 1.8109962342129757e-05, + "loss": 1.4663, + "step": 2346 + }, + { + "epoch": 0.40489950832398863, + "grad_norm": 0.69921875, + "learning_rate": 1.8108369602731928e-05, + "loss": 1.5616, + "step": 2347 + }, + { + "epoch": 0.4050720262227206, + "grad_norm": 0.88671875, + "learning_rate": 1.810677626261541e-05, + "loss": 1.4522, + "step": 2348 + }, + { + "epoch": 0.4052445441214526, + "grad_norm": 0.66796875, + "learning_rate": 1.810518232189826e-05, + "loss": 1.5224, + "step": 2349 + }, + { + "epoch": 0.40541706202018457, + "grad_norm": 0.640625, + "learning_rate": 1.8103587780698556e-05, + "loss": 1.428, + "step": 2350 + }, + { + "epoch": 0.4055895799189166, + "grad_norm": 2.296875, + "learning_rate": 1.8101992639134438e-05, + "loss": 1.5132, + "step": 2351 + }, + { + "epoch": 0.4057620978176486, + "grad_norm": 0.7421875, + "learning_rate": 1.810039689732408e-05, + "loss": 1.4774, + "step": 2352 + }, + { + "epoch": 0.40593461571638056, + "grad_norm": 0.65234375, + "learning_rate": 1.8098800555385707e-05, + "loss": 1.5203, + "step": 2353 + }, + { + "epoch": 0.4061071336151126, + "grad_norm": 0.640625, + "learning_rate": 1.8097203613437586e-05, + "loss": 1.4957, + "step": 2354 + }, + { + "epoch": 0.40627965151384454, + "grad_norm": 0.625, + "learning_rate": 1.8095606071598028e-05, + "loss": 1.4267, + "step": 2355 + }, + { + "epoch": 0.40645216941257656, + "grad_norm": 0.67578125, + "learning_rate": 1.8094007929985387e-05, + "loss": 1.5352, + "step": 2356 + }, + { + "epoch": 0.4066246873113086, + "grad_norm": 0.55859375, + "learning_rate": 1.8092409188718064e-05, + "loss": 1.5219, + "step": 2357 + }, + { + "epoch": 0.40679720521004054, + "grad_norm": 0.59375, + "learning_rate": 1.8090809847914506e-05, + "loss": 1.4196, + "step": 2358 + }, + { + "epoch": 0.40696972310877255, + "grad_norm": 0.65625, + "learning_rate": 1.80892099076932e-05, + "loss": 1.5713, + "step": 2359 + }, + { + "epoch": 0.4071422410075045, + "grad_norm": 0.6015625, + "learning_rate": 1.808760936817268e-05, + "loss": 1.4479, + "step": 2360 + }, + { + "epoch": 0.40731475890623653, + "grad_norm": 0.609375, + "learning_rate": 1.8086008229471527e-05, + "loss": 1.5341, + "step": 2361 + }, + { + "epoch": 0.4074872768049685, + "grad_norm": 0.62109375, + "learning_rate": 1.8084406491708358e-05, + "loss": 1.5566, + "step": 2362 + }, + { + "epoch": 0.4076597947037005, + "grad_norm": 0.64453125, + "learning_rate": 1.8082804155001842e-05, + "loss": 1.4349, + "step": 2363 + }, + { + "epoch": 0.4078323126024325, + "grad_norm": 0.78515625, + "learning_rate": 1.8081201219470694e-05, + "loss": 1.4505, + "step": 2364 + }, + { + "epoch": 0.4080048305011645, + "grad_norm": 0.640625, + "learning_rate": 1.807959768523366e-05, + "loss": 1.4418, + "step": 2365 + }, + { + "epoch": 0.4081773483998965, + "grad_norm": 0.66796875, + "learning_rate": 1.807799355240955e-05, + "loss": 1.5607, + "step": 2366 + }, + { + "epoch": 0.40834986629862846, + "grad_norm": 0.87109375, + "learning_rate": 1.8076388821117204e-05, + "loss": 1.5266, + "step": 2367 + }, + { + "epoch": 0.4085223841973605, + "grad_norm": 0.609375, + "learning_rate": 1.807478349147551e-05, + "loss": 1.5094, + "step": 2368 + }, + { + "epoch": 0.4086949020960925, + "grad_norm": 0.71875, + "learning_rate": 1.8073177563603403e-05, + "loss": 1.4372, + "step": 2369 + }, + { + "epoch": 0.40886741999482445, + "grad_norm": 0.69921875, + "learning_rate": 1.8071571037619856e-05, + "loss": 1.5019, + "step": 2370 + }, + { + "epoch": 0.40903993789355647, + "grad_norm": 0.60546875, + "learning_rate": 1.8069963913643893e-05, + "loss": 1.5055, + "step": 2371 + }, + { + "epoch": 0.40921245579228843, + "grad_norm": 0.62109375, + "learning_rate": 1.806835619179458e-05, + "loss": 1.4157, + "step": 2372 + }, + { + "epoch": 0.40938497369102045, + "grad_norm": 0.69140625, + "learning_rate": 1.8066747872191028e-05, + "loss": 1.4324, + "step": 2373 + }, + { + "epoch": 0.4095574915897524, + "grad_norm": 0.60546875, + "learning_rate": 1.806513895495239e-05, + "loss": 1.4795, + "step": 2374 + }, + { + "epoch": 0.4097300094884844, + "grad_norm": 0.6015625, + "learning_rate": 1.8063529440197866e-05, + "loss": 1.504, + "step": 2375 + }, + { + "epoch": 0.40990252738721644, + "grad_norm": 0.703125, + "learning_rate": 1.8061919328046695e-05, + "loss": 1.4982, + "step": 2376 + }, + { + "epoch": 0.4100750452859484, + "grad_norm": 1.0625, + "learning_rate": 1.806030861861817e-05, + "loss": 1.4587, + "step": 2377 + }, + { + "epoch": 0.4102475631846804, + "grad_norm": 0.59765625, + "learning_rate": 1.8058697312031615e-05, + "loss": 1.4981, + "step": 2378 + }, + { + "epoch": 0.4104200810834124, + "grad_norm": 0.72265625, + "learning_rate": 1.8057085408406415e-05, + "loss": 1.4198, + "step": 2379 + }, + { + "epoch": 0.4105925989821444, + "grad_norm": 0.78515625, + "learning_rate": 1.805547290786198e-05, + "loss": 1.4076, + "step": 2380 + }, + { + "epoch": 0.4107651168808764, + "grad_norm": 0.640625, + "learning_rate": 1.8053859810517785e-05, + "loss": 1.4019, + "step": 2381 + }, + { + "epoch": 0.4109376347796084, + "grad_norm": 0.7734375, + "learning_rate": 1.805224611649333e-05, + "loss": 1.4232, + "step": 2382 + }, + { + "epoch": 0.4111101526783404, + "grad_norm": 0.77734375, + "learning_rate": 1.805063182590817e-05, + "loss": 1.4582, + "step": 2383 + }, + { + "epoch": 0.41128267057707235, + "grad_norm": 0.60546875, + "learning_rate": 1.8049016938881897e-05, + "loss": 1.4875, + "step": 2384 + }, + { + "epoch": 0.41145518847580437, + "grad_norm": 0.7265625, + "learning_rate": 1.8047401455534162e-05, + "loss": 1.4733, + "step": 2385 + }, + { + "epoch": 0.4116277063745364, + "grad_norm": 0.703125, + "learning_rate": 1.8045785375984642e-05, + "loss": 1.5484, + "step": 2386 + }, + { + "epoch": 0.41180022427326834, + "grad_norm": 0.6640625, + "learning_rate": 1.8044168700353073e-05, + "loss": 1.4825, + "step": 2387 + }, + { + "epoch": 0.41197274217200036, + "grad_norm": 0.8984375, + "learning_rate": 1.804255142875922e-05, + "loss": 1.5638, + "step": 2388 + }, + { + "epoch": 0.4121452600707323, + "grad_norm": 0.80078125, + "learning_rate": 1.8040933561322905e-05, + "loss": 1.5131, + "step": 2389 + }, + { + "epoch": 0.41231777796946434, + "grad_norm": 0.85546875, + "learning_rate": 1.8039315098163993e-05, + "loss": 1.421, + "step": 2390 + }, + { + "epoch": 0.4124902958681963, + "grad_norm": 1.0, + "learning_rate": 1.8037696039402385e-05, + "loss": 1.4433, + "step": 2391 + }, + { + "epoch": 0.4126628137669283, + "grad_norm": 0.69140625, + "learning_rate": 1.8036076385158034e-05, + "loss": 1.4687, + "step": 2392 + }, + { + "epoch": 0.41283533166566033, + "grad_norm": 0.8359375, + "learning_rate": 1.803445613555093e-05, + "loss": 1.5166, + "step": 2393 + }, + { + "epoch": 0.4130078495643923, + "grad_norm": 0.58984375, + "learning_rate": 1.8032835290701115e-05, + "loss": 1.4204, + "step": 2394 + }, + { + "epoch": 0.4131803674631243, + "grad_norm": 0.73046875, + "learning_rate": 1.803121385072867e-05, + "loss": 1.4271, + "step": 2395 + }, + { + "epoch": 0.41335288536185627, + "grad_norm": 1.359375, + "learning_rate": 1.802959181575372e-05, + "loss": 1.4582, + "step": 2396 + }, + { + "epoch": 0.4135254032605883, + "grad_norm": 0.76953125, + "learning_rate": 1.802796918589644e-05, + "loss": 1.5298, + "step": 2397 + }, + { + "epoch": 0.4136979211593203, + "grad_norm": 0.578125, + "learning_rate": 1.802634596127704e-05, + "loss": 1.459, + "step": 2398 + }, + { + "epoch": 0.41387043905805226, + "grad_norm": 0.625, + "learning_rate": 1.8024722142015784e-05, + "loss": 1.4324, + "step": 2399 + }, + { + "epoch": 0.4140429569567843, + "grad_norm": 0.90234375, + "learning_rate": 1.8023097728232967e-05, + "loss": 1.437, + "step": 2400 + }, + { + "epoch": 0.4140429569567843, + "eval_loss": 1.4613255262374878, + "eval_runtime": 10.8914, + "eval_samples_per_second": 94.019, + "eval_steps_per_second": 23.505, + "step": 2400 + }, + { + "epoch": 0.41421547485551624, + "grad_norm": 0.62109375, + "learning_rate": 1.802147272004894e-05, + "loss": 1.4324, + "step": 2401 + }, + { + "epoch": 0.41438799275424826, + "grad_norm": 0.99609375, + "learning_rate": 1.8019847117584092e-05, + "loss": 1.441, + "step": 2402 + }, + { + "epoch": 0.4145605106529803, + "grad_norm": 0.61328125, + "learning_rate": 1.8018220920958864e-05, + "loss": 1.4981, + "step": 2403 + }, + { + "epoch": 0.41473302855171224, + "grad_norm": 0.71484375, + "learning_rate": 1.8016594130293725e-05, + "loss": 1.5251, + "step": 2404 + }, + { + "epoch": 0.41490554645044425, + "grad_norm": 0.6640625, + "learning_rate": 1.8014966745709202e-05, + "loss": 1.5364, + "step": 2405 + }, + { + "epoch": 0.4150780643491762, + "grad_norm": 0.75, + "learning_rate": 1.8013338767325866e-05, + "loss": 1.4724, + "step": 2406 + }, + { + "epoch": 0.41525058224790823, + "grad_norm": 0.83203125, + "learning_rate": 1.8011710195264323e-05, + "loss": 1.5819, + "step": 2407 + }, + { + "epoch": 0.4154231001466402, + "grad_norm": 0.65625, + "learning_rate": 1.8010081029645232e-05, + "loss": 1.4644, + "step": 2408 + }, + { + "epoch": 0.4155956180453722, + "grad_norm": 0.67578125, + "learning_rate": 1.8008451270589288e-05, + "loss": 1.519, + "step": 2409 + }, + { + "epoch": 0.4157681359441042, + "grad_norm": 0.58984375, + "learning_rate": 1.8006820918217233e-05, + "loss": 1.4889, + "step": 2410 + }, + { + "epoch": 0.4159406538428362, + "grad_norm": 0.6640625, + "learning_rate": 1.8005189972649856e-05, + "loss": 1.4611, + "step": 2411 + }, + { + "epoch": 0.4161131717415682, + "grad_norm": 0.66015625, + "learning_rate": 1.800355843400799e-05, + "loss": 1.5453, + "step": 2412 + }, + { + "epoch": 0.41628568964030016, + "grad_norm": 0.82421875, + "learning_rate": 1.8001926302412503e-05, + "loss": 1.4353, + "step": 2413 + }, + { + "epoch": 0.4164582075390322, + "grad_norm": 0.65234375, + "learning_rate": 1.8000293577984318e-05, + "loss": 1.4559, + "step": 2414 + }, + { + "epoch": 0.4166307254377642, + "grad_norm": 0.66015625, + "learning_rate": 1.7998660260844397e-05, + "loss": 1.395, + "step": 2415 + }, + { + "epoch": 0.41680324333649615, + "grad_norm": 0.73828125, + "learning_rate": 1.7997026351113746e-05, + "loss": 1.3569, + "step": 2416 + }, + { + "epoch": 0.41697576123522817, + "grad_norm": 0.66796875, + "learning_rate": 1.7995391848913414e-05, + "loss": 1.5016, + "step": 2417 + }, + { + "epoch": 0.41714827913396013, + "grad_norm": 0.67578125, + "learning_rate": 1.7993756754364497e-05, + "loss": 1.5362, + "step": 2418 + }, + { + "epoch": 0.41732079703269215, + "grad_norm": 0.71875, + "learning_rate": 1.7992121067588134e-05, + "loss": 1.4501, + "step": 2419 + }, + { + "epoch": 0.4174933149314241, + "grad_norm": 0.9140625, + "learning_rate": 1.7990484788705506e-05, + "loss": 1.4937, + "step": 2420 + }, + { + "epoch": 0.4176658328301561, + "grad_norm": 0.6328125, + "learning_rate": 1.7988847917837832e-05, + "loss": 1.4663, + "step": 2421 + }, + { + "epoch": 0.41783835072888814, + "grad_norm": 0.61328125, + "learning_rate": 1.7987210455106395e-05, + "loss": 1.5524, + "step": 2422 + }, + { + "epoch": 0.4180108686276201, + "grad_norm": 0.76171875, + "learning_rate": 1.7985572400632496e-05, + "loss": 1.5553, + "step": 2423 + }, + { + "epoch": 0.4181833865263521, + "grad_norm": 0.71484375, + "learning_rate": 1.7983933754537498e-05, + "loss": 1.4366, + "step": 2424 + }, + { + "epoch": 0.4183559044250841, + "grad_norm": 24.875, + "learning_rate": 1.7982294516942804e-05, + "loss": 1.5607, + "step": 2425 + }, + { + "epoch": 0.4185284223238161, + "grad_norm": 0.69921875, + "learning_rate": 1.7980654687969853e-05, + "loss": 1.4021, + "step": 2426 + }, + { + "epoch": 0.4187009402225481, + "grad_norm": 0.69140625, + "learning_rate": 1.797901426774014e-05, + "loss": 1.4621, + "step": 2427 + }, + { + "epoch": 0.4188734581212801, + "grad_norm": 0.6640625, + "learning_rate": 1.7977373256375194e-05, + "loss": 1.4756, + "step": 2428 + }, + { + "epoch": 0.4190459760200121, + "grad_norm": 0.67578125, + "learning_rate": 1.7975731653996594e-05, + "loss": 1.4211, + "step": 2429 + }, + { + "epoch": 0.41921849391874405, + "grad_norm": 0.62890625, + "learning_rate": 1.7974089460725958e-05, + "loss": 1.4056, + "step": 2430 + }, + { + "epoch": 0.41939101181747607, + "grad_norm": 0.640625, + "learning_rate": 1.797244667668495e-05, + "loss": 1.4528, + "step": 2431 + }, + { + "epoch": 0.4195635297162081, + "grad_norm": 0.98046875, + "learning_rate": 1.7970803301995278e-05, + "loss": 1.4564, + "step": 2432 + }, + { + "epoch": 0.41973604761494004, + "grad_norm": 0.65234375, + "learning_rate": 1.796915933677869e-05, + "loss": 1.3605, + "step": 2433 + }, + { + "epoch": 0.41990856551367206, + "grad_norm": 0.78515625, + "learning_rate": 1.7967514781156988e-05, + "loss": 1.4233, + "step": 2434 + }, + { + "epoch": 0.420081083412404, + "grad_norm": 0.703125, + "learning_rate": 1.7965869635252005e-05, + "loss": 1.3589, + "step": 2435 + }, + { + "epoch": 0.42025360131113604, + "grad_norm": 0.78125, + "learning_rate": 1.7964223899185633e-05, + "loss": 1.564, + "step": 2436 + }, + { + "epoch": 0.420426119209868, + "grad_norm": 0.86328125, + "learning_rate": 1.7962577573079785e-05, + "loss": 1.4458, + "step": 2437 + }, + { + "epoch": 0.4205986371086, + "grad_norm": 0.73828125, + "learning_rate": 1.796093065705644e-05, + "loss": 1.4488, + "step": 2438 + }, + { + "epoch": 0.42077115500733203, + "grad_norm": 1.0234375, + "learning_rate": 1.795928315123761e-05, + "loss": 1.6364, + "step": 2439 + }, + { + "epoch": 0.420943672906064, + "grad_norm": 1.03125, + "learning_rate": 1.7957635055745354e-05, + "loss": 1.5654, + "step": 2440 + }, + { + "epoch": 0.421116190804796, + "grad_norm": 0.921875, + "learning_rate": 1.7955986370701768e-05, + "loss": 1.5302, + "step": 2441 + }, + { + "epoch": 0.42128870870352797, + "grad_norm": 1.734375, + "learning_rate": 1.7954337096229e-05, + "loss": 1.4639, + "step": 2442 + }, + { + "epoch": 0.42146122660226, + "grad_norm": 0.94921875, + "learning_rate": 1.7952687232449242e-05, + "loss": 1.422, + "step": 2443 + }, + { + "epoch": 0.421633744500992, + "grad_norm": 0.71875, + "learning_rate": 1.7951036779484723e-05, + "loss": 1.4756, + "step": 2444 + }, + { + "epoch": 0.42180626239972396, + "grad_norm": 0.59375, + "learning_rate": 1.7949385737457716e-05, + "loss": 1.4052, + "step": 2445 + }, + { + "epoch": 0.421978780298456, + "grad_norm": 0.77734375, + "learning_rate": 1.7947734106490545e-05, + "loss": 1.4661, + "step": 2446 + }, + { + "epoch": 0.42215129819718794, + "grad_norm": 0.75390625, + "learning_rate": 1.7946081886705574e-05, + "loss": 1.5564, + "step": 2447 + }, + { + "epoch": 0.42232381609591996, + "grad_norm": 0.6953125, + "learning_rate": 1.7944429078225204e-05, + "loss": 1.4868, + "step": 2448 + }, + { + "epoch": 0.4224963339946519, + "grad_norm": 0.640625, + "learning_rate": 1.794277568117189e-05, + "loss": 1.4422, + "step": 2449 + }, + { + "epoch": 0.42266885189338393, + "grad_norm": 0.74609375, + "learning_rate": 1.794112169566813e-05, + "loss": 1.4882, + "step": 2450 + }, + { + "epoch": 0.42284136979211595, + "grad_norm": 0.83984375, + "learning_rate": 1.793946712183645e-05, + "loss": 1.6392, + "step": 2451 + }, + { + "epoch": 0.4230138876908479, + "grad_norm": 0.66015625, + "learning_rate": 1.793781195979944e-05, + "loss": 1.517, + "step": 2452 + }, + { + "epoch": 0.42318640558957993, + "grad_norm": 0.79296875, + "learning_rate": 1.793615620967972e-05, + "loss": 1.4967, + "step": 2453 + }, + { + "epoch": 0.4233589234883119, + "grad_norm": 0.6953125, + "learning_rate": 1.7934499871599962e-05, + "loss": 1.5063, + "step": 2454 + }, + { + "epoch": 0.4235314413870439, + "grad_norm": 0.609375, + "learning_rate": 1.7932842945682877e-05, + "loss": 1.4665, + "step": 2455 + }, + { + "epoch": 0.4237039592857759, + "grad_norm": 0.765625, + "learning_rate": 1.7931185432051216e-05, + "loss": 1.4857, + "step": 2456 + }, + { + "epoch": 0.4238764771845079, + "grad_norm": 0.8828125, + "learning_rate": 1.7929527330827788e-05, + "loss": 1.4581, + "step": 2457 + }, + { + "epoch": 0.4240489950832399, + "grad_norm": 0.72265625, + "learning_rate": 1.7927868642135427e-05, + "loss": 1.5367, + "step": 2458 + }, + { + "epoch": 0.42422151298197186, + "grad_norm": 0.78125, + "learning_rate": 1.792620936609702e-05, + "loss": 1.5317, + "step": 2459 + }, + { + "epoch": 0.4243940308807039, + "grad_norm": 0.625, + "learning_rate": 1.79245495028355e-05, + "loss": 1.4564, + "step": 2460 + }, + { + "epoch": 0.4245665487794359, + "grad_norm": 0.66796875, + "learning_rate": 1.7922889052473834e-05, + "loss": 1.4537, + "step": 2461 + }, + { + "epoch": 0.42473906667816785, + "grad_norm": 0.91015625, + "learning_rate": 1.792122801513505e-05, + "loss": 1.4462, + "step": 2462 + }, + { + "epoch": 0.42491158457689987, + "grad_norm": 0.71875, + "learning_rate": 1.7919566390942193e-05, + "loss": 1.4502, + "step": 2463 + }, + { + "epoch": 0.42508410247563183, + "grad_norm": 0.671875, + "learning_rate": 1.791790418001838e-05, + "loss": 1.5052, + "step": 2464 + }, + { + "epoch": 0.42525662037436385, + "grad_norm": 0.76171875, + "learning_rate": 1.7916241382486745e-05, + "loss": 1.4147, + "step": 2465 + }, + { + "epoch": 0.4254291382730958, + "grad_norm": 0.62109375, + "learning_rate": 1.791457799847049e-05, + "loss": 1.4445, + "step": 2466 + }, + { + "epoch": 0.4256016561718278, + "grad_norm": 0.71875, + "learning_rate": 1.7912914028092845e-05, + "loss": 1.5408, + "step": 2467 + }, + { + "epoch": 0.42577417407055984, + "grad_norm": 0.6796875, + "learning_rate": 1.7911249471477085e-05, + "loss": 1.4918, + "step": 2468 + }, + { + "epoch": 0.4259466919692918, + "grad_norm": 0.66796875, + "learning_rate": 1.7909584328746533e-05, + "loss": 1.3957, + "step": 2469 + }, + { + "epoch": 0.4261192098680238, + "grad_norm": 0.5859375, + "learning_rate": 1.790791860002455e-05, + "loss": 1.4524, + "step": 2470 + }, + { + "epoch": 0.4262917277667558, + "grad_norm": 0.6171875, + "learning_rate": 1.790625228543455e-05, + "loss": 1.5006, + "step": 2471 + }, + { + "epoch": 0.4264642456654878, + "grad_norm": 0.6640625, + "learning_rate": 1.790458538509998e-05, + "loss": 1.6025, + "step": 2472 + }, + { + "epoch": 0.4266367635642198, + "grad_norm": 0.58984375, + "learning_rate": 1.790291789914433e-05, + "loss": 1.5626, + "step": 2473 + }, + { + "epoch": 0.4268092814629518, + "grad_norm": 0.734375, + "learning_rate": 1.7901249827691148e-05, + "loss": 1.5277, + "step": 2474 + }, + { + "epoch": 0.4269817993616838, + "grad_norm": 0.609375, + "learning_rate": 1.7899581170864003e-05, + "loss": 1.4732, + "step": 2475 + }, + { + "epoch": 0.42715431726041575, + "grad_norm": 0.8203125, + "learning_rate": 1.7897911928786535e-05, + "loss": 1.5614, + "step": 2476 + }, + { + "epoch": 0.42732683515914777, + "grad_norm": 0.62109375, + "learning_rate": 1.7896242101582393e-05, + "loss": 1.5511, + "step": 2477 + }, + { + "epoch": 0.4274993530578797, + "grad_norm": 0.5703125, + "learning_rate": 1.7894571689375305e-05, + "loss": 1.4692, + "step": 2478 + }, + { + "epoch": 0.42767187095661174, + "grad_norm": 0.59375, + "learning_rate": 1.7892900692289016e-05, + "loss": 1.4351, + "step": 2479 + }, + { + "epoch": 0.42784438885534376, + "grad_norm": 0.60546875, + "learning_rate": 1.7891229110447325e-05, + "loss": 1.4667, + "step": 2480 + }, + { + "epoch": 0.4280169067540757, + "grad_norm": 0.625, + "learning_rate": 1.7889556943974078e-05, + "loss": 1.4757, + "step": 2481 + }, + { + "epoch": 0.42818942465280774, + "grad_norm": 0.640625, + "learning_rate": 1.7887884192993155e-05, + "loss": 1.5014, + "step": 2482 + }, + { + "epoch": 0.4283619425515397, + "grad_norm": 0.6953125, + "learning_rate": 1.7886210857628485e-05, + "loss": 1.5169, + "step": 2483 + }, + { + "epoch": 0.4285344604502717, + "grad_norm": 0.75, + "learning_rate": 1.788453693800404e-05, + "loss": 1.4848, + "step": 2484 + }, + { + "epoch": 0.42870697834900373, + "grad_norm": 0.7265625, + "learning_rate": 1.7882862434243835e-05, + "loss": 1.4559, + "step": 2485 + }, + { + "epoch": 0.4288794962477357, + "grad_norm": 0.7265625, + "learning_rate": 1.7881187346471924e-05, + "loss": 1.5918, + "step": 2486 + }, + { + "epoch": 0.4290520141464677, + "grad_norm": 0.84765625, + "learning_rate": 1.7879511674812413e-05, + "loss": 1.4414, + "step": 2487 + }, + { + "epoch": 0.42922453204519967, + "grad_norm": 0.828125, + "learning_rate": 1.7877835419389444e-05, + "loss": 1.4403, + "step": 2488 + }, + { + "epoch": 0.4293970499439317, + "grad_norm": 0.6796875, + "learning_rate": 1.78761585803272e-05, + "loss": 1.57, + "step": 2489 + }, + { + "epoch": 0.4295695678426637, + "grad_norm": 0.87109375, + "learning_rate": 1.787448115774992e-05, + "loss": 1.4331, + "step": 2490 + }, + { + "epoch": 0.42974208574139566, + "grad_norm": 1.0390625, + "learning_rate": 1.787280315178187e-05, + "loss": 1.4587, + "step": 2491 + }, + { + "epoch": 0.4299146036401277, + "grad_norm": 0.61328125, + "learning_rate": 1.7871124562547372e-05, + "loss": 1.537, + "step": 2492 + }, + { + "epoch": 0.43008712153885964, + "grad_norm": 0.921875, + "learning_rate": 1.7869445390170787e-05, + "loss": 1.4619, + "step": 2493 + }, + { + "epoch": 0.43025963943759166, + "grad_norm": 0.59375, + "learning_rate": 1.7867765634776516e-05, + "loss": 1.4177, + "step": 2494 + }, + { + "epoch": 0.4304321573363236, + "grad_norm": 0.578125, + "learning_rate": 1.786608529648901e-05, + "loss": 1.4781, + "step": 2495 + }, + { + "epoch": 0.43060467523505563, + "grad_norm": 0.73046875, + "learning_rate": 1.786440437543275e-05, + "loss": 1.5143, + "step": 2496 + }, + { + "epoch": 0.43077719313378765, + "grad_norm": 0.6328125, + "learning_rate": 1.786272287173228e-05, + "loss": 1.383, + "step": 2497 + }, + { + "epoch": 0.4309497110325196, + "grad_norm": 0.73046875, + "learning_rate": 1.786104078551217e-05, + "loss": 1.551, + "step": 2498 + }, + { + "epoch": 0.4311222289312516, + "grad_norm": 0.60546875, + "learning_rate": 1.7859358116897034e-05, + "loss": 1.4651, + "step": 2499 + }, + { + "epoch": 0.4312947468299836, + "grad_norm": 0.77734375, + "learning_rate": 1.7857674866011546e-05, + "loss": 1.5387, + "step": 2500 + }, + { + "epoch": 0.4312947468299836, + "eval_loss": 1.4578332901000977, + "eval_runtime": 10.8462, + "eval_samples_per_second": 94.411, + "eval_steps_per_second": 23.603, + "step": 2500 + }, + { + "epoch": 0.4314672647287156, + "grad_norm": 0.7578125, + "learning_rate": 1.785599103298041e-05, + "loss": 1.3842, + "step": 2501 + }, + { + "epoch": 0.4316397826274476, + "grad_norm": 0.6171875, + "learning_rate": 1.7854306617928366e-05, + "loss": 1.5601, + "step": 2502 + }, + { + "epoch": 0.4318123005261796, + "grad_norm": 0.66796875, + "learning_rate": 1.7852621620980216e-05, + "loss": 1.4351, + "step": 2503 + }, + { + "epoch": 0.4319848184249116, + "grad_norm": 0.7578125, + "learning_rate": 1.785093604226079e-05, + "loss": 1.5191, + "step": 2504 + }, + { + "epoch": 0.43215733632364356, + "grad_norm": 0.69140625, + "learning_rate": 1.784924988189497e-05, + "loss": 1.4955, + "step": 2505 + }, + { + "epoch": 0.4323298542223756, + "grad_norm": 0.58203125, + "learning_rate": 1.7847563140007665e-05, + "loss": 1.435, + "step": 2506 + }, + { + "epoch": 0.4325023721211076, + "grad_norm": 0.71484375, + "learning_rate": 1.7845875816723855e-05, + "loss": 1.4883, + "step": 2507 + }, + { + "epoch": 0.43267489001983955, + "grad_norm": 0.58984375, + "learning_rate": 1.7844187912168543e-05, + "loss": 1.4455, + "step": 2508 + }, + { + "epoch": 0.43284740791857157, + "grad_norm": 0.65625, + "learning_rate": 1.784249942646678e-05, + "loss": 1.4211, + "step": 2509 + }, + { + "epoch": 0.43301992581730353, + "grad_norm": 0.87890625, + "learning_rate": 1.784081035974365e-05, + "loss": 1.4285, + "step": 2510 + }, + { + "epoch": 0.43319244371603555, + "grad_norm": 0.65234375, + "learning_rate": 1.7839120712124297e-05, + "loss": 1.3663, + "step": 2511 + }, + { + "epoch": 0.4333649616147675, + "grad_norm": 0.765625, + "learning_rate": 1.7837430483733908e-05, + "loss": 1.5058, + "step": 2512 + }, + { + "epoch": 0.4335374795134995, + "grad_norm": 0.74609375, + "learning_rate": 1.783573967469769e-05, + "loss": 1.555, + "step": 2513 + }, + { + "epoch": 0.43370999741223154, + "grad_norm": 0.66796875, + "learning_rate": 1.7834048285140923e-05, + "loss": 1.5479, + "step": 2514 + }, + { + "epoch": 0.4338825153109635, + "grad_norm": 0.7109375, + "learning_rate": 1.7832356315188907e-05, + "loss": 1.5398, + "step": 2515 + }, + { + "epoch": 0.4340550332096955, + "grad_norm": 0.70703125, + "learning_rate": 1.7830663764966995e-05, + "loss": 1.4866, + "step": 2516 + }, + { + "epoch": 0.4342275511084275, + "grad_norm": 0.65625, + "learning_rate": 1.7828970634600584e-05, + "loss": 1.4726, + "step": 2517 + }, + { + "epoch": 0.4344000690071595, + "grad_norm": 0.62109375, + "learning_rate": 1.7827276924215113e-05, + "loss": 1.43, + "step": 2518 + }, + { + "epoch": 0.4345725869058915, + "grad_norm": 0.69140625, + "learning_rate": 1.7825582633936058e-05, + "loss": 1.4749, + "step": 2519 + }, + { + "epoch": 0.4347451048046235, + "grad_norm": 0.8671875, + "learning_rate": 1.7823887763888944e-05, + "loss": 1.5154, + "step": 2520 + }, + { + "epoch": 0.4349176227033555, + "grad_norm": 0.65625, + "learning_rate": 1.782219231419934e-05, + "loss": 1.4378, + "step": 2521 + }, + { + "epoch": 0.43509014060208745, + "grad_norm": 0.8125, + "learning_rate": 1.7820496284992853e-05, + "loss": 1.5858, + "step": 2522 + }, + { + "epoch": 0.43526265850081947, + "grad_norm": 0.57421875, + "learning_rate": 1.781879967639514e-05, + "loss": 1.5369, + "step": 2523 + }, + { + "epoch": 0.4354351763995514, + "grad_norm": 0.78125, + "learning_rate": 1.781710248853189e-05, + "loss": 1.4153, + "step": 2524 + }, + { + "epoch": 0.43560769429828344, + "grad_norm": 0.68359375, + "learning_rate": 1.7815404721528848e-05, + "loss": 1.4206, + "step": 2525 + }, + { + "epoch": 0.43578021219701546, + "grad_norm": 0.6796875, + "learning_rate": 1.7813706375511784e-05, + "loss": 1.5314, + "step": 2526 + }, + { + "epoch": 0.4359527300957474, + "grad_norm": 0.58984375, + "learning_rate": 1.7812007450606536e-05, + "loss": 1.4995, + "step": 2527 + }, + { + "epoch": 0.43612524799447944, + "grad_norm": 0.640625, + "learning_rate": 1.781030794693896e-05, + "loss": 1.4864, + "step": 2528 + }, + { + "epoch": 0.4362977658932114, + "grad_norm": 0.69921875, + "learning_rate": 1.7808607864634976e-05, + "loss": 1.475, + "step": 2529 + }, + { + "epoch": 0.4364702837919434, + "grad_norm": 0.91796875, + "learning_rate": 1.7806907203820525e-05, + "loss": 1.4901, + "step": 2530 + }, + { + "epoch": 0.43664280169067543, + "grad_norm": 0.6328125, + "learning_rate": 1.780520596462161e-05, + "loss": 1.4808, + "step": 2531 + }, + { + "epoch": 0.4368153195894074, + "grad_norm": 0.80859375, + "learning_rate": 1.780350414716427e-05, + "loss": 1.5526, + "step": 2532 + }, + { + "epoch": 0.4369878374881394, + "grad_norm": 0.62890625, + "learning_rate": 1.7801801751574583e-05, + "loss": 1.4987, + "step": 2533 + }, + { + "epoch": 0.43716035538687137, + "grad_norm": 0.7734375, + "learning_rate": 1.780009877797867e-05, + "loss": 1.455, + "step": 2534 + }, + { + "epoch": 0.4373328732856034, + "grad_norm": 0.671875, + "learning_rate": 1.779839522650271e-05, + "loss": 1.5205, + "step": 2535 + }, + { + "epoch": 0.4375053911843354, + "grad_norm": 0.703125, + "learning_rate": 1.7796691097272902e-05, + "loss": 1.5363, + "step": 2536 + }, + { + "epoch": 0.43767790908306736, + "grad_norm": 0.78125, + "learning_rate": 1.77949863904155e-05, + "loss": 1.4779, + "step": 2537 + }, + { + "epoch": 0.4378504269817994, + "grad_norm": 0.734375, + "learning_rate": 1.77932811060568e-05, + "loss": 1.4271, + "step": 2538 + }, + { + "epoch": 0.43802294488053134, + "grad_norm": 0.82421875, + "learning_rate": 1.7791575244323143e-05, + "loss": 1.5411, + "step": 2539 + }, + { + "epoch": 0.43819546277926336, + "grad_norm": 0.625, + "learning_rate": 1.7789868805340908e-05, + "loss": 1.551, + "step": 2540 + }, + { + "epoch": 0.4383679806779953, + "grad_norm": 0.9140625, + "learning_rate": 1.7788161789236518e-05, + "loss": 1.4683, + "step": 2541 + }, + { + "epoch": 0.43854049857672733, + "grad_norm": 0.8046875, + "learning_rate": 1.778645419613644e-05, + "loss": 1.4342, + "step": 2542 + }, + { + "epoch": 0.43871301647545935, + "grad_norm": 1.078125, + "learning_rate": 1.7784746026167184e-05, + "loss": 1.4436, + "step": 2543 + }, + { + "epoch": 0.4388855343741913, + "grad_norm": 0.84375, + "learning_rate": 1.77830372794553e-05, + "loss": 1.4579, + "step": 2544 + }, + { + "epoch": 0.4390580522729233, + "grad_norm": 0.625, + "learning_rate": 1.7781327956127385e-05, + "loss": 1.5987, + "step": 2545 + }, + { + "epoch": 0.4392305701716553, + "grad_norm": 1.4375, + "learning_rate": 1.7779618056310074e-05, + "loss": 1.5104, + "step": 2546 + }, + { + "epoch": 0.4394030880703873, + "grad_norm": 0.59765625, + "learning_rate": 1.777790758013005e-05, + "loss": 1.5887, + "step": 2547 + }, + { + "epoch": 0.4395756059691193, + "grad_norm": 0.82421875, + "learning_rate": 1.777619652771403e-05, + "loss": 1.5033, + "step": 2548 + }, + { + "epoch": 0.4397481238678513, + "grad_norm": 0.72265625, + "learning_rate": 1.7774484899188788e-05, + "loss": 1.5848, + "step": 2549 + }, + { + "epoch": 0.4399206417665833, + "grad_norm": 0.75390625, + "learning_rate": 1.7772772694681123e-05, + "loss": 1.5275, + "step": 2550 + }, + { + "epoch": 0.44009315966531526, + "grad_norm": 0.8046875, + "learning_rate": 1.777105991431789e-05, + "loss": 1.4234, + "step": 2551 + }, + { + "epoch": 0.4402656775640473, + "grad_norm": 0.78515625, + "learning_rate": 1.7769346558225987e-05, + "loss": 1.5312, + "step": 2552 + }, + { + "epoch": 0.44043819546277924, + "grad_norm": 0.6171875, + "learning_rate": 1.7767632626532344e-05, + "loss": 1.5402, + "step": 2553 + }, + { + "epoch": 0.44061071336151125, + "grad_norm": 0.734375, + "learning_rate": 1.7765918119363942e-05, + "loss": 1.4823, + "step": 2554 + }, + { + "epoch": 0.44078323126024327, + "grad_norm": 1.0234375, + "learning_rate": 1.77642030368478e-05, + "loss": 1.5659, + "step": 2555 + }, + { + "epoch": 0.44095574915897523, + "grad_norm": 0.80859375, + "learning_rate": 1.7762487379110984e-05, + "loss": 1.518, + "step": 2556 + }, + { + "epoch": 0.44112826705770725, + "grad_norm": 0.90234375, + "learning_rate": 1.7760771146280603e-05, + "loss": 1.4378, + "step": 2557 + }, + { + "epoch": 0.4413007849564392, + "grad_norm": 0.70703125, + "learning_rate": 1.7759054338483803e-05, + "loss": 1.4941, + "step": 2558 + }, + { + "epoch": 0.4414733028551712, + "grad_norm": 0.7265625, + "learning_rate": 1.7757336955847775e-05, + "loss": 1.5745, + "step": 2559 + }, + { + "epoch": 0.44164582075390324, + "grad_norm": 0.63671875, + "learning_rate": 1.7755618998499757e-05, + "loss": 1.5342, + "step": 2560 + }, + { + "epoch": 0.4418183386526352, + "grad_norm": 0.87109375, + "learning_rate": 1.7753900466567024e-05, + "loss": 1.5415, + "step": 2561 + }, + { + "epoch": 0.4419908565513672, + "grad_norm": 0.69921875, + "learning_rate": 1.7752181360176895e-05, + "loss": 1.4617, + "step": 2562 + }, + { + "epoch": 0.4421633744500992, + "grad_norm": 0.62890625, + "learning_rate": 1.7750461679456737e-05, + "loss": 1.5255, + "step": 2563 + }, + { + "epoch": 0.4423358923488312, + "grad_norm": 0.7421875, + "learning_rate": 1.7748741424533947e-05, + "loss": 1.454, + "step": 2564 + }, + { + "epoch": 0.4425084102475632, + "grad_norm": 0.8203125, + "learning_rate": 1.7747020595535976e-05, + "loss": 1.5193, + "step": 2565 + }, + { + "epoch": 0.44268092814629517, + "grad_norm": 0.7109375, + "learning_rate": 1.7745299192590317e-05, + "loss": 1.4076, + "step": 2566 + }, + { + "epoch": 0.4428534460450272, + "grad_norm": 0.83203125, + "learning_rate": 1.7743577215824494e-05, + "loss": 1.4981, + "step": 2567 + }, + { + "epoch": 0.44302596394375915, + "grad_norm": 0.68359375, + "learning_rate": 1.774185466536609e-05, + "loss": 1.5218, + "step": 2568 + }, + { + "epoch": 0.44319848184249117, + "grad_norm": 0.81640625, + "learning_rate": 1.774013154134272e-05, + "loss": 1.5258, + "step": 2569 + }, + { + "epoch": 0.4433709997412231, + "grad_norm": 1.03125, + "learning_rate": 1.7738407843882037e-05, + "loss": 1.5437, + "step": 2570 + }, + { + "epoch": 0.44354351763995514, + "grad_norm": 0.6484375, + "learning_rate": 1.773668357311175e-05, + "loss": 1.45, + "step": 2571 + }, + { + "epoch": 0.44371603553868716, + "grad_norm": 0.859375, + "learning_rate": 1.773495872915961e-05, + "loss": 1.4216, + "step": 2572 + }, + { + "epoch": 0.4438885534374191, + "grad_norm": 0.84765625, + "learning_rate": 1.773323331215339e-05, + "loss": 1.342, + "step": 2573 + }, + { + "epoch": 0.44406107133615114, + "grad_norm": 0.7265625, + "learning_rate": 1.7731507322220932e-05, + "loss": 1.5443, + "step": 2574 + }, + { + "epoch": 0.4442335892348831, + "grad_norm": 0.62890625, + "learning_rate": 1.77297807594901e-05, + "loss": 1.5211, + "step": 2575 + }, + { + "epoch": 0.4444061071336151, + "grad_norm": 0.84765625, + "learning_rate": 1.7728053624088812e-05, + "loss": 1.4566, + "step": 2576 + }, + { + "epoch": 0.44457862503234713, + "grad_norm": 0.97265625, + "learning_rate": 1.7726325916145027e-05, + "loss": 1.4819, + "step": 2577 + }, + { + "epoch": 0.4447511429310791, + "grad_norm": 0.6015625, + "learning_rate": 1.7724597635786737e-05, + "loss": 1.4937, + "step": 2578 + }, + { + "epoch": 0.4449236608298111, + "grad_norm": 0.83984375, + "learning_rate": 1.7722868783141992e-05, + "loss": 1.5356, + "step": 2579 + }, + { + "epoch": 0.44509617872854307, + "grad_norm": 0.78515625, + "learning_rate": 1.772113935833887e-05, + "loss": 1.4269, + "step": 2580 + }, + { + "epoch": 0.4452686966272751, + "grad_norm": 0.69140625, + "learning_rate": 1.7719409361505503e-05, + "loss": 1.5314, + "step": 2581 + }, + { + "epoch": 0.4454412145260071, + "grad_norm": 0.71875, + "learning_rate": 1.7717678792770056e-05, + "loss": 1.4201, + "step": 2582 + }, + { + "epoch": 0.44561373242473906, + "grad_norm": 0.8046875, + "learning_rate": 1.771594765226074e-05, + "loss": 1.4507, + "step": 2583 + }, + { + "epoch": 0.4457862503234711, + "grad_norm": 0.79296875, + "learning_rate": 1.7714215940105813e-05, + "loss": 1.4426, + "step": 2584 + }, + { + "epoch": 0.44595876822220304, + "grad_norm": 0.69140625, + "learning_rate": 1.771248365643357e-05, + "loss": 1.4883, + "step": 2585 + }, + { + "epoch": 0.44613128612093506, + "grad_norm": 0.7421875, + "learning_rate": 1.7710750801372345e-05, + "loss": 1.4473, + "step": 2586 + }, + { + "epoch": 0.446303804019667, + "grad_norm": 0.63671875, + "learning_rate": 1.7709017375050525e-05, + "loss": 1.4357, + "step": 2587 + }, + { + "epoch": 0.44647632191839903, + "grad_norm": 0.65625, + "learning_rate": 1.7707283377596526e-05, + "loss": 1.372, + "step": 2588 + }, + { + "epoch": 0.44664883981713105, + "grad_norm": 0.66015625, + "learning_rate": 1.770554880913882e-05, + "loss": 1.5774, + "step": 2589 + }, + { + "epoch": 0.446821357715863, + "grad_norm": 0.6875, + "learning_rate": 1.770381366980591e-05, + "loss": 1.4317, + "step": 2590 + }, + { + "epoch": 0.446993875614595, + "grad_norm": 0.6796875, + "learning_rate": 1.7702077959726346e-05, + "loss": 1.5231, + "step": 2591 + }, + { + "epoch": 0.447166393513327, + "grad_norm": 0.7109375, + "learning_rate": 1.7700341679028725e-05, + "loss": 1.4977, + "step": 2592 + }, + { + "epoch": 0.447338911412059, + "grad_norm": 0.79296875, + "learning_rate": 1.769860482784168e-05, + "loss": 1.5746, + "step": 2593 + }, + { + "epoch": 0.447511429310791, + "grad_norm": 0.77734375, + "learning_rate": 1.769686740629388e-05, + "loss": 1.474, + "step": 2594 + }, + { + "epoch": 0.447683947209523, + "grad_norm": 0.55859375, + "learning_rate": 1.7695129414514057e-05, + "loss": 1.4457, + "step": 2595 + }, + { + "epoch": 0.447856465108255, + "grad_norm": 0.81640625, + "learning_rate": 1.769339085263096e-05, + "loss": 1.5435, + "step": 2596 + }, + { + "epoch": 0.44802898300698696, + "grad_norm": 0.67578125, + "learning_rate": 1.76916517207734e-05, + "loss": 1.4355, + "step": 2597 + }, + { + "epoch": 0.448201500905719, + "grad_norm": 0.6796875, + "learning_rate": 1.7689912019070223e-05, + "loss": 1.4525, + "step": 2598 + }, + { + "epoch": 0.44837401880445094, + "grad_norm": 0.59375, + "learning_rate": 1.7688171747650313e-05, + "loss": 1.4333, + "step": 2599 + }, + { + "epoch": 0.44854653670318295, + "grad_norm": 0.671875, + "learning_rate": 1.7686430906642602e-05, + "loss": 1.5239, + "step": 2600 + }, + { + "epoch": 0.44854653670318295, + "eval_loss": 1.4545358419418335, + "eval_runtime": 11.0064, + "eval_samples_per_second": 93.037, + "eval_steps_per_second": 23.259, + "step": 2600 + }, + { + "epoch": 0.44871905460191497, + "grad_norm": 0.6171875, + "learning_rate": 1.7684689496176065e-05, + "loss": 1.4694, + "step": 2601 + }, + { + "epoch": 0.44889157250064693, + "grad_norm": 0.6171875, + "learning_rate": 1.7682947516379706e-05, + "loss": 1.4667, + "step": 2602 + }, + { + "epoch": 0.44906409039937895, + "grad_norm": 0.65625, + "learning_rate": 1.7681204967382597e-05, + "loss": 1.5195, + "step": 2603 + }, + { + "epoch": 0.4492366082981109, + "grad_norm": 0.86328125, + "learning_rate": 1.767946184931383e-05, + "loss": 1.4439, + "step": 2604 + }, + { + "epoch": 0.4494091261968429, + "grad_norm": 0.625, + "learning_rate": 1.7677718162302546e-05, + "loss": 1.5034, + "step": 2605 + }, + { + "epoch": 0.44958164409557494, + "grad_norm": 0.71484375, + "learning_rate": 1.7675973906477924e-05, + "loss": 1.4178, + "step": 2606 + }, + { + "epoch": 0.4497541619943069, + "grad_norm": 0.625, + "learning_rate": 1.7674229081969195e-05, + "loss": 1.3571, + "step": 2607 + }, + { + "epoch": 0.4499266798930389, + "grad_norm": 0.59375, + "learning_rate": 1.7672483688905622e-05, + "loss": 1.3652, + "step": 2608 + }, + { + "epoch": 0.4500991977917709, + "grad_norm": 0.703125, + "learning_rate": 1.767073772741652e-05, + "loss": 1.4889, + "step": 2609 + }, + { + "epoch": 0.4502717156905029, + "grad_norm": 0.6171875, + "learning_rate": 1.766899119763124e-05, + "loss": 1.5149, + "step": 2610 + }, + { + "epoch": 0.4504442335892349, + "grad_norm": 0.6875, + "learning_rate": 1.7667244099679172e-05, + "loss": 1.5902, + "step": 2611 + }, + { + "epoch": 0.45061675148796687, + "grad_norm": 0.68359375, + "learning_rate": 1.7665496433689754e-05, + "loss": 1.458, + "step": 2612 + }, + { + "epoch": 0.4507892693866989, + "grad_norm": 0.79296875, + "learning_rate": 1.7663748199792463e-05, + "loss": 1.4436, + "step": 2613 + }, + { + "epoch": 0.45096178728543085, + "grad_norm": 0.69140625, + "learning_rate": 1.7661999398116824e-05, + "loss": 1.3657, + "step": 2614 + }, + { + "epoch": 0.45113430518416286, + "grad_norm": 0.6875, + "learning_rate": 1.7660250028792392e-05, + "loss": 1.5338, + "step": 2615 + }, + { + "epoch": 0.4513068230828948, + "grad_norm": 0.625, + "learning_rate": 1.7658500091948774e-05, + "loss": 1.4237, + "step": 2616 + }, + { + "epoch": 0.45147934098162684, + "grad_norm": 0.65625, + "learning_rate": 1.7656749587715617e-05, + "loss": 1.4443, + "step": 2617 + }, + { + "epoch": 0.45165185888035886, + "grad_norm": 0.78515625, + "learning_rate": 1.765499851622261e-05, + "loss": 1.5702, + "step": 2618 + }, + { + "epoch": 0.4518243767790908, + "grad_norm": 0.72265625, + "learning_rate": 1.765324687759948e-05, + "loss": 1.4471, + "step": 2619 + }, + { + "epoch": 0.45199689467782284, + "grad_norm": 0.5859375, + "learning_rate": 1.7651494671976003e-05, + "loss": 1.5225, + "step": 2620 + }, + { + "epoch": 0.4521694125765548, + "grad_norm": 0.62890625, + "learning_rate": 1.7649741899481997e-05, + "loss": 1.4638, + "step": 2621 + }, + { + "epoch": 0.4523419304752868, + "grad_norm": 0.6875, + "learning_rate": 1.7647988560247305e-05, + "loss": 1.4911, + "step": 2622 + }, + { + "epoch": 0.45251444837401883, + "grad_norm": 0.79296875, + "learning_rate": 1.764623465440184e-05, + "loss": 1.396, + "step": 2623 + }, + { + "epoch": 0.4526869662727508, + "grad_norm": 0.609375, + "learning_rate": 1.764448018207553e-05, + "loss": 1.4932, + "step": 2624 + }, + { + "epoch": 0.4528594841714828, + "grad_norm": 0.77734375, + "learning_rate": 1.764272514339837e-05, + "loss": 1.4553, + "step": 2625 + }, + { + "epoch": 0.45303200207021477, + "grad_norm": 0.7265625, + "learning_rate": 1.764096953850037e-05, + "loss": 1.5165, + "step": 2626 + }, + { + "epoch": 0.4532045199689468, + "grad_norm": 0.63671875, + "learning_rate": 1.7639213367511608e-05, + "loss": 1.5094, + "step": 2627 + }, + { + "epoch": 0.45337703786767874, + "grad_norm": 0.6953125, + "learning_rate": 1.763745663056219e-05, + "loss": 1.4608, + "step": 2628 + }, + { + "epoch": 0.45354955576641076, + "grad_norm": 0.8984375, + "learning_rate": 1.7635699327782257e-05, + "loss": 1.4027, + "step": 2629 + }, + { + "epoch": 0.4537220736651428, + "grad_norm": 0.73046875, + "learning_rate": 1.7633941459302013e-05, + "loss": 1.4297, + "step": 2630 + }, + { + "epoch": 0.45389459156387474, + "grad_norm": 0.80078125, + "learning_rate": 1.763218302525169e-05, + "loss": 1.3769, + "step": 2631 + }, + { + "epoch": 0.45406710946260675, + "grad_norm": 0.71484375, + "learning_rate": 1.7630424025761554e-05, + "loss": 1.585, + "step": 2632 + }, + { + "epoch": 0.4542396273613387, + "grad_norm": 0.59765625, + "learning_rate": 1.7628664460961928e-05, + "loss": 1.4986, + "step": 2633 + }, + { + "epoch": 0.45441214526007073, + "grad_norm": 0.91796875, + "learning_rate": 1.7626904330983176e-05, + "loss": 1.5802, + "step": 2634 + }, + { + "epoch": 0.45458466315880275, + "grad_norm": 0.64453125, + "learning_rate": 1.7625143635955697e-05, + "loss": 1.3242, + "step": 2635 + }, + { + "epoch": 0.4547571810575347, + "grad_norm": 0.5859375, + "learning_rate": 1.7623382376009928e-05, + "loss": 1.5192, + "step": 2636 + }, + { + "epoch": 0.4549296989562667, + "grad_norm": 0.77734375, + "learning_rate": 1.7621620551276366e-05, + "loss": 1.4385, + "step": 2637 + }, + { + "epoch": 0.4551022168549987, + "grad_norm": 0.77734375, + "learning_rate": 1.761985816188553e-05, + "loss": 1.4715, + "step": 2638 + }, + { + "epoch": 0.4552747347537307, + "grad_norm": 0.6171875, + "learning_rate": 1.7618095207967988e-05, + "loss": 1.5298, + "step": 2639 + }, + { + "epoch": 0.4554472526524627, + "grad_norm": 0.84375, + "learning_rate": 1.7616331689654352e-05, + "loss": 1.4869, + "step": 2640 + }, + { + "epoch": 0.4556197705511947, + "grad_norm": 0.6484375, + "learning_rate": 1.7614567607075278e-05, + "loss": 1.5208, + "step": 2641 + }, + { + "epoch": 0.4557922884499267, + "grad_norm": 0.828125, + "learning_rate": 1.761280296036146e-05, + "loss": 1.4899, + "step": 2642 + }, + { + "epoch": 0.45596480634865866, + "grad_norm": 0.8828125, + "learning_rate": 1.761103774964363e-05, + "loss": 1.3997, + "step": 2643 + }, + { + "epoch": 0.4561373242473907, + "grad_norm": 0.58984375, + "learning_rate": 1.7609271975052563e-05, + "loss": 1.4327, + "step": 2644 + }, + { + "epoch": 0.45630984214612264, + "grad_norm": 0.78515625, + "learning_rate": 1.7607505636719085e-05, + "loss": 1.4269, + "step": 2645 + }, + { + "epoch": 0.45648236004485465, + "grad_norm": 0.7265625, + "learning_rate": 1.7605738734774062e-05, + "loss": 1.4966, + "step": 2646 + }, + { + "epoch": 0.45665487794358667, + "grad_norm": 0.7265625, + "learning_rate": 1.7603971269348383e-05, + "loss": 1.4565, + "step": 2647 + }, + { + "epoch": 0.45682739584231863, + "grad_norm": 0.796875, + "learning_rate": 1.7602203240573004e-05, + "loss": 1.3994, + "step": 2648 + }, + { + "epoch": 0.45699991374105065, + "grad_norm": 0.80078125, + "learning_rate": 1.7600434648578906e-05, + "loss": 1.4751, + "step": 2649 + }, + { + "epoch": 0.4571724316397826, + "grad_norm": 0.96875, + "learning_rate": 1.7598665493497122e-05, + "loss": 1.5806, + "step": 2650 + }, + { + "epoch": 0.4573449495385146, + "grad_norm": 0.6328125, + "learning_rate": 1.759689577545872e-05, + "loss": 1.5674, + "step": 2651 + }, + { + "epoch": 0.45751746743724664, + "grad_norm": 0.7734375, + "learning_rate": 1.759512549459481e-05, + "loss": 1.478, + "step": 2652 + }, + { + "epoch": 0.4576899853359786, + "grad_norm": 0.828125, + "learning_rate": 1.7593354651036544e-05, + "loss": 1.4421, + "step": 2653 + }, + { + "epoch": 0.4578625032347106, + "grad_norm": 0.765625, + "learning_rate": 1.759158324491512e-05, + "loss": 1.5771, + "step": 2654 + }, + { + "epoch": 0.4580350211334426, + "grad_norm": 0.828125, + "learning_rate": 1.758981127636178e-05, + "loss": 1.4393, + "step": 2655 + }, + { + "epoch": 0.4582075390321746, + "grad_norm": 0.921875, + "learning_rate": 1.7588038745507797e-05, + "loss": 1.5219, + "step": 2656 + }, + { + "epoch": 0.45838005693090655, + "grad_norm": 0.7734375, + "learning_rate": 1.7586265652484488e-05, + "loss": 1.4692, + "step": 2657 + }, + { + "epoch": 0.45855257482963857, + "grad_norm": 0.86328125, + "learning_rate": 1.758449199742322e-05, + "loss": 1.4827, + "step": 2658 + }, + { + "epoch": 0.4587250927283706, + "grad_norm": 1.2734375, + "learning_rate": 1.7582717780455395e-05, + "loss": 1.509, + "step": 2659 + }, + { + "epoch": 0.45889761062710255, + "grad_norm": 1.3359375, + "learning_rate": 1.7580943001712457e-05, + "loss": 1.61, + "step": 2660 + }, + { + "epoch": 0.45907012852583456, + "grad_norm": 0.67578125, + "learning_rate": 1.7579167661325892e-05, + "loss": 1.5574, + "step": 2661 + }, + { + "epoch": 0.4592426464245665, + "grad_norm": 0.7890625, + "learning_rate": 1.757739175942723e-05, + "loss": 1.4575, + "step": 2662 + }, + { + "epoch": 0.45941516432329854, + "grad_norm": 0.66796875, + "learning_rate": 1.7575615296148044e-05, + "loss": 1.429, + "step": 2663 + }, + { + "epoch": 0.45958768222203056, + "grad_norm": 0.6328125, + "learning_rate": 1.757383827161994e-05, + "loss": 1.3583, + "step": 2664 + }, + { + "epoch": 0.4597602001207625, + "grad_norm": 0.69140625, + "learning_rate": 1.7572060685974577e-05, + "loss": 1.4718, + "step": 2665 + }, + { + "epoch": 0.45993271801949454, + "grad_norm": 0.75, + "learning_rate": 1.7570282539343643e-05, + "loss": 1.3904, + "step": 2666 + }, + { + "epoch": 0.4601052359182265, + "grad_norm": 0.81640625, + "learning_rate": 1.7568503831858875e-05, + "loss": 1.5472, + "step": 2667 + }, + { + "epoch": 0.4602777538169585, + "grad_norm": 0.56640625, + "learning_rate": 1.7566724563652052e-05, + "loss": 1.4417, + "step": 2668 + }, + { + "epoch": 0.46045027171569053, + "grad_norm": 0.671875, + "learning_rate": 1.7564944734855e-05, + "loss": 1.5233, + "step": 2669 + }, + { + "epoch": 0.4606227896144225, + "grad_norm": 0.69921875, + "learning_rate": 1.756316434559957e-05, + "loss": 1.4207, + "step": 2670 + }, + { + "epoch": 0.4607953075131545, + "grad_norm": 0.62109375, + "learning_rate": 1.7561383396017672e-05, + "loss": 1.5302, + "step": 2671 + }, + { + "epoch": 0.46096782541188647, + "grad_norm": 0.60546875, + "learning_rate": 1.7559601886241245e-05, + "loss": 1.4491, + "step": 2672 + }, + { + "epoch": 0.4611403433106185, + "grad_norm": 0.65234375, + "learning_rate": 1.7557819816402273e-05, + "loss": 1.5596, + "step": 2673 + }, + { + "epoch": 0.46131286120935044, + "grad_norm": 0.6484375, + "learning_rate": 1.7556037186632787e-05, + "loss": 1.4611, + "step": 2674 + }, + { + "epoch": 0.46148537910808246, + "grad_norm": 0.6796875, + "learning_rate": 1.7554253997064854e-05, + "loss": 1.4785, + "step": 2675 + }, + { + "epoch": 0.4616578970068145, + "grad_norm": 0.64453125, + "learning_rate": 1.755247024783058e-05, + "loss": 1.4997, + "step": 2676 + }, + { + "epoch": 0.46183041490554644, + "grad_norm": 0.62109375, + "learning_rate": 1.7550685939062125e-05, + "loss": 1.4593, + "step": 2677 + }, + { + "epoch": 0.46200293280427845, + "grad_norm": 0.6171875, + "learning_rate": 1.754890107089168e-05, + "loss": 1.5093, + "step": 2678 + }, + { + "epoch": 0.4621754507030104, + "grad_norm": 0.65625, + "learning_rate": 1.754711564345147e-05, + "loss": 1.436, + "step": 2679 + }, + { + "epoch": 0.46234796860174243, + "grad_norm": 0.58984375, + "learning_rate": 1.754532965687378e-05, + "loss": 1.4643, + "step": 2680 + }, + { + "epoch": 0.46252048650047445, + "grad_norm": 0.625, + "learning_rate": 1.754354311129092e-05, + "loss": 1.4679, + "step": 2681 + }, + { + "epoch": 0.4626930043992064, + "grad_norm": 0.78515625, + "learning_rate": 1.7541756006835253e-05, + "loss": 1.5252, + "step": 2682 + }, + { + "epoch": 0.4628655222979384, + "grad_norm": 0.7109375, + "learning_rate": 1.753996834363918e-05, + "loss": 1.5314, + "step": 2683 + }, + { + "epoch": 0.4630380401966704, + "grad_norm": 0.66796875, + "learning_rate": 1.753818012183514e-05, + "loss": 1.4043, + "step": 2684 + }, + { + "epoch": 0.4632105580954024, + "grad_norm": 0.73046875, + "learning_rate": 1.7536391341555613e-05, + "loss": 1.5028, + "step": 2685 + }, + { + "epoch": 0.4633830759941344, + "grad_norm": 0.5859375, + "learning_rate": 1.7534602002933128e-05, + "loss": 1.4713, + "step": 2686 + }, + { + "epoch": 0.4635555938928664, + "grad_norm": 0.62890625, + "learning_rate": 1.7532812106100247e-05, + "loss": 1.4458, + "step": 2687 + }, + { + "epoch": 0.4637281117915984, + "grad_norm": 0.61328125, + "learning_rate": 1.7531021651189578e-05, + "loss": 1.4422, + "step": 2688 + }, + { + "epoch": 0.46390062969033036, + "grad_norm": 0.7578125, + "learning_rate": 1.752923063833377e-05, + "loss": 1.4305, + "step": 2689 + }, + { + "epoch": 0.4640731475890624, + "grad_norm": 0.64453125, + "learning_rate": 1.7527439067665516e-05, + "loss": 1.4937, + "step": 2690 + }, + { + "epoch": 0.46424566548779433, + "grad_norm": 0.62109375, + "learning_rate": 1.7525646939317535e-05, + "loss": 1.4594, + "step": 2691 + }, + { + "epoch": 0.46441818338652635, + "grad_norm": 0.66015625, + "learning_rate": 1.752385425342261e-05, + "loss": 1.4699, + "step": 2692 + }, + { + "epoch": 0.46459070128525837, + "grad_norm": 0.625, + "learning_rate": 1.752206101011355e-05, + "loss": 1.5202, + "step": 2693 + }, + { + "epoch": 0.46476321918399033, + "grad_norm": 0.8515625, + "learning_rate": 1.752026720952321e-05, + "loss": 1.5505, + "step": 2694 + }, + { + "epoch": 0.46493573708272234, + "grad_norm": 0.62109375, + "learning_rate": 1.7518472851784485e-05, + "loss": 1.4083, + "step": 2695 + }, + { + "epoch": 0.4651082549814543, + "grad_norm": 0.7578125, + "learning_rate": 1.7516677937030318e-05, + "loss": 1.5286, + "step": 2696 + }, + { + "epoch": 0.4652807728801863, + "grad_norm": 0.83203125, + "learning_rate": 1.751488246539368e-05, + "loss": 1.5077, + "step": 2697 + }, + { + "epoch": 0.46545329077891834, + "grad_norm": 0.6640625, + "learning_rate": 1.7513086437007593e-05, + "loss": 1.5907, + "step": 2698 + }, + { + "epoch": 0.4656258086776503, + "grad_norm": 0.66015625, + "learning_rate": 1.751128985200512e-05, + "loss": 1.498, + "step": 2699 + }, + { + "epoch": 0.4657983265763823, + "grad_norm": 0.78515625, + "learning_rate": 1.750949271051936e-05, + "loss": 1.5613, + "step": 2700 + }, + { + "epoch": 0.4657983265763823, + "eval_loss": 1.451514720916748, + "eval_runtime": 11.0249, + "eval_samples_per_second": 92.88, + "eval_steps_per_second": 23.22, + "step": 2700 + }, + { + "epoch": 0.4659708444751143, + "grad_norm": 0.6640625, + "learning_rate": 1.7507695012683463e-05, + "loss": 1.4832, + "step": 2701 + }, + { + "epoch": 0.4661433623738463, + "grad_norm": 0.73046875, + "learning_rate": 1.7505896758630606e-05, + "loss": 1.508, + "step": 2702 + }, + { + "epoch": 0.46631588027257825, + "grad_norm": 0.76953125, + "learning_rate": 1.750409794849402e-05, + "loss": 1.3877, + "step": 2703 + }, + { + "epoch": 0.46648839817131027, + "grad_norm": 0.6328125, + "learning_rate": 1.7502298582406967e-05, + "loss": 1.4419, + "step": 2704 + }, + { + "epoch": 0.4666609160700423, + "grad_norm": 0.84375, + "learning_rate": 1.7500498660502757e-05, + "loss": 1.4838, + "step": 2705 + }, + { + "epoch": 0.46683343396877425, + "grad_norm": 0.8203125, + "learning_rate": 1.7498698182914746e-05, + "loss": 1.5293, + "step": 2706 + }, + { + "epoch": 0.46700595186750626, + "grad_norm": 0.6875, + "learning_rate": 1.749689714977632e-05, + "loss": 1.5346, + "step": 2707 + }, + { + "epoch": 0.4671784697662382, + "grad_norm": 0.7578125, + "learning_rate": 1.7495095561220908e-05, + "loss": 1.4503, + "step": 2708 + }, + { + "epoch": 0.46735098766497024, + "grad_norm": 0.828125, + "learning_rate": 1.7493293417381985e-05, + "loss": 1.4146, + "step": 2709 + }, + { + "epoch": 0.46752350556370226, + "grad_norm": 3.15625, + "learning_rate": 1.7491490718393067e-05, + "loss": 1.554, + "step": 2710 + }, + { + "epoch": 0.4676960234624342, + "grad_norm": 0.8984375, + "learning_rate": 1.7489687464387705e-05, + "loss": 1.5255, + "step": 2711 + }, + { + "epoch": 0.46786854136116623, + "grad_norm": 0.89453125, + "learning_rate": 1.74878836554995e-05, + "loss": 1.4677, + "step": 2712 + }, + { + "epoch": 0.4680410592598982, + "grad_norm": 0.73046875, + "learning_rate": 1.748607929186209e-05, + "loss": 1.5083, + "step": 2713 + }, + { + "epoch": 0.4682135771586302, + "grad_norm": 0.7578125, + "learning_rate": 1.7484274373609143e-05, + "loss": 1.5612, + "step": 2714 + }, + { + "epoch": 0.46838609505736223, + "grad_norm": 0.82421875, + "learning_rate": 1.7482468900874393e-05, + "loss": 1.4519, + "step": 2715 + }, + { + "epoch": 0.4685586129560942, + "grad_norm": 0.5859375, + "learning_rate": 1.7480662873791592e-05, + "loss": 1.4596, + "step": 2716 + }, + { + "epoch": 0.4687311308548262, + "grad_norm": 1.0, + "learning_rate": 1.7478856292494543e-05, + "loss": 1.5029, + "step": 2717 + }, + { + "epoch": 0.46890364875355817, + "grad_norm": 0.8515625, + "learning_rate": 1.7477049157117093e-05, + "loss": 1.4348, + "step": 2718 + }, + { + "epoch": 0.4690761666522902, + "grad_norm": 0.6796875, + "learning_rate": 1.747524146779312e-05, + "loss": 1.531, + "step": 2719 + }, + { + "epoch": 0.46924868455102214, + "grad_norm": 0.87890625, + "learning_rate": 1.7473433224656554e-05, + "loss": 1.4822, + "step": 2720 + }, + { + "epoch": 0.46942120244975416, + "grad_norm": 0.71875, + "learning_rate": 1.7471624427841356e-05, + "loss": 1.3717, + "step": 2721 + }, + { + "epoch": 0.4695937203484862, + "grad_norm": 0.72265625, + "learning_rate": 1.7469815077481537e-05, + "loss": 1.462, + "step": 2722 + }, + { + "epoch": 0.46976623824721814, + "grad_norm": 0.703125, + "learning_rate": 1.746800517371114e-05, + "loss": 1.4464, + "step": 2723 + }, + { + "epoch": 0.46993875614595015, + "grad_norm": 0.625, + "learning_rate": 1.7466194716664262e-05, + "loss": 1.4288, + "step": 2724 + }, + { + "epoch": 0.4701112740446821, + "grad_norm": 0.7734375, + "learning_rate": 1.7464383706475028e-05, + "loss": 1.4812, + "step": 2725 + }, + { + "epoch": 0.47028379194341413, + "grad_norm": 0.5546875, + "learning_rate": 1.7462572143277606e-05, + "loss": 1.388, + "step": 2726 + }, + { + "epoch": 0.47045630984214615, + "grad_norm": 0.62109375, + "learning_rate": 1.7460760027206215e-05, + "loss": 1.4307, + "step": 2727 + }, + { + "epoch": 0.4706288277408781, + "grad_norm": 0.75, + "learning_rate": 1.7458947358395102e-05, + "loss": 1.521, + "step": 2728 + }, + { + "epoch": 0.4708013456396101, + "grad_norm": 0.75, + "learning_rate": 1.7457134136978566e-05, + "loss": 1.525, + "step": 2729 + }, + { + "epoch": 0.4709738635383421, + "grad_norm": 0.640625, + "learning_rate": 1.7455320363090936e-05, + "loss": 1.4898, + "step": 2730 + }, + { + "epoch": 0.4711463814370741, + "grad_norm": 0.94921875, + "learning_rate": 1.7453506036866592e-05, + "loss": 1.416, + "step": 2731 + }, + { + "epoch": 0.47131889933580606, + "grad_norm": 0.62109375, + "learning_rate": 1.745169115843995e-05, + "loss": 1.4478, + "step": 2732 + }, + { + "epoch": 0.4714914172345381, + "grad_norm": 0.6171875, + "learning_rate": 1.7449875727945463e-05, + "loss": 1.4715, + "step": 2733 + }, + { + "epoch": 0.4716639351332701, + "grad_norm": 0.69921875, + "learning_rate": 1.7448059745517635e-05, + "loss": 1.5544, + "step": 2734 + }, + { + "epoch": 0.47183645303200206, + "grad_norm": 0.74609375, + "learning_rate": 1.7446243211291003e-05, + "loss": 1.4665, + "step": 2735 + }, + { + "epoch": 0.4720089709307341, + "grad_norm": 0.58984375, + "learning_rate": 1.7444426125400148e-05, + "loss": 1.4048, + "step": 2736 + }, + { + "epoch": 0.47218148882946603, + "grad_norm": 0.984375, + "learning_rate": 1.7442608487979692e-05, + "loss": 1.4598, + "step": 2737 + }, + { + "epoch": 0.47235400672819805, + "grad_norm": 0.83984375, + "learning_rate": 1.7440790299164295e-05, + "loss": 1.5017, + "step": 2738 + }, + { + "epoch": 0.47252652462693007, + "grad_norm": 0.60546875, + "learning_rate": 1.7438971559088658e-05, + "loss": 1.4438, + "step": 2739 + }, + { + "epoch": 0.472699042525662, + "grad_norm": 0.734375, + "learning_rate": 1.743715226788753e-05, + "loss": 1.4198, + "step": 2740 + }, + { + "epoch": 0.47287156042439404, + "grad_norm": 0.625, + "learning_rate": 1.743533242569569e-05, + "loss": 1.4941, + "step": 2741 + }, + { + "epoch": 0.473044078323126, + "grad_norm": 0.6328125, + "learning_rate": 1.7433512032647968e-05, + "loss": 1.4281, + "step": 2742 + }, + { + "epoch": 0.473216596221858, + "grad_norm": 0.59765625, + "learning_rate": 1.7431691088879228e-05, + "loss": 1.4557, + "step": 2743 + }, + { + "epoch": 0.47338911412059004, + "grad_norm": 0.6328125, + "learning_rate": 1.7429869594524375e-05, + "loss": 1.4984, + "step": 2744 + }, + { + "epoch": 0.473561632019322, + "grad_norm": 0.58203125, + "learning_rate": 1.742804754971836e-05, + "loss": 1.537, + "step": 2745 + }, + { + "epoch": 0.473734149918054, + "grad_norm": 0.6015625, + "learning_rate": 1.742622495459617e-05, + "loss": 1.4307, + "step": 2746 + }, + { + "epoch": 0.473906667816786, + "grad_norm": 0.6328125, + "learning_rate": 1.7424401809292833e-05, + "loss": 1.5323, + "step": 2747 + }, + { + "epoch": 0.474079185715518, + "grad_norm": 0.8125, + "learning_rate": 1.742257811394342e-05, + "loss": 1.5329, + "step": 2748 + }, + { + "epoch": 0.47425170361424995, + "grad_norm": 1.1796875, + "learning_rate": 1.7420753868683044e-05, + "loss": 1.533, + "step": 2749 + }, + { + "epoch": 0.47442422151298197, + "grad_norm": 0.7109375, + "learning_rate": 1.7418929073646855e-05, + "loss": 1.4591, + "step": 2750 + }, + { + "epoch": 0.474596739411714, + "grad_norm": 0.6796875, + "learning_rate": 1.741710372897004e-05, + "loss": 1.4273, + "step": 2751 + }, + { + "epoch": 0.47476925731044595, + "grad_norm": 0.67578125, + "learning_rate": 1.741527783478784e-05, + "loss": 1.4346, + "step": 2752 + }, + { + "epoch": 0.47494177520917796, + "grad_norm": 0.7578125, + "learning_rate": 1.7413451391235524e-05, + "loss": 1.4941, + "step": 2753 + }, + { + "epoch": 0.4751142931079099, + "grad_norm": 0.72265625, + "learning_rate": 1.7411624398448408e-05, + "loss": 1.4746, + "step": 2754 + }, + { + "epoch": 0.47528681100664194, + "grad_norm": 0.8359375, + "learning_rate": 1.7409796856561847e-05, + "loss": 1.4099, + "step": 2755 + }, + { + "epoch": 0.47545932890537396, + "grad_norm": 0.7734375, + "learning_rate": 1.7407968765711234e-05, + "loss": 1.5676, + "step": 2756 + }, + { + "epoch": 0.4756318468041059, + "grad_norm": 0.6796875, + "learning_rate": 1.740614012603201e-05, + "loss": 1.4248, + "step": 2757 + }, + { + "epoch": 0.47580436470283793, + "grad_norm": 0.6015625, + "learning_rate": 1.740431093765965e-05, + "loss": 1.5019, + "step": 2758 + }, + { + "epoch": 0.4759768826015699, + "grad_norm": 0.71875, + "learning_rate": 1.740248120072967e-05, + "loss": 1.4743, + "step": 2759 + }, + { + "epoch": 0.4761494005003019, + "grad_norm": 0.73046875, + "learning_rate": 1.740065091537763e-05, + "loss": 1.3471, + "step": 2760 + }, + { + "epoch": 0.4763219183990339, + "grad_norm": 0.78125, + "learning_rate": 1.7398820081739128e-05, + "loss": 1.5532, + "step": 2761 + }, + { + "epoch": 0.4764944362977659, + "grad_norm": 0.62890625, + "learning_rate": 1.739698869994981e-05, + "loss": 1.4123, + "step": 2762 + }, + { + "epoch": 0.4766669541964979, + "grad_norm": 0.7734375, + "learning_rate": 1.7395156770145343e-05, + "loss": 1.5422, + "step": 2763 + }, + { + "epoch": 0.47683947209522987, + "grad_norm": 0.640625, + "learning_rate": 1.739332429246146e-05, + "loss": 1.5009, + "step": 2764 + }, + { + "epoch": 0.4770119899939619, + "grad_norm": 0.81640625, + "learning_rate": 1.7391491267033916e-05, + "loss": 1.3644, + "step": 2765 + }, + { + "epoch": 0.47718450789269384, + "grad_norm": 0.65625, + "learning_rate": 1.7389657693998515e-05, + "loss": 1.4097, + "step": 2766 + }, + { + "epoch": 0.47735702579142586, + "grad_norm": 0.60546875, + "learning_rate": 1.73878235734911e-05, + "loss": 1.4486, + "step": 2767 + }, + { + "epoch": 0.4775295436901579, + "grad_norm": 0.62109375, + "learning_rate": 1.7385988905647556e-05, + "loss": 1.3957, + "step": 2768 + }, + { + "epoch": 0.47770206158888984, + "grad_norm": 0.7734375, + "learning_rate": 1.73841536906038e-05, + "loss": 1.3941, + "step": 2769 + }, + { + "epoch": 0.47787457948762185, + "grad_norm": 0.71875, + "learning_rate": 1.7382317928495803e-05, + "loss": 1.5116, + "step": 2770 + }, + { + "epoch": 0.4780470973863538, + "grad_norm": 0.6953125, + "learning_rate": 1.7380481619459564e-05, + "loss": 1.4201, + "step": 2771 + }, + { + "epoch": 0.47821961528508583, + "grad_norm": 1.515625, + "learning_rate": 1.7378644763631133e-05, + "loss": 1.4355, + "step": 2772 + }, + { + "epoch": 0.47839213318381785, + "grad_norm": 0.6171875, + "learning_rate": 1.7376807361146594e-05, + "loss": 1.6016, + "step": 2773 + }, + { + "epoch": 0.4785646510825498, + "grad_norm": 0.66796875, + "learning_rate": 1.7374969412142072e-05, + "loss": 1.433, + "step": 2774 + }, + { + "epoch": 0.4787371689812818, + "grad_norm": 0.73046875, + "learning_rate": 1.7373130916753737e-05, + "loss": 1.4282, + "step": 2775 + }, + { + "epoch": 0.4789096868800138, + "grad_norm": 0.62890625, + "learning_rate": 1.737129187511779e-05, + "loss": 1.4499, + "step": 2776 + }, + { + "epoch": 0.4790822047787458, + "grad_norm": 0.7578125, + "learning_rate": 1.736945228737049e-05, + "loss": 1.5561, + "step": 2777 + }, + { + "epoch": 0.47925472267747776, + "grad_norm": 0.59765625, + "learning_rate": 1.7367612153648113e-05, + "loss": 1.3707, + "step": 2778 + }, + { + "epoch": 0.4794272405762098, + "grad_norm": 0.6796875, + "learning_rate": 1.7365771474086993e-05, + "loss": 1.5786, + "step": 2779 + }, + { + "epoch": 0.4795997584749418, + "grad_norm": 1.015625, + "learning_rate": 1.73639302488235e-05, + "loss": 1.5003, + "step": 2780 + }, + { + "epoch": 0.47977227637367376, + "grad_norm": 0.66015625, + "learning_rate": 1.7362088477994043e-05, + "loss": 1.5189, + "step": 2781 + }, + { + "epoch": 0.4799447942724058, + "grad_norm": 0.66796875, + "learning_rate": 1.736024616173507e-05, + "loss": 1.4467, + "step": 2782 + }, + { + "epoch": 0.48011731217113773, + "grad_norm": 0.6640625, + "learning_rate": 1.735840330018307e-05, + "loss": 1.5065, + "step": 2783 + }, + { + "epoch": 0.48028983006986975, + "grad_norm": 0.74609375, + "learning_rate": 1.7356559893474578e-05, + "loss": 1.5442, + "step": 2784 + }, + { + "epoch": 0.48046234796860177, + "grad_norm": 0.7265625, + "learning_rate": 1.735471594174616e-05, + "loss": 1.41, + "step": 2785 + }, + { + "epoch": 0.4806348658673337, + "grad_norm": 0.55078125, + "learning_rate": 1.735287144513444e-05, + "loss": 1.4113, + "step": 2786 + }, + { + "epoch": 0.48080738376606574, + "grad_norm": 0.6015625, + "learning_rate": 1.7351026403776054e-05, + "loss": 1.4412, + "step": 2787 + }, + { + "epoch": 0.4809799016647977, + "grad_norm": 0.66015625, + "learning_rate": 1.73491808178077e-05, + "loss": 1.4297, + "step": 2788 + }, + { + "epoch": 0.4811524195635297, + "grad_norm": 0.734375, + "learning_rate": 1.7347334687366114e-05, + "loss": 1.6249, + "step": 2789 + }, + { + "epoch": 0.48132493746226174, + "grad_norm": 0.60546875, + "learning_rate": 1.7345488012588064e-05, + "loss": 1.3788, + "step": 2790 + }, + { + "epoch": 0.4814974553609937, + "grad_norm": 0.640625, + "learning_rate": 1.7343640793610366e-05, + "loss": 1.476, + "step": 2791 + }, + { + "epoch": 0.4816699732597257, + "grad_norm": 0.62890625, + "learning_rate": 1.7341793030569874e-05, + "loss": 1.4452, + "step": 2792 + }, + { + "epoch": 0.4818424911584577, + "grad_norm": 0.5703125, + "learning_rate": 1.733994472360348e-05, + "loss": 1.5272, + "step": 2793 + }, + { + "epoch": 0.4820150090571897, + "grad_norm": 0.60546875, + "learning_rate": 1.733809587284812e-05, + "loss": 1.416, + "step": 2794 + }, + { + "epoch": 0.48218752695592165, + "grad_norm": 0.6484375, + "learning_rate": 1.733624647844076e-05, + "loss": 1.4474, + "step": 2795 + }, + { + "epoch": 0.48236004485465367, + "grad_norm": 0.59765625, + "learning_rate": 1.733439654051843e-05, + "loss": 1.5665, + "step": 2796 + }, + { + "epoch": 0.4825325627533857, + "grad_norm": 0.7578125, + "learning_rate": 1.7332546059218174e-05, + "loss": 1.4993, + "step": 2797 + }, + { + "epoch": 0.48270508065211765, + "grad_norm": 0.5703125, + "learning_rate": 1.733069503467709e-05, + "loss": 1.3398, + "step": 2798 + }, + { + "epoch": 0.48287759855084966, + "grad_norm": 0.6484375, + "learning_rate": 1.7328843467032314e-05, + "loss": 1.502, + "step": 2799 + }, + { + "epoch": 0.4830501164495816, + "grad_norm": 0.609375, + "learning_rate": 1.7326991356421023e-05, + "loss": 1.5295, + "step": 2800 + }, + { + "epoch": 0.4830501164495816, + "eval_loss": 1.4480316638946533, + "eval_runtime": 10.8875, + "eval_samples_per_second": 94.053, + "eval_steps_per_second": 23.513, + "step": 2800 + }, + { + "epoch": 0.48322263434831364, + "grad_norm": 0.7109375, + "learning_rate": 1.7325138702980427e-05, + "loss": 1.4666, + "step": 2801 + }, + { + "epoch": 0.48339515224704566, + "grad_norm": 0.578125, + "learning_rate": 1.7323285506847788e-05, + "loss": 1.4056, + "step": 2802 + }, + { + "epoch": 0.4835676701457776, + "grad_norm": 0.6796875, + "learning_rate": 1.7321431768160402e-05, + "loss": 1.4883, + "step": 2803 + }, + { + "epoch": 0.48374018804450963, + "grad_norm": 0.76171875, + "learning_rate": 1.7319577487055603e-05, + "loss": 1.4309, + "step": 2804 + }, + { + "epoch": 0.4839127059432416, + "grad_norm": 0.578125, + "learning_rate": 1.7317722663670767e-05, + "loss": 1.5414, + "step": 2805 + }, + { + "epoch": 0.4840852238419736, + "grad_norm": 0.57421875, + "learning_rate": 1.731586729814332e-05, + "loss": 1.3881, + "step": 2806 + }, + { + "epoch": 0.48425774174070557, + "grad_norm": 0.94921875, + "learning_rate": 1.7314011390610705e-05, + "loss": 1.5173, + "step": 2807 + }, + { + "epoch": 0.4844302596394376, + "grad_norm": 0.765625, + "learning_rate": 1.731215494121043e-05, + "loss": 1.3449, + "step": 2808 + }, + { + "epoch": 0.4846027775381696, + "grad_norm": 0.61328125, + "learning_rate": 1.7310297950080022e-05, + "loss": 1.4315, + "step": 2809 + }, + { + "epoch": 0.48477529543690157, + "grad_norm": 0.73828125, + "learning_rate": 1.730844041735707e-05, + "loss": 1.4923, + "step": 2810 + }, + { + "epoch": 0.4849478133356336, + "grad_norm": 0.890625, + "learning_rate": 1.730658234317919e-05, + "loss": 1.5253, + "step": 2811 + }, + { + "epoch": 0.48512033123436554, + "grad_norm": 0.7734375, + "learning_rate": 1.7304723727684033e-05, + "loss": 1.5575, + "step": 2812 + }, + { + "epoch": 0.48529284913309756, + "grad_norm": 0.6484375, + "learning_rate": 1.7302864571009296e-05, + "loss": 1.4681, + "step": 2813 + }, + { + "epoch": 0.4854653670318296, + "grad_norm": 0.67578125, + "learning_rate": 1.7301004873292727e-05, + "loss": 1.5213, + "step": 2814 + }, + { + "epoch": 0.48563788493056154, + "grad_norm": 0.64453125, + "learning_rate": 1.7299144634672096e-05, + "loss": 1.4503, + "step": 2815 + }, + { + "epoch": 0.48581040282929355, + "grad_norm": 0.71875, + "learning_rate": 1.729728385528522e-05, + "loss": 1.4475, + "step": 2816 + }, + { + "epoch": 0.4859829207280255, + "grad_norm": 0.64453125, + "learning_rate": 1.729542253526997e-05, + "loss": 1.507, + "step": 2817 + }, + { + "epoch": 0.48615543862675753, + "grad_norm": 0.5859375, + "learning_rate": 1.7293560674764224e-05, + "loss": 1.4533, + "step": 2818 + }, + { + "epoch": 0.48632795652548955, + "grad_norm": 0.58984375, + "learning_rate": 1.7291698273905936e-05, + "loss": 1.4052, + "step": 2819 + }, + { + "epoch": 0.4865004744242215, + "grad_norm": 0.72265625, + "learning_rate": 1.7289835332833083e-05, + "loss": 1.3862, + "step": 2820 + }, + { + "epoch": 0.4866729923229535, + "grad_norm": 0.63671875, + "learning_rate": 1.7287971851683675e-05, + "loss": 1.4409, + "step": 2821 + }, + { + "epoch": 0.4868455102216855, + "grad_norm": 0.625, + "learning_rate": 1.7286107830595774e-05, + "loss": 1.436, + "step": 2822 + }, + { + "epoch": 0.4870180281204175, + "grad_norm": 0.8671875, + "learning_rate": 1.7284243269707478e-05, + "loss": 1.4044, + "step": 2823 + }, + { + "epoch": 0.48719054601914946, + "grad_norm": 0.66015625, + "learning_rate": 1.728237816915693e-05, + "loss": 1.5035, + "step": 2824 + }, + { + "epoch": 0.4873630639178815, + "grad_norm": 0.59765625, + "learning_rate": 1.7280512529082306e-05, + "loss": 1.4622, + "step": 2825 + }, + { + "epoch": 0.4875355818166135, + "grad_norm": 0.765625, + "learning_rate": 1.727864634962182e-05, + "loss": 1.5554, + "step": 2826 + }, + { + "epoch": 0.48770809971534546, + "grad_norm": 0.66015625, + "learning_rate": 1.7276779630913734e-05, + "loss": 1.5669, + "step": 2827 + }, + { + "epoch": 0.48788061761407747, + "grad_norm": 0.71484375, + "learning_rate": 1.7274912373096345e-05, + "loss": 1.3718, + "step": 2828 + }, + { + "epoch": 0.48805313551280943, + "grad_norm": 0.69140625, + "learning_rate": 1.7273044576307993e-05, + "loss": 1.3165, + "step": 2829 + }, + { + "epoch": 0.48822565341154145, + "grad_norm": 0.6953125, + "learning_rate": 1.7271176240687054e-05, + "loss": 1.455, + "step": 2830 + }, + { + "epoch": 0.48839817131027347, + "grad_norm": 0.6953125, + "learning_rate": 1.7269307366371948e-05, + "loss": 1.5077, + "step": 2831 + }, + { + "epoch": 0.4885706892090054, + "grad_norm": 0.5859375, + "learning_rate": 1.726743795350113e-05, + "loss": 1.4871, + "step": 2832 + }, + { + "epoch": 0.48874320710773744, + "grad_norm": 0.6875, + "learning_rate": 1.7265568002213103e-05, + "loss": 1.5284, + "step": 2833 + }, + { + "epoch": 0.4889157250064694, + "grad_norm": 0.81640625, + "learning_rate": 1.7263697512646397e-05, + "loss": 1.5449, + "step": 2834 + }, + { + "epoch": 0.4890882429052014, + "grad_norm": 0.66796875, + "learning_rate": 1.7261826484939592e-05, + "loss": 1.4365, + "step": 2835 + }, + { + "epoch": 0.4892607608039334, + "grad_norm": 0.85546875, + "learning_rate": 1.725995491923131e-05, + "loss": 1.5388, + "step": 2836 + }, + { + "epoch": 0.4894332787026654, + "grad_norm": 0.9375, + "learning_rate": 1.7258082815660203e-05, + "loss": 1.5065, + "step": 2837 + }, + { + "epoch": 0.4896057966013974, + "grad_norm": 0.70703125, + "learning_rate": 1.7256210174364975e-05, + "loss": 1.5679, + "step": 2838 + }, + { + "epoch": 0.4897783145001294, + "grad_norm": 0.8828125, + "learning_rate": 1.7254336995484355e-05, + "loss": 1.4383, + "step": 2839 + }, + { + "epoch": 0.4899508323988614, + "grad_norm": 0.80859375, + "learning_rate": 1.7252463279157123e-05, + "loss": 1.4321, + "step": 2840 + }, + { + "epoch": 0.49012335029759335, + "grad_norm": 0.62890625, + "learning_rate": 1.72505890255221e-05, + "loss": 1.47, + "step": 2841 + }, + { + "epoch": 0.49029586819632537, + "grad_norm": 0.76953125, + "learning_rate": 1.724871423471813e-05, + "loss": 1.4975, + "step": 2842 + }, + { + "epoch": 0.4904683860950574, + "grad_norm": 0.7890625, + "learning_rate": 1.7246838906884125e-05, + "loss": 1.4325, + "step": 2843 + }, + { + "epoch": 0.49064090399378935, + "grad_norm": 0.58203125, + "learning_rate": 1.724496304215901e-05, + "loss": 1.5435, + "step": 2844 + }, + { + "epoch": 0.49081342189252136, + "grad_norm": 0.7734375, + "learning_rate": 1.724308664068176e-05, + "loss": 1.4481, + "step": 2845 + }, + { + "epoch": 0.4909859397912533, + "grad_norm": 0.61328125, + "learning_rate": 1.7241209702591405e-05, + "loss": 1.4997, + "step": 2846 + }, + { + "epoch": 0.49115845768998534, + "grad_norm": 0.6484375, + "learning_rate": 1.7239332228026982e-05, + "loss": 1.3803, + "step": 2847 + }, + { + "epoch": 0.49133097558871736, + "grad_norm": 1.46875, + "learning_rate": 1.72374542171276e-05, + "loss": 1.4905, + "step": 2848 + }, + { + "epoch": 0.4915034934874493, + "grad_norm": 0.609375, + "learning_rate": 1.7235575670032382e-05, + "loss": 1.4539, + "step": 2849 + }, + { + "epoch": 0.49167601138618133, + "grad_norm": 0.61328125, + "learning_rate": 1.7233696586880513e-05, + "loss": 1.4779, + "step": 2850 + }, + { + "epoch": 0.4918485292849133, + "grad_norm": 0.65234375, + "learning_rate": 1.72318169678112e-05, + "loss": 1.4413, + "step": 2851 + }, + { + "epoch": 0.4920210471836453, + "grad_norm": 0.76953125, + "learning_rate": 1.7229936812963697e-05, + "loss": 1.5197, + "step": 2852 + }, + { + "epoch": 0.49219356508237727, + "grad_norm": 0.65625, + "learning_rate": 1.7228056122477307e-05, + "loss": 1.5067, + "step": 2853 + }, + { + "epoch": 0.4923660829811093, + "grad_norm": 0.6796875, + "learning_rate": 1.7226174896491354e-05, + "loss": 1.5387, + "step": 2854 + }, + { + "epoch": 0.4925386008798413, + "grad_norm": 0.6953125, + "learning_rate": 1.7224293135145213e-05, + "loss": 1.5383, + "step": 2855 + }, + { + "epoch": 0.49271111877857326, + "grad_norm": 0.58984375, + "learning_rate": 1.722241083857829e-05, + "loss": 1.4639, + "step": 2856 + }, + { + "epoch": 0.4928836366773053, + "grad_norm": 0.671875, + "learning_rate": 1.7220528006930056e-05, + "loss": 1.441, + "step": 2857 + }, + { + "epoch": 0.49305615457603724, + "grad_norm": 0.73828125, + "learning_rate": 1.7218644640339986e-05, + "loss": 1.5281, + "step": 2858 + }, + { + "epoch": 0.49322867247476926, + "grad_norm": 0.65625, + "learning_rate": 1.7216760738947614e-05, + "loss": 1.4893, + "step": 2859 + }, + { + "epoch": 0.4934011903735013, + "grad_norm": 0.65234375, + "learning_rate": 1.721487630289252e-05, + "loss": 1.4929, + "step": 2860 + }, + { + "epoch": 0.49357370827223324, + "grad_norm": 0.65625, + "learning_rate": 1.7212991332314303e-05, + "loss": 1.4151, + "step": 2861 + }, + { + "epoch": 0.49374622617096525, + "grad_norm": 0.6640625, + "learning_rate": 1.7211105827352624e-05, + "loss": 1.5097, + "step": 2862 + }, + { + "epoch": 0.4939187440696972, + "grad_norm": 0.71484375, + "learning_rate": 1.7209219788147166e-05, + "loss": 1.559, + "step": 2863 + }, + { + "epoch": 0.49409126196842923, + "grad_norm": 0.72265625, + "learning_rate": 1.720733321483766e-05, + "loss": 1.399, + "step": 2864 + }, + { + "epoch": 0.4942637798671612, + "grad_norm": 0.671875, + "learning_rate": 1.7205446107563876e-05, + "loss": 1.4557, + "step": 2865 + }, + { + "epoch": 0.4944362977658932, + "grad_norm": 0.67578125, + "learning_rate": 1.7203558466465626e-05, + "loss": 1.5592, + "step": 2866 + }, + { + "epoch": 0.4946088156646252, + "grad_norm": 0.7265625, + "learning_rate": 1.7201670291682754e-05, + "loss": 1.4551, + "step": 2867 + }, + { + "epoch": 0.4947813335633572, + "grad_norm": 0.5859375, + "learning_rate": 1.7199781583355144e-05, + "loss": 1.5005, + "step": 2868 + }, + { + "epoch": 0.4949538514620892, + "grad_norm": 0.61328125, + "learning_rate": 1.719789234162273e-05, + "loss": 1.3738, + "step": 2869 + }, + { + "epoch": 0.49512636936082116, + "grad_norm": 0.62890625, + "learning_rate": 1.719600256662548e-05, + "loss": 1.5196, + "step": 2870 + }, + { + "epoch": 0.4952988872595532, + "grad_norm": 0.6796875, + "learning_rate": 1.7194112258503395e-05, + "loss": 1.4637, + "step": 2871 + }, + { + "epoch": 0.4954714051582852, + "grad_norm": 0.6484375, + "learning_rate": 1.7192221417396524e-05, + "loss": 1.5419, + "step": 2872 + }, + { + "epoch": 0.49564392305701716, + "grad_norm": 0.5859375, + "learning_rate": 1.7190330043444953e-05, + "loss": 1.5005, + "step": 2873 + }, + { + "epoch": 0.49581644095574917, + "grad_norm": 0.578125, + "learning_rate": 1.71884381367888e-05, + "loss": 1.5498, + "step": 2874 + }, + { + "epoch": 0.49598895885448113, + "grad_norm": 0.765625, + "learning_rate": 1.7186545697568236e-05, + "loss": 1.4867, + "step": 2875 + }, + { + "epoch": 0.49616147675321315, + "grad_norm": 0.80859375, + "learning_rate": 1.7184652725923465e-05, + "loss": 1.5072, + "step": 2876 + }, + { + "epoch": 0.49633399465194517, + "grad_norm": 0.66796875, + "learning_rate": 1.7182759221994727e-05, + "loss": 1.4238, + "step": 2877 + }, + { + "epoch": 0.4965065125506771, + "grad_norm": 1.0625, + "learning_rate": 1.7180865185922307e-05, + "loss": 1.4501, + "step": 2878 + }, + { + "epoch": 0.49667903044940914, + "grad_norm": 0.6875, + "learning_rate": 1.7178970617846524e-05, + "loss": 1.4374, + "step": 2879 + }, + { + "epoch": 0.4968515483481411, + "grad_norm": 0.59765625, + "learning_rate": 1.7177075517907745e-05, + "loss": 1.4367, + "step": 2880 + }, + { + "epoch": 0.4970240662468731, + "grad_norm": 0.85546875, + "learning_rate": 1.7175179886246365e-05, + "loss": 1.4153, + "step": 2881 + }, + { + "epoch": 0.4971965841456051, + "grad_norm": 0.6953125, + "learning_rate": 1.7173283723002825e-05, + "loss": 1.5074, + "step": 2882 + }, + { + "epoch": 0.4973691020443371, + "grad_norm": 0.64453125, + "learning_rate": 1.7171387028317606e-05, + "loss": 1.4974, + "step": 2883 + }, + { + "epoch": 0.4975416199430691, + "grad_norm": 0.65234375, + "learning_rate": 1.716948980233123e-05, + "loss": 1.4671, + "step": 2884 + }, + { + "epoch": 0.4977141378418011, + "grad_norm": 0.6953125, + "learning_rate": 1.716759204518425e-05, + "loss": 1.4103, + "step": 2885 + }, + { + "epoch": 0.4978866557405331, + "grad_norm": 0.65625, + "learning_rate": 1.7165693757017267e-05, + "loss": 1.5011, + "step": 2886 + }, + { + "epoch": 0.49805917363926505, + "grad_norm": 0.61328125, + "learning_rate": 1.7163794937970916e-05, + "loss": 1.4511, + "step": 2887 + }, + { + "epoch": 0.49823169153799707, + "grad_norm": 0.61328125, + "learning_rate": 1.7161895588185878e-05, + "loss": 1.5568, + "step": 2888 + }, + { + "epoch": 0.4984042094367291, + "grad_norm": 0.69140625, + "learning_rate": 1.7159995707802863e-05, + "loss": 1.5091, + "step": 2889 + }, + { + "epoch": 0.49857672733546105, + "grad_norm": 0.6796875, + "learning_rate": 1.7158095296962627e-05, + "loss": 1.4798, + "step": 2890 + }, + { + "epoch": 0.49874924523419306, + "grad_norm": 0.6640625, + "learning_rate": 1.7156194355805968e-05, + "loss": 1.4575, + "step": 2891 + }, + { + "epoch": 0.498921763132925, + "grad_norm": 0.66796875, + "learning_rate": 1.7154292884473712e-05, + "loss": 1.5086, + "step": 2892 + }, + { + "epoch": 0.49909428103165704, + "grad_norm": 0.71484375, + "learning_rate": 1.7152390883106743e-05, + "loss": 1.4749, + "step": 2893 + }, + { + "epoch": 0.49926679893038906, + "grad_norm": 0.67578125, + "learning_rate": 1.7150488351845965e-05, + "loss": 1.5043, + "step": 2894 + }, + { + "epoch": 0.499439316829121, + "grad_norm": 0.8125, + "learning_rate": 1.7148585290832333e-05, + "loss": 1.539, + "step": 2895 + }, + { + "epoch": 0.49961183472785303, + "grad_norm": 0.59765625, + "learning_rate": 1.7146681700206834e-05, + "loss": 1.434, + "step": 2896 + }, + { + "epoch": 0.499784352626585, + "grad_norm": 0.59375, + "learning_rate": 1.71447775801105e-05, + "loss": 1.4453, + "step": 2897 + }, + { + "epoch": 0.499956870525317, + "grad_norm": 0.59375, + "learning_rate": 1.71428729306844e-05, + "loss": 1.456, + "step": 2898 + }, + { + "epoch": 0.500129388424049, + "grad_norm": 0.62109375, + "learning_rate": 1.7140967752069645e-05, + "loss": 1.4775, + "step": 2899 + }, + { + "epoch": 0.500301906322781, + "grad_norm": 0.60546875, + "learning_rate": 1.713906204440738e-05, + "loss": 1.3852, + "step": 2900 + }, + { + "epoch": 0.500301906322781, + "eval_loss": 1.445255160331726, + "eval_runtime": 10.763, + "eval_samples_per_second": 95.141, + "eval_steps_per_second": 23.785, + "step": 2900 + }, + { + "epoch": 0.500474424221513, + "grad_norm": 0.62890625, + "learning_rate": 1.713715580783879e-05, + "loss": 1.5041, + "step": 2901 + }, + { + "epoch": 0.500646942120245, + "grad_norm": 0.69921875, + "learning_rate": 1.71352490425051e-05, + "loss": 1.4627, + "step": 2902 + }, + { + "epoch": 0.5008194600189769, + "grad_norm": 0.6171875, + "learning_rate": 1.7133341748547586e-05, + "loss": 1.503, + "step": 2903 + }, + { + "epoch": 0.500991977917709, + "grad_norm": 0.73828125, + "learning_rate": 1.7131433926107536e-05, + "loss": 1.4275, + "step": 2904 + }, + { + "epoch": 0.501164495816441, + "grad_norm": 0.8046875, + "learning_rate": 1.7129525575326307e-05, + "loss": 1.49, + "step": 2905 + }, + { + "epoch": 0.5013370137151729, + "grad_norm": 0.66796875, + "learning_rate": 1.7127616696345273e-05, + "loss": 1.5291, + "step": 2906 + }, + { + "epoch": 0.501509531613905, + "grad_norm": 0.9453125, + "learning_rate": 1.7125707289305862e-05, + "loss": 1.4875, + "step": 2907 + }, + { + "epoch": 0.501682049512637, + "grad_norm": 0.79296875, + "learning_rate": 1.7123797354349524e-05, + "loss": 1.4227, + "step": 2908 + }, + { + "epoch": 0.5018545674113689, + "grad_norm": 0.609375, + "learning_rate": 1.7121886891617774e-05, + "loss": 1.4348, + "step": 2909 + }, + { + "epoch": 0.5020270853101009, + "grad_norm": 0.6796875, + "learning_rate": 1.711997590125214e-05, + "loss": 1.4808, + "step": 2910 + }, + { + "epoch": 0.502199603208833, + "grad_norm": 0.62109375, + "learning_rate": 1.7118064383394206e-05, + "loss": 1.4416, + "step": 2911 + }, + { + "epoch": 0.5023721211075649, + "grad_norm": 0.58203125, + "learning_rate": 1.7116152338185584e-05, + "loss": 1.4522, + "step": 2912 + }, + { + "epoch": 0.5025446390062969, + "grad_norm": 0.61328125, + "learning_rate": 1.711423976576794e-05, + "loss": 1.4111, + "step": 2913 + }, + { + "epoch": 0.5027171569050289, + "grad_norm": 0.609375, + "learning_rate": 1.7112326666282953e-05, + "loss": 1.4947, + "step": 2914 + }, + { + "epoch": 0.5028896748037609, + "grad_norm": 0.61328125, + "learning_rate": 1.7110413039872372e-05, + "loss": 1.4555, + "step": 2915 + }, + { + "epoch": 0.5030621927024929, + "grad_norm": 0.6328125, + "learning_rate": 1.710849888667796e-05, + "loss": 1.5209, + "step": 2916 + }, + { + "epoch": 0.5032347106012248, + "grad_norm": 0.578125, + "learning_rate": 1.710658420684154e-05, + "loss": 1.4655, + "step": 2917 + }, + { + "epoch": 0.5034072284999569, + "grad_norm": 0.68359375, + "learning_rate": 1.7104669000504955e-05, + "loss": 1.4579, + "step": 2918 + }, + { + "epoch": 0.5035797463986889, + "grad_norm": 0.59765625, + "learning_rate": 1.71027532678101e-05, + "loss": 1.5267, + "step": 2919 + }, + { + "epoch": 0.5037522642974208, + "grad_norm": 0.72265625, + "learning_rate": 1.7100837008898903e-05, + "loss": 1.484, + "step": 2920 + }, + { + "epoch": 0.5039247821961529, + "grad_norm": 0.69921875, + "learning_rate": 1.709892022391333e-05, + "loss": 1.4201, + "step": 2921 + }, + { + "epoch": 0.5040973000948848, + "grad_norm": 0.5859375, + "learning_rate": 1.7097002912995392e-05, + "loss": 1.4935, + "step": 2922 + }, + { + "epoch": 0.5042698179936168, + "grad_norm": 0.609375, + "learning_rate": 1.7095085076287135e-05, + "loss": 1.5663, + "step": 2923 + }, + { + "epoch": 0.5044423358923489, + "grad_norm": 0.61328125, + "learning_rate": 1.7093166713930642e-05, + "loss": 1.4487, + "step": 2924 + }, + { + "epoch": 0.5046148537910808, + "grad_norm": 0.578125, + "learning_rate": 1.709124782606804e-05, + "loss": 1.3968, + "step": 2925 + }, + { + "epoch": 0.5047873716898128, + "grad_norm": 0.66796875, + "learning_rate": 1.708932841284149e-05, + "loss": 1.3832, + "step": 2926 + }, + { + "epoch": 0.5049598895885448, + "grad_norm": 0.7265625, + "learning_rate": 1.7087408474393193e-05, + "loss": 1.4526, + "step": 2927 + }, + { + "epoch": 0.5051324074872768, + "grad_norm": 0.60546875, + "learning_rate": 1.7085488010865398e-05, + "loss": 1.4902, + "step": 2928 + }, + { + "epoch": 0.5053049253860088, + "grad_norm": 0.640625, + "learning_rate": 1.7083567022400376e-05, + "loss": 1.4707, + "step": 2929 + }, + { + "epoch": 0.5054774432847408, + "grad_norm": 0.6953125, + "learning_rate": 1.7081645509140448e-05, + "loss": 1.491, + "step": 2930 + }, + { + "epoch": 0.5056499611834728, + "grad_norm": 0.63671875, + "learning_rate": 1.707972347122797e-05, + "loss": 1.4669, + "step": 2931 + }, + { + "epoch": 0.5058224790822048, + "grad_norm": 0.625, + "learning_rate": 1.7077800908805343e-05, + "loss": 1.4441, + "step": 2932 + }, + { + "epoch": 0.5059949969809368, + "grad_norm": 0.6953125, + "learning_rate": 1.7075877822015003e-05, + "loss": 1.4583, + "step": 2933 + }, + { + "epoch": 0.5061675148796687, + "grad_norm": 1.6328125, + "learning_rate": 1.707395421099942e-05, + "loss": 1.5226, + "step": 2934 + }, + { + "epoch": 0.5063400327784008, + "grad_norm": 0.73828125, + "learning_rate": 1.707203007590111e-05, + "loss": 1.49, + "step": 2935 + }, + { + "epoch": 0.5065125506771327, + "grad_norm": 0.65234375, + "learning_rate": 1.707010541686262e-05, + "loss": 1.479, + "step": 2936 + }, + { + "epoch": 0.5066850685758647, + "grad_norm": 0.72265625, + "learning_rate": 1.706818023402655e-05, + "loss": 1.4068, + "step": 2937 + }, + { + "epoch": 0.5068575864745968, + "grad_norm": 0.609375, + "learning_rate": 1.7066254527535518e-05, + "loss": 1.4789, + "step": 2938 + }, + { + "epoch": 0.5070301043733287, + "grad_norm": 0.65625, + "learning_rate": 1.7064328297532203e-05, + "loss": 1.5497, + "step": 2939 + }, + { + "epoch": 0.5072026222720607, + "grad_norm": 0.57421875, + "learning_rate": 1.706240154415931e-05, + "loss": 1.4872, + "step": 2940 + }, + { + "epoch": 0.5073751401707928, + "grad_norm": 0.65625, + "learning_rate": 1.706047426755958e-05, + "loss": 1.4383, + "step": 2941 + }, + { + "epoch": 0.5075476580695247, + "grad_norm": 0.60546875, + "learning_rate": 1.70585464678758e-05, + "loss": 1.4348, + "step": 2942 + }, + { + "epoch": 0.5077201759682567, + "grad_norm": 0.671875, + "learning_rate": 1.7056618145250797e-05, + "loss": 1.4141, + "step": 2943 + }, + { + "epoch": 0.5078926938669887, + "grad_norm": 0.66015625, + "learning_rate": 1.7054689299827425e-05, + "loss": 1.5521, + "step": 2944 + }, + { + "epoch": 0.5080652117657207, + "grad_norm": 0.6484375, + "learning_rate": 1.7052759931748593e-05, + "loss": 1.4734, + "step": 2945 + }, + { + "epoch": 0.5082377296644527, + "grad_norm": 0.625, + "learning_rate": 1.7050830041157234e-05, + "loss": 1.4657, + "step": 2946 + }, + { + "epoch": 0.5084102475631846, + "grad_norm": 0.6015625, + "learning_rate": 1.7048899628196337e-05, + "loss": 1.4567, + "step": 2947 + }, + { + "epoch": 0.5085827654619167, + "grad_norm": 0.62109375, + "learning_rate": 1.7046968693008903e-05, + "loss": 1.4203, + "step": 2948 + }, + { + "epoch": 0.5087552833606487, + "grad_norm": 0.67578125, + "learning_rate": 1.7045037235738005e-05, + "loss": 1.4839, + "step": 2949 + }, + { + "epoch": 0.5089278012593806, + "grad_norm": 0.65625, + "learning_rate": 1.7043105256526723e-05, + "loss": 1.5014, + "step": 2950 + }, + { + "epoch": 0.5091003191581126, + "grad_norm": 0.71484375, + "learning_rate": 1.70411727555182e-05, + "loss": 1.4932, + "step": 2951 + }, + { + "epoch": 0.5092728370568447, + "grad_norm": 0.5859375, + "learning_rate": 1.70392397328556e-05, + "loss": 1.5016, + "step": 2952 + }, + { + "epoch": 0.5094453549555766, + "grad_norm": 0.84765625, + "learning_rate": 1.7037306188682142e-05, + "loss": 1.4502, + "step": 2953 + }, + { + "epoch": 0.5096178728543086, + "grad_norm": 0.7578125, + "learning_rate": 1.7035372123141067e-05, + "loss": 1.4817, + "step": 2954 + }, + { + "epoch": 0.5097903907530407, + "grad_norm": 0.78125, + "learning_rate": 1.703343753637567e-05, + "loss": 1.4951, + "step": 2955 + }, + { + "epoch": 0.5099629086517726, + "grad_norm": 0.64453125, + "learning_rate": 1.703150242852927e-05, + "loss": 1.5164, + "step": 2956 + }, + { + "epoch": 0.5101354265505046, + "grad_norm": 0.66796875, + "learning_rate": 1.7029566799745233e-05, + "loss": 1.4479, + "step": 2957 + }, + { + "epoch": 0.5103079444492367, + "grad_norm": 0.796875, + "learning_rate": 1.702763065016697e-05, + "loss": 1.4953, + "step": 2958 + }, + { + "epoch": 0.5104804623479686, + "grad_norm": 0.7109375, + "learning_rate": 1.7025693979937915e-05, + "loss": 1.5578, + "step": 2959 + }, + { + "epoch": 0.5106529802467006, + "grad_norm": 0.6640625, + "learning_rate": 1.7023756789201553e-05, + "loss": 1.4928, + "step": 2960 + }, + { + "epoch": 0.5108254981454325, + "grad_norm": 0.74609375, + "learning_rate": 1.70218190781014e-05, + "loss": 1.4897, + "step": 2961 + }, + { + "epoch": 0.5109980160441646, + "grad_norm": 0.6875, + "learning_rate": 1.7019880846781017e-05, + "loss": 1.5874, + "step": 2962 + }, + { + "epoch": 0.5111705339428966, + "grad_norm": 0.6640625, + "learning_rate": 1.7017942095383997e-05, + "loss": 1.5494, + "step": 2963 + }, + { + "epoch": 0.5113430518416285, + "grad_norm": 0.8125, + "learning_rate": 1.701600282405398e-05, + "loss": 1.3314, + "step": 2964 + }, + { + "epoch": 0.5115155697403606, + "grad_norm": 0.61328125, + "learning_rate": 1.7014063032934632e-05, + "loss": 1.3961, + "step": 2965 + }, + { + "epoch": 0.5116880876390926, + "grad_norm": 0.75390625, + "learning_rate": 1.701212272216967e-05, + "loss": 1.5077, + "step": 2966 + }, + { + "epoch": 0.5118606055378245, + "grad_norm": 0.6875, + "learning_rate": 1.701018189190284e-05, + "loss": 1.4418, + "step": 2967 + }, + { + "epoch": 0.5120331234365565, + "grad_norm": 0.62890625, + "learning_rate": 1.7008240542277933e-05, + "loss": 1.3953, + "step": 2968 + }, + { + "epoch": 0.5122056413352886, + "grad_norm": 0.59765625, + "learning_rate": 1.7006298673438784e-05, + "loss": 1.4377, + "step": 2969 + }, + { + "epoch": 0.5123781592340205, + "grad_norm": 0.66015625, + "learning_rate": 1.7004356285529246e-05, + "loss": 1.4127, + "step": 2970 + }, + { + "epoch": 0.5125506771327525, + "grad_norm": 0.60546875, + "learning_rate": 1.7002413378693236e-05, + "loss": 1.5438, + "step": 2971 + }, + { + "epoch": 0.5127231950314846, + "grad_norm": 0.70703125, + "learning_rate": 1.7000469953074682e-05, + "loss": 1.4592, + "step": 2972 + }, + { + "epoch": 0.5128957129302165, + "grad_norm": 0.65234375, + "learning_rate": 1.6998526008817577e-05, + "loss": 1.4799, + "step": 2973 + }, + { + "epoch": 0.5130682308289485, + "grad_norm": 0.58203125, + "learning_rate": 1.6996581546065933e-05, + "loss": 1.5223, + "step": 2974 + }, + { + "epoch": 0.5132407487276806, + "grad_norm": 0.5859375, + "learning_rate": 1.6994636564963817e-05, + "loss": 1.4836, + "step": 2975 + }, + { + "epoch": 0.5134132666264125, + "grad_norm": 0.6328125, + "learning_rate": 1.6992691065655316e-05, + "loss": 1.3625, + "step": 2976 + }, + { + "epoch": 0.5135857845251445, + "grad_norm": 0.6953125, + "learning_rate": 1.699074504828457e-05, + "loss": 1.4768, + "step": 2977 + }, + { + "epoch": 0.5137583024238764, + "grad_norm": 0.640625, + "learning_rate": 1.6988798512995747e-05, + "loss": 1.5272, + "step": 2978 + }, + { + "epoch": 0.5139308203226085, + "grad_norm": 0.62890625, + "learning_rate": 1.6986851459933067e-05, + "loss": 1.4577, + "step": 2979 + }, + { + "epoch": 0.5141033382213405, + "grad_norm": 0.90234375, + "learning_rate": 1.698490388924077e-05, + "loss": 1.5431, + "step": 2980 + }, + { + "epoch": 0.5142758561200724, + "grad_norm": 0.65234375, + "learning_rate": 1.6982955801063155e-05, + "loss": 1.469, + "step": 2981 + }, + { + "epoch": 0.5144483740188045, + "grad_norm": 0.68359375, + "learning_rate": 1.698100719554454e-05, + "loss": 1.4332, + "step": 2982 + }, + { + "epoch": 0.5146208919175365, + "grad_norm": 0.63671875, + "learning_rate": 1.6979058072829292e-05, + "loss": 1.552, + "step": 2983 + }, + { + "epoch": 0.5147934098162684, + "grad_norm": 0.69140625, + "learning_rate": 1.6977108433061812e-05, + "loss": 1.4409, + "step": 2984 + }, + { + "epoch": 0.5149659277150004, + "grad_norm": 0.6328125, + "learning_rate": 1.697515827638655e-05, + "loss": 1.4671, + "step": 2985 + }, + { + "epoch": 0.5151384456137325, + "grad_norm": 0.609375, + "learning_rate": 1.6973207602947977e-05, + "loss": 1.5082, + "step": 2986 + }, + { + "epoch": 0.5153109635124644, + "grad_norm": 0.6640625, + "learning_rate": 1.697125641289062e-05, + "loss": 1.4577, + "step": 2987 + }, + { + "epoch": 0.5154834814111964, + "grad_norm": 0.7109375, + "learning_rate": 1.696930470635902e-05, + "loss": 1.4511, + "step": 2988 + }, + { + "epoch": 0.5156559993099284, + "grad_norm": 0.6640625, + "learning_rate": 1.6967352483497785e-05, + "loss": 1.4251, + "step": 2989 + }, + { + "epoch": 0.5158285172086604, + "grad_norm": 0.66796875, + "learning_rate": 1.6965399744451544e-05, + "loss": 1.3204, + "step": 2990 + }, + { + "epoch": 0.5160010351073924, + "grad_norm": 0.6015625, + "learning_rate": 1.696344648936497e-05, + "loss": 1.5144, + "step": 2991 + }, + { + "epoch": 0.5161735530061243, + "grad_norm": 0.64453125, + "learning_rate": 1.696149271838277e-05, + "loss": 1.4833, + "step": 2992 + }, + { + "epoch": 0.5163460709048564, + "grad_norm": 0.80078125, + "learning_rate": 1.695953843164969e-05, + "loss": 1.5172, + "step": 2993 + }, + { + "epoch": 0.5165185888035884, + "grad_norm": 0.66796875, + "learning_rate": 1.695758362931052e-05, + "loss": 1.6615, + "step": 2994 + }, + { + "epoch": 0.5166911067023203, + "grad_norm": 0.64453125, + "learning_rate": 1.695562831151008e-05, + "loss": 1.4564, + "step": 2995 + }, + { + "epoch": 0.5168636246010524, + "grad_norm": 0.5703125, + "learning_rate": 1.6953672478393235e-05, + "loss": 1.3986, + "step": 2996 + }, + { + "epoch": 0.5170361424997844, + "grad_norm": 0.67578125, + "learning_rate": 1.6951716130104884e-05, + "loss": 1.4119, + "step": 2997 + }, + { + "epoch": 0.5172086603985163, + "grad_norm": 0.66015625, + "learning_rate": 1.6949759266789963e-05, + "loss": 1.4871, + "step": 2998 + }, + { + "epoch": 0.5173811782972484, + "grad_norm": 0.65234375, + "learning_rate": 1.6947801888593452e-05, + "loss": 1.4817, + "step": 2999 + }, + { + "epoch": 0.5175536961959804, + "grad_norm": 0.7890625, + "learning_rate": 1.694584399566037e-05, + "loss": 1.4331, + "step": 3000 + }, + { + "epoch": 0.5175536961959804, + "eval_loss": 1.4426935911178589, + "eval_runtime": 11.0486, + "eval_samples_per_second": 92.681, + "eval_steps_per_second": 23.17, + "step": 3000 + }, + { + "epoch": 0.5177262140947123, + "grad_norm": 0.6640625, + "learning_rate": 1.6943885588135763e-05, + "loss": 1.4386, + "step": 3001 + }, + { + "epoch": 0.5178987319934443, + "grad_norm": 0.58984375, + "learning_rate": 1.694192666616472e-05, + "loss": 1.4496, + "step": 3002 + }, + { + "epoch": 0.5180712498921763, + "grad_norm": 0.87890625, + "learning_rate": 1.693996722989238e-05, + "loss": 1.4264, + "step": 3003 + }, + { + "epoch": 0.5182437677909083, + "grad_norm": 0.65234375, + "learning_rate": 1.6938007279463903e-05, + "loss": 1.5147, + "step": 3004 + }, + { + "epoch": 0.5184162856896403, + "grad_norm": 0.73828125, + "learning_rate": 1.6936046815024495e-05, + "loss": 1.5204, + "step": 3005 + }, + { + "epoch": 0.5185888035883723, + "grad_norm": 0.77734375, + "learning_rate": 1.69340858367194e-05, + "loss": 1.4578, + "step": 3006 + }, + { + "epoch": 0.5187613214871043, + "grad_norm": 0.8671875, + "learning_rate": 1.6932124344693904e-05, + "loss": 1.4739, + "step": 3007 + }, + { + "epoch": 0.5189338393858363, + "grad_norm": 0.69140625, + "learning_rate": 1.693016233909332e-05, + "loss": 1.4174, + "step": 3008 + }, + { + "epoch": 0.5191063572845682, + "grad_norm": 0.734375, + "learning_rate": 1.692819982006301e-05, + "loss": 1.4694, + "step": 3009 + }, + { + "epoch": 0.5192788751833003, + "grad_norm": 0.66015625, + "learning_rate": 1.692623678774836e-05, + "loss": 1.4741, + "step": 3010 + }, + { + "epoch": 0.5194513930820323, + "grad_norm": 0.6640625, + "learning_rate": 1.692427324229482e-05, + "loss": 1.4905, + "step": 3011 + }, + { + "epoch": 0.5196239109807642, + "grad_norm": 0.60546875, + "learning_rate": 1.692230918384785e-05, + "loss": 1.4186, + "step": 3012 + }, + { + "epoch": 0.5197964288794963, + "grad_norm": 0.7734375, + "learning_rate": 1.6920344612552965e-05, + "loss": 1.4845, + "step": 3013 + }, + { + "epoch": 0.5199689467782282, + "grad_norm": 0.71484375, + "learning_rate": 1.691837952855571e-05, + "loss": 1.5879, + "step": 3014 + }, + { + "epoch": 0.5201414646769602, + "grad_norm": 0.7421875, + "learning_rate": 1.6916413932001673e-05, + "loss": 1.4893, + "step": 3015 + }, + { + "epoch": 0.5203139825756923, + "grad_norm": 0.87109375, + "learning_rate": 1.6914447823036474e-05, + "loss": 1.5236, + "step": 3016 + }, + { + "epoch": 0.5204865004744242, + "grad_norm": 0.91015625, + "learning_rate": 1.691248120180578e-05, + "loss": 1.5532, + "step": 3017 + }, + { + "epoch": 0.5206590183731562, + "grad_norm": 0.9140625, + "learning_rate": 1.691051406845528e-05, + "loss": 1.4801, + "step": 3018 + }, + { + "epoch": 0.5208315362718882, + "grad_norm": 0.69921875, + "learning_rate": 1.6908546423130726e-05, + "loss": 1.4341, + "step": 3019 + }, + { + "epoch": 0.5210040541706202, + "grad_norm": 0.6484375, + "learning_rate": 1.6906578265977885e-05, + "loss": 1.5411, + "step": 3020 + }, + { + "epoch": 0.5211765720693522, + "grad_norm": 0.62890625, + "learning_rate": 1.690460959714257e-05, + "loss": 1.4389, + "step": 3021 + }, + { + "epoch": 0.5213490899680842, + "grad_norm": 0.64453125, + "learning_rate": 1.6902640416770635e-05, + "loss": 1.5066, + "step": 3022 + }, + { + "epoch": 0.5215216078668162, + "grad_norm": 0.66796875, + "learning_rate": 1.6900670725007968e-05, + "loss": 1.4621, + "step": 3023 + }, + { + "epoch": 0.5216941257655482, + "grad_norm": 0.78125, + "learning_rate": 1.6898700522000498e-05, + "loss": 1.4797, + "step": 3024 + }, + { + "epoch": 0.5218666436642801, + "grad_norm": 0.65234375, + "learning_rate": 1.6896729807894187e-05, + "loss": 1.5472, + "step": 3025 + }, + { + "epoch": 0.5220391615630121, + "grad_norm": 0.60546875, + "learning_rate": 1.6894758582835038e-05, + "loss": 1.4273, + "step": 3026 + }, + { + "epoch": 0.5222116794617442, + "grad_norm": 0.671875, + "learning_rate": 1.6892786846969095e-05, + "loss": 1.4918, + "step": 3027 + }, + { + "epoch": 0.5223841973604761, + "grad_norm": 0.63671875, + "learning_rate": 1.689081460044243e-05, + "loss": 1.4427, + "step": 3028 + }, + { + "epoch": 0.5225567152592081, + "grad_norm": 0.7109375, + "learning_rate": 1.6888841843401164e-05, + "loss": 1.3589, + "step": 3029 + }, + { + "epoch": 0.5227292331579402, + "grad_norm": 0.6015625, + "learning_rate": 1.688686857599146e-05, + "loss": 1.3723, + "step": 3030 + }, + { + "epoch": 0.5229017510566721, + "grad_norm": 0.60546875, + "learning_rate": 1.688489479835949e-05, + "loss": 1.5148, + "step": 3031 + }, + { + "epoch": 0.5230742689554041, + "grad_norm": 0.59375, + "learning_rate": 1.6882920510651497e-05, + "loss": 1.4609, + "step": 3032 + }, + { + "epoch": 0.5232467868541362, + "grad_norm": 0.6484375, + "learning_rate": 1.6880945713013747e-05, + "loss": 1.3951, + "step": 3033 + }, + { + "epoch": 0.5234193047528681, + "grad_norm": 0.65625, + "learning_rate": 1.6878970405592545e-05, + "loss": 1.4334, + "step": 3034 + }, + { + "epoch": 0.5235918226516001, + "grad_norm": 0.6484375, + "learning_rate": 1.6876994588534234e-05, + "loss": 1.4421, + "step": 3035 + }, + { + "epoch": 0.523764340550332, + "grad_norm": 0.59765625, + "learning_rate": 1.6875018261985193e-05, + "loss": 1.4525, + "step": 3036 + }, + { + "epoch": 0.5239368584490641, + "grad_norm": 0.60546875, + "learning_rate": 1.6873041426091845e-05, + "loss": 1.42, + "step": 3037 + }, + { + "epoch": 0.5241093763477961, + "grad_norm": 0.61328125, + "learning_rate": 1.6871064081000643e-05, + "loss": 1.5279, + "step": 3038 + }, + { + "epoch": 0.524281894246528, + "grad_norm": 0.6796875, + "learning_rate": 1.686908622685808e-05, + "loss": 1.4593, + "step": 3039 + }, + { + "epoch": 0.5244544121452601, + "grad_norm": 0.64453125, + "learning_rate": 1.6867107863810687e-05, + "loss": 1.5149, + "step": 3040 + }, + { + "epoch": 0.5246269300439921, + "grad_norm": 0.56640625, + "learning_rate": 1.6865128992005037e-05, + "loss": 1.3865, + "step": 3041 + }, + { + "epoch": 0.524799447942724, + "grad_norm": 0.58984375, + "learning_rate": 1.6863149611587738e-05, + "loss": 1.4426, + "step": 3042 + }, + { + "epoch": 0.524971965841456, + "grad_norm": 0.5859375, + "learning_rate": 1.686116972270543e-05, + "loss": 1.4312, + "step": 3043 + }, + { + "epoch": 0.5251444837401881, + "grad_norm": 0.71875, + "learning_rate": 1.68591893255048e-05, + "loss": 1.5589, + "step": 3044 + }, + { + "epoch": 0.52531700163892, + "grad_norm": 0.6640625, + "learning_rate": 1.685720842013257e-05, + "loss": 1.441, + "step": 3045 + }, + { + "epoch": 0.525489519537652, + "grad_norm": 0.7265625, + "learning_rate": 1.6855227006735492e-05, + "loss": 1.3209, + "step": 3046 + }, + { + "epoch": 0.5256620374363841, + "grad_norm": 0.67578125, + "learning_rate": 1.6853245085460362e-05, + "loss": 1.4729, + "step": 3047 + }, + { + "epoch": 0.525834555335116, + "grad_norm": 0.64453125, + "learning_rate": 1.6851262656454016e-05, + "loss": 1.5194, + "step": 3048 + }, + { + "epoch": 0.526007073233848, + "grad_norm": 0.7734375, + "learning_rate": 1.684927971986332e-05, + "loss": 1.5129, + "step": 3049 + }, + { + "epoch": 0.5261795911325801, + "grad_norm": 0.6328125, + "learning_rate": 1.684729627583519e-05, + "loss": 1.4712, + "step": 3050 + }, + { + "epoch": 0.526352109031312, + "grad_norm": 0.58203125, + "learning_rate": 1.684531232451657e-05, + "loss": 1.473, + "step": 3051 + }, + { + "epoch": 0.526524626930044, + "grad_norm": 0.64453125, + "learning_rate": 1.684332786605444e-05, + "loss": 1.4668, + "step": 3052 + }, + { + "epoch": 0.5266971448287759, + "grad_norm": 0.66796875, + "learning_rate": 1.684134290059582e-05, + "loss": 1.3984, + "step": 3053 + }, + { + "epoch": 0.526869662727508, + "grad_norm": 0.63671875, + "learning_rate": 1.6839357428287775e-05, + "loss": 1.447, + "step": 3054 + }, + { + "epoch": 0.52704218062624, + "grad_norm": 0.6484375, + "learning_rate": 1.6837371449277395e-05, + "loss": 1.4159, + "step": 3055 + }, + { + "epoch": 0.5272146985249719, + "grad_norm": 0.76953125, + "learning_rate": 1.6835384963711815e-05, + "loss": 1.5567, + "step": 3056 + }, + { + "epoch": 0.527387216423704, + "grad_norm": 0.71875, + "learning_rate": 1.683339797173821e-05, + "loss": 1.5057, + "step": 3057 + }, + { + "epoch": 0.527559734322436, + "grad_norm": 0.66015625, + "learning_rate": 1.6831410473503788e-05, + "loss": 1.5606, + "step": 3058 + }, + { + "epoch": 0.5277322522211679, + "grad_norm": 0.69140625, + "learning_rate": 1.682942246915579e-05, + "loss": 1.4107, + "step": 3059 + }, + { + "epoch": 0.5279047701198999, + "grad_norm": 0.5546875, + "learning_rate": 1.6827433958841504e-05, + "loss": 1.3407, + "step": 3060 + }, + { + "epoch": 0.528077288018632, + "grad_norm": 0.96484375, + "learning_rate": 1.682544494270825e-05, + "loss": 1.5163, + "step": 3061 + }, + { + "epoch": 0.5282498059173639, + "grad_norm": 0.65625, + "learning_rate": 1.682345542090339e-05, + "loss": 1.4255, + "step": 3062 + }, + { + "epoch": 0.5284223238160959, + "grad_norm": 0.61328125, + "learning_rate": 1.6821465393574315e-05, + "loss": 1.3823, + "step": 3063 + }, + { + "epoch": 0.528594841714828, + "grad_norm": 0.69921875, + "learning_rate": 1.681947486086846e-05, + "loss": 1.489, + "step": 3064 + }, + { + "epoch": 0.5287673596135599, + "grad_norm": 0.70703125, + "learning_rate": 1.6817483822933298e-05, + "loss": 1.3998, + "step": 3065 + }, + { + "epoch": 0.5289398775122919, + "grad_norm": 0.73828125, + "learning_rate": 1.681549227991634e-05, + "loss": 1.4426, + "step": 3066 + }, + { + "epoch": 0.5291123954110238, + "grad_norm": 0.63671875, + "learning_rate": 1.6813500231965127e-05, + "loss": 1.486, + "step": 3067 + }, + { + "epoch": 0.5292849133097559, + "grad_norm": 0.70703125, + "learning_rate": 1.6811507679227242e-05, + "loss": 1.4526, + "step": 3068 + }, + { + "epoch": 0.5294574312084879, + "grad_norm": 0.6328125, + "learning_rate": 1.6809514621850313e-05, + "loss": 1.4536, + "step": 3069 + }, + { + "epoch": 0.5296299491072198, + "grad_norm": 0.58984375, + "learning_rate": 1.680752105998199e-05, + "loss": 1.4325, + "step": 3070 + }, + { + "epoch": 0.5298024670059519, + "grad_norm": 0.62109375, + "learning_rate": 1.680552699376997e-05, + "loss": 1.3994, + "step": 3071 + }, + { + "epoch": 0.5299749849046839, + "grad_norm": 0.6171875, + "learning_rate": 1.6803532423361985e-05, + "loss": 1.516, + "step": 3072 + }, + { + "epoch": 0.5301475028034158, + "grad_norm": 0.640625, + "learning_rate": 1.6801537348905813e-05, + "loss": 1.4652, + "step": 3073 + }, + { + "epoch": 0.5303200207021479, + "grad_norm": 0.65625, + "learning_rate": 1.6799541770549256e-05, + "loss": 1.5399, + "step": 3074 + }, + { + "epoch": 0.5304925386008799, + "grad_norm": 0.82421875, + "learning_rate": 1.6797545688440156e-05, + "loss": 1.4818, + "step": 3075 + }, + { + "epoch": 0.5306650564996118, + "grad_norm": 0.671875, + "learning_rate": 1.67955491027264e-05, + "loss": 1.4993, + "step": 3076 + }, + { + "epoch": 0.5308375743983438, + "grad_norm": 0.7265625, + "learning_rate": 1.6793552013555905e-05, + "loss": 1.3808, + "step": 3077 + }, + { + "epoch": 0.5310100922970759, + "grad_norm": 0.72265625, + "learning_rate": 1.679155442107663e-05, + "loss": 1.4844, + "step": 3078 + }, + { + "epoch": 0.5311826101958078, + "grad_norm": 0.578125, + "learning_rate": 1.6789556325436566e-05, + "loss": 1.4248, + "step": 3079 + }, + { + "epoch": 0.5313551280945398, + "grad_norm": 0.65234375, + "learning_rate": 1.678755772678375e-05, + "loss": 1.4678, + "step": 3080 + }, + { + "epoch": 0.5315276459932718, + "grad_norm": 0.72265625, + "learning_rate": 1.6785558625266243e-05, + "loss": 1.4931, + "step": 3081 + }, + { + "epoch": 0.5317001638920038, + "grad_norm": 0.6640625, + "learning_rate": 1.6783559021032158e-05, + "loss": 1.3671, + "step": 3082 + }, + { + "epoch": 0.5318726817907358, + "grad_norm": 0.7265625, + "learning_rate": 1.678155891422963e-05, + "loss": 1.5186, + "step": 3083 + }, + { + "epoch": 0.5320451996894677, + "grad_norm": 0.68359375, + "learning_rate": 1.677955830500685e-05, + "loss": 1.3654, + "step": 3084 + }, + { + "epoch": 0.5322177175881998, + "grad_norm": 0.6328125, + "learning_rate": 1.6777557193512025e-05, + "loss": 1.4802, + "step": 3085 + }, + { + "epoch": 0.5323902354869318, + "grad_norm": 0.70703125, + "learning_rate": 1.6775555579893422e-05, + "loss": 1.5506, + "step": 3086 + }, + { + "epoch": 0.5325627533856637, + "grad_norm": 0.7421875, + "learning_rate": 1.677355346429932e-05, + "loss": 1.392, + "step": 3087 + }, + { + "epoch": 0.5327352712843958, + "grad_norm": 0.5703125, + "learning_rate": 1.6771550846878056e-05, + "loss": 1.5255, + "step": 3088 + }, + { + "epoch": 0.5329077891831278, + "grad_norm": 0.72265625, + "learning_rate": 1.6769547727777997e-05, + "loss": 1.4502, + "step": 3089 + }, + { + "epoch": 0.5330803070818597, + "grad_norm": 0.7890625, + "learning_rate": 1.6767544107147542e-05, + "loss": 1.4886, + "step": 3090 + }, + { + "epoch": 0.5332528249805918, + "grad_norm": 0.66015625, + "learning_rate": 1.6765539985135134e-05, + "loss": 1.4446, + "step": 3091 + }, + { + "epoch": 0.5334253428793237, + "grad_norm": 0.73828125, + "learning_rate": 1.6763535361889252e-05, + "loss": 1.4988, + "step": 3092 + }, + { + "epoch": 0.5335978607780557, + "grad_norm": 0.6796875, + "learning_rate": 1.676153023755841e-05, + "loss": 1.4266, + "step": 3093 + }, + { + "epoch": 0.5337703786767877, + "grad_norm": 0.66796875, + "learning_rate": 1.675952461229116e-05, + "loss": 1.5449, + "step": 3094 + }, + { + "epoch": 0.5339428965755197, + "grad_norm": 0.7109375, + "learning_rate": 1.6757518486236088e-05, + "loss": 1.469, + "step": 3095 + }, + { + "epoch": 0.5341154144742517, + "grad_norm": 0.6640625, + "learning_rate": 1.6755511859541827e-05, + "loss": 1.5697, + "step": 3096 + }, + { + "epoch": 0.5342879323729837, + "grad_norm": 0.609375, + "learning_rate": 1.6753504732357035e-05, + "loss": 1.4817, + "step": 3097 + }, + { + "epoch": 0.5344604502717157, + "grad_norm": 0.6484375, + "learning_rate": 1.6751497104830416e-05, + "loss": 1.5511, + "step": 3098 + }, + { + "epoch": 0.5346329681704477, + "grad_norm": 0.73828125, + "learning_rate": 1.6749488977110706e-05, + "loss": 1.4915, + "step": 3099 + }, + { + "epoch": 0.5348054860691797, + "grad_norm": 0.63671875, + "learning_rate": 1.6747480349346678e-05, + "loss": 1.4205, + "step": 3100 + }, + { + "epoch": 0.5348054860691797, + "eval_loss": 1.4400311708450317, + "eval_runtime": 10.8978, + "eval_samples_per_second": 93.964, + "eval_steps_per_second": 23.491, + "step": 3100 + }, + { + "epoch": 0.5349780039679116, + "grad_norm": 0.7109375, + "learning_rate": 1.674547122168715e-05, + "loss": 1.4605, + "step": 3101 + }, + { + "epoch": 0.5351505218666437, + "grad_norm": 0.67578125, + "learning_rate": 1.6743461594280962e-05, + "loss": 1.4807, + "step": 3102 + }, + { + "epoch": 0.5353230397653757, + "grad_norm": 0.6171875, + "learning_rate": 1.6741451467277006e-05, + "loss": 1.4194, + "step": 3103 + }, + { + "epoch": 0.5354955576641076, + "grad_norm": 0.6328125, + "learning_rate": 1.6739440840824203e-05, + "loss": 1.4102, + "step": 3104 + }, + { + "epoch": 0.5356680755628397, + "grad_norm": 0.625, + "learning_rate": 1.6737429715071512e-05, + "loss": 1.4654, + "step": 3105 + }, + { + "epoch": 0.5358405934615716, + "grad_norm": 0.71484375, + "learning_rate": 1.673541809016793e-05, + "loss": 1.3424, + "step": 3106 + }, + { + "epoch": 0.5360131113603036, + "grad_norm": 0.7578125, + "learning_rate": 1.673340596626249e-05, + "loss": 1.4412, + "step": 3107 + }, + { + "epoch": 0.5361856292590357, + "grad_norm": 0.6015625, + "learning_rate": 1.6731393343504266e-05, + "loss": 1.4353, + "step": 3108 + }, + { + "epoch": 0.5363581471577676, + "grad_norm": 0.66015625, + "learning_rate": 1.6729380222042363e-05, + "loss": 1.4199, + "step": 3109 + }, + { + "epoch": 0.5365306650564996, + "grad_norm": 0.66796875, + "learning_rate": 1.672736660202592e-05, + "loss": 1.4368, + "step": 3110 + }, + { + "epoch": 0.5367031829552316, + "grad_norm": 0.609375, + "learning_rate": 1.6725352483604133e-05, + "loss": 1.4803, + "step": 3111 + }, + { + "epoch": 0.5368757008539636, + "grad_norm": 0.72265625, + "learning_rate": 1.672333786692621e-05, + "loss": 1.5334, + "step": 3112 + }, + { + "epoch": 0.5370482187526956, + "grad_norm": 0.67578125, + "learning_rate": 1.6721322752141404e-05, + "loss": 1.412, + "step": 3113 + }, + { + "epoch": 0.5372207366514276, + "grad_norm": 0.61328125, + "learning_rate": 1.6719307139399018e-05, + "loss": 1.5448, + "step": 3114 + }, + { + "epoch": 0.5373932545501596, + "grad_norm": 0.73046875, + "learning_rate": 1.671729102884837e-05, + "loss": 1.3743, + "step": 3115 + }, + { + "epoch": 0.5375657724488916, + "grad_norm": 0.609375, + "learning_rate": 1.6715274420638833e-05, + "loss": 1.3251, + "step": 3116 + }, + { + "epoch": 0.5377382903476235, + "grad_norm": 0.7578125, + "learning_rate": 1.671325731491981e-05, + "loss": 1.4829, + "step": 3117 + }, + { + "epoch": 0.5379108082463555, + "grad_norm": 0.6171875, + "learning_rate": 1.6711239711840736e-05, + "loss": 1.4443, + "step": 3118 + }, + { + "epoch": 0.5380833261450876, + "grad_norm": 0.734375, + "learning_rate": 1.670922161155109e-05, + "loss": 1.5348, + "step": 3119 + }, + { + "epoch": 0.5382558440438195, + "grad_norm": 0.640625, + "learning_rate": 1.670720301420039e-05, + "loss": 1.4214, + "step": 3120 + }, + { + "epoch": 0.5384283619425515, + "grad_norm": 0.72265625, + "learning_rate": 1.670518391993818e-05, + "loss": 1.5121, + "step": 3121 + }, + { + "epoch": 0.5386008798412836, + "grad_norm": 0.64453125, + "learning_rate": 1.6703164328914048e-05, + "loss": 1.4731, + "step": 3122 + }, + { + "epoch": 0.5387733977400155, + "grad_norm": 0.65234375, + "learning_rate": 1.670114424127762e-05, + "loss": 1.4324, + "step": 3123 + }, + { + "epoch": 0.5389459156387475, + "grad_norm": 0.703125, + "learning_rate": 1.6699123657178553e-05, + "loss": 1.4381, + "step": 3124 + }, + { + "epoch": 0.5391184335374796, + "grad_norm": 0.80859375, + "learning_rate": 1.6697102576766552e-05, + "loss": 1.45, + "step": 3125 + }, + { + "epoch": 0.5392909514362115, + "grad_norm": 0.62109375, + "learning_rate": 1.6695081000191345e-05, + "loss": 1.414, + "step": 3126 + }, + { + "epoch": 0.5394634693349435, + "grad_norm": 0.78515625, + "learning_rate": 1.6693058927602704e-05, + "loss": 1.5434, + "step": 3127 + }, + { + "epoch": 0.5396359872336755, + "grad_norm": 0.76171875, + "learning_rate": 1.6691036359150435e-05, + "loss": 1.4602, + "step": 3128 + }, + { + "epoch": 0.5398085051324075, + "grad_norm": 0.625, + "learning_rate": 1.668901329498439e-05, + "loss": 1.5231, + "step": 3129 + }, + { + "epoch": 0.5399810230311395, + "grad_norm": 0.6015625, + "learning_rate": 1.6686989735254442e-05, + "loss": 1.3867, + "step": 3130 + }, + { + "epoch": 0.5401535409298714, + "grad_norm": 0.59765625, + "learning_rate": 1.6684965680110514e-05, + "loss": 1.4377, + "step": 3131 + }, + { + "epoch": 0.5403260588286035, + "grad_norm": 0.6953125, + "learning_rate": 1.668294112970256e-05, + "loss": 1.523, + "step": 3132 + }, + { + "epoch": 0.5404985767273355, + "grad_norm": 1.5078125, + "learning_rate": 1.6680916084180566e-05, + "loss": 1.5506, + "step": 3133 + }, + { + "epoch": 0.5406710946260674, + "grad_norm": 0.58984375, + "learning_rate": 1.667889054369457e-05, + "loss": 1.5098, + "step": 3134 + }, + { + "epoch": 0.5408436125247994, + "grad_norm": 0.60546875, + "learning_rate": 1.6676864508394624e-05, + "loss": 1.4797, + "step": 3135 + }, + { + "epoch": 0.5410161304235315, + "grad_norm": 0.66796875, + "learning_rate": 1.667483797843084e-05, + "loss": 1.5096, + "step": 3136 + }, + { + "epoch": 0.5411886483222634, + "grad_norm": 0.640625, + "learning_rate": 1.6672810953953352e-05, + "loss": 1.5928, + "step": 3137 + }, + { + "epoch": 0.5413611662209954, + "grad_norm": 0.59765625, + "learning_rate": 1.6670783435112334e-05, + "loss": 1.4941, + "step": 3138 + }, + { + "epoch": 0.5415336841197275, + "grad_norm": 0.671875, + "learning_rate": 1.6668755422058e-05, + "loss": 1.4992, + "step": 3139 + }, + { + "epoch": 0.5417062020184594, + "grad_norm": 0.58984375, + "learning_rate": 1.6666726914940594e-05, + "loss": 1.4193, + "step": 3140 + }, + { + "epoch": 0.5418787199171914, + "grad_norm": 0.6015625, + "learning_rate": 1.6664697913910405e-05, + "loss": 1.4341, + "step": 3141 + }, + { + "epoch": 0.5420512378159233, + "grad_norm": 0.5625, + "learning_rate": 1.6662668419117748e-05, + "loss": 1.4427, + "step": 3142 + }, + { + "epoch": 0.5422237557146554, + "grad_norm": 0.60546875, + "learning_rate": 1.666063843071299e-05, + "loss": 1.4435, + "step": 3143 + }, + { + "epoch": 0.5423962736133874, + "grad_norm": 0.80859375, + "learning_rate": 1.6658607948846513e-05, + "loss": 1.4828, + "step": 3144 + }, + { + "epoch": 0.5425687915121193, + "grad_norm": 0.62890625, + "learning_rate": 1.6656576973668754e-05, + "loss": 1.5457, + "step": 3145 + }, + { + "epoch": 0.5427413094108514, + "grad_norm": 0.640625, + "learning_rate": 1.6654545505330184e-05, + "loss": 1.374, + "step": 3146 + }, + { + "epoch": 0.5429138273095834, + "grad_norm": 0.65625, + "learning_rate": 1.6652513543981302e-05, + "loss": 1.496, + "step": 3147 + }, + { + "epoch": 0.5430863452083153, + "grad_norm": 0.60546875, + "learning_rate": 1.6650481089772652e-05, + "loss": 1.4306, + "step": 3148 + }, + { + "epoch": 0.5432588631070474, + "grad_norm": 0.59765625, + "learning_rate": 1.664844814285481e-05, + "loss": 1.3942, + "step": 3149 + }, + { + "epoch": 0.5434313810057794, + "grad_norm": 0.7578125, + "learning_rate": 1.6646414703378387e-05, + "loss": 1.4703, + "step": 3150 + }, + { + "epoch": 0.5436038989045113, + "grad_norm": 0.64453125, + "learning_rate": 1.6644380771494032e-05, + "loss": 1.6057, + "step": 3151 + }, + { + "epoch": 0.5437764168032433, + "grad_norm": 0.58984375, + "learning_rate": 1.664234634735243e-05, + "loss": 1.3779, + "step": 3152 + }, + { + "epoch": 0.5439489347019754, + "grad_norm": 0.73046875, + "learning_rate": 1.6640311431104314e-05, + "loss": 1.4757, + "step": 3153 + }, + { + "epoch": 0.5441214526007073, + "grad_norm": 0.65625, + "learning_rate": 1.6638276022900434e-05, + "loss": 1.4549, + "step": 3154 + }, + { + "epoch": 0.5442939704994393, + "grad_norm": 0.68359375, + "learning_rate": 1.6636240122891587e-05, + "loss": 1.4703, + "step": 3155 + }, + { + "epoch": 0.5444664883981714, + "grad_norm": 0.76171875, + "learning_rate": 1.663420373122861e-05, + "loss": 1.528, + "step": 3156 + }, + { + "epoch": 0.5446390062969033, + "grad_norm": 0.66015625, + "learning_rate": 1.6632166848062367e-05, + "loss": 1.4214, + "step": 3157 + }, + { + "epoch": 0.5448115241956353, + "grad_norm": 0.6640625, + "learning_rate": 1.6630129473543762e-05, + "loss": 1.5069, + "step": 3158 + }, + { + "epoch": 0.5449840420943672, + "grad_norm": 0.66796875, + "learning_rate": 1.662809160782374e-05, + "loss": 1.4371, + "step": 3159 + }, + { + "epoch": 0.5451565599930993, + "grad_norm": 0.6796875, + "learning_rate": 1.662605325105328e-05, + "loss": 1.5044, + "step": 3160 + }, + { + "epoch": 0.5453290778918313, + "grad_norm": 0.5703125, + "learning_rate": 1.662401440338339e-05, + "loss": 1.4541, + "step": 3161 + }, + { + "epoch": 0.5455015957905632, + "grad_norm": 0.66796875, + "learning_rate": 1.6621975064965122e-05, + "loss": 1.5276, + "step": 3162 + }, + { + "epoch": 0.5456741136892953, + "grad_norm": 0.640625, + "learning_rate": 1.661993523594957e-05, + "loss": 1.5695, + "step": 3163 + }, + { + "epoch": 0.5458466315880273, + "grad_norm": 0.60546875, + "learning_rate": 1.6617894916487852e-05, + "loss": 1.5393, + "step": 3164 + }, + { + "epoch": 0.5460191494867592, + "grad_norm": 0.640625, + "learning_rate": 1.6615854106731127e-05, + "loss": 1.4206, + "step": 3165 + }, + { + "epoch": 0.5461916673854913, + "grad_norm": 0.64453125, + "learning_rate": 1.661381280683059e-05, + "loss": 1.5095, + "step": 3166 + }, + { + "epoch": 0.5463641852842233, + "grad_norm": 0.65625, + "learning_rate": 1.661177101693748e-05, + "loss": 1.5578, + "step": 3167 + }, + { + "epoch": 0.5465367031829552, + "grad_norm": 0.609375, + "learning_rate": 1.6609728737203058e-05, + "loss": 1.5203, + "step": 3168 + }, + { + "epoch": 0.5467092210816872, + "grad_norm": 0.609375, + "learning_rate": 1.660768596777863e-05, + "loss": 1.5172, + "step": 3169 + }, + { + "epoch": 0.5468817389804193, + "grad_norm": 0.671875, + "learning_rate": 1.660564270881554e-05, + "loss": 1.4111, + "step": 3170 + }, + { + "epoch": 0.5470542568791512, + "grad_norm": 0.62109375, + "learning_rate": 1.6603598960465166e-05, + "loss": 1.4433, + "step": 3171 + }, + { + "epoch": 0.5472267747778832, + "grad_norm": 0.609375, + "learning_rate": 1.660155472287892e-05, + "loss": 1.3998, + "step": 3172 + }, + { + "epoch": 0.5473992926766152, + "grad_norm": 0.62890625, + "learning_rate": 1.659950999620825e-05, + "loss": 1.4611, + "step": 3173 + }, + { + "epoch": 0.5475718105753472, + "grad_norm": 0.609375, + "learning_rate": 1.6597464780604646e-05, + "loss": 1.4887, + "step": 3174 + }, + { + "epoch": 0.5477443284740792, + "grad_norm": 0.640625, + "learning_rate": 1.6595419076219625e-05, + "loss": 1.5055, + "step": 3175 + }, + { + "epoch": 0.5479168463728111, + "grad_norm": 0.63671875, + "learning_rate": 1.659337288320475e-05, + "loss": 1.4551, + "step": 3176 + }, + { + "epoch": 0.5480893642715432, + "grad_norm": 0.63671875, + "learning_rate": 1.6591326201711612e-05, + "loss": 1.5612, + "step": 3177 + }, + { + "epoch": 0.5482618821702752, + "grad_norm": 0.70703125, + "learning_rate": 1.6589279031891847e-05, + "loss": 1.4727, + "step": 3178 + }, + { + "epoch": 0.5484344000690071, + "grad_norm": 0.58984375, + "learning_rate": 1.658723137389712e-05, + "loss": 1.5481, + "step": 3179 + }, + { + "epoch": 0.5486069179677392, + "grad_norm": 0.609375, + "learning_rate": 1.6585183227879132e-05, + "loss": 1.4579, + "step": 3180 + }, + { + "epoch": 0.5487794358664712, + "grad_norm": 0.75, + "learning_rate": 1.6583134593989627e-05, + "loss": 1.466, + "step": 3181 + }, + { + "epoch": 0.5489519537652031, + "grad_norm": 0.5703125, + "learning_rate": 1.658108547238038e-05, + "loss": 1.4213, + "step": 3182 + }, + { + "epoch": 0.5491244716639352, + "grad_norm": 0.64453125, + "learning_rate": 1.6579035863203197e-05, + "loss": 1.457, + "step": 3183 + }, + { + "epoch": 0.5492969895626671, + "grad_norm": 0.671875, + "learning_rate": 1.6576985766609926e-05, + "loss": 1.4649, + "step": 3184 + }, + { + "epoch": 0.5494695074613991, + "grad_norm": 0.66015625, + "learning_rate": 1.657493518275246e-05, + "loss": 1.426, + "step": 3185 + }, + { + "epoch": 0.5496420253601311, + "grad_norm": 0.59375, + "learning_rate": 1.6572884111782716e-05, + "loss": 1.3793, + "step": 3186 + }, + { + "epoch": 0.5498145432588631, + "grad_norm": 0.609375, + "learning_rate": 1.6570832553852643e-05, + "loss": 1.3756, + "step": 3187 + }, + { + "epoch": 0.5499870611575951, + "grad_norm": 0.60546875, + "learning_rate": 1.6568780509114237e-05, + "loss": 1.458, + "step": 3188 + }, + { + "epoch": 0.5501595790563271, + "grad_norm": 0.55859375, + "learning_rate": 1.656672797771953e-05, + "loss": 1.3458, + "step": 3189 + }, + { + "epoch": 0.5503320969550591, + "grad_norm": 0.71875, + "learning_rate": 1.6564674959820585e-05, + "loss": 1.4256, + "step": 3190 + }, + { + "epoch": 0.5505046148537911, + "grad_norm": 0.6796875, + "learning_rate": 1.6562621455569495e-05, + "loss": 1.4807, + "step": 3191 + }, + { + "epoch": 0.5506771327525231, + "grad_norm": 0.6171875, + "learning_rate": 1.6560567465118407e-05, + "loss": 1.5037, + "step": 3192 + }, + { + "epoch": 0.550849650651255, + "grad_norm": 0.66015625, + "learning_rate": 1.655851298861949e-05, + "loss": 1.606, + "step": 3193 + }, + { + "epoch": 0.5510221685499871, + "grad_norm": 0.58984375, + "learning_rate": 1.6556458026224948e-05, + "loss": 1.4176, + "step": 3194 + }, + { + "epoch": 0.551194686448719, + "grad_norm": 0.5625, + "learning_rate": 1.655440257808703e-05, + "loss": 1.3671, + "step": 3195 + }, + { + "epoch": 0.551367204347451, + "grad_norm": 0.62890625, + "learning_rate": 1.6552346644358014e-05, + "loss": 1.4281, + "step": 3196 + }, + { + "epoch": 0.5515397222461831, + "grad_norm": 0.62890625, + "learning_rate": 1.6550290225190217e-05, + "loss": 1.3226, + "step": 3197 + }, + { + "epoch": 0.551712240144915, + "grad_norm": 0.59765625, + "learning_rate": 1.6548233320735997e-05, + "loss": 1.2947, + "step": 3198 + }, + { + "epoch": 0.551884758043647, + "grad_norm": 0.67578125, + "learning_rate": 1.6546175931147733e-05, + "loss": 1.4838, + "step": 3199 + }, + { + "epoch": 0.552057275942379, + "grad_norm": 0.6328125, + "learning_rate": 1.6544118056577856e-05, + "loss": 1.3835, + "step": 3200 + }, + { + "epoch": 0.552057275942379, + "eval_loss": 1.4378929138183594, + "eval_runtime": 11.3818, + "eval_samples_per_second": 89.968, + "eval_steps_per_second": 22.492, + "step": 3200 + }, + { + "epoch": 0.552229793841111, + "grad_norm": 0.65625, + "learning_rate": 1.6542059697178822e-05, + "loss": 1.5091, + "step": 3201 + }, + { + "epoch": 0.552402311739843, + "grad_norm": 0.65234375, + "learning_rate": 1.6540000853103132e-05, + "loss": 1.4104, + "step": 3202 + }, + { + "epoch": 0.552574829638575, + "grad_norm": 0.60546875, + "learning_rate": 1.653794152450331e-05, + "loss": 1.3491, + "step": 3203 + }, + { + "epoch": 0.552747347537307, + "grad_norm": 0.6953125, + "learning_rate": 1.653588171153193e-05, + "loss": 1.4691, + "step": 3204 + }, + { + "epoch": 0.552919865436039, + "grad_norm": 0.60546875, + "learning_rate": 1.6533821414341597e-05, + "loss": 1.473, + "step": 3205 + }, + { + "epoch": 0.553092383334771, + "grad_norm": 0.6640625, + "learning_rate": 1.6531760633084948e-05, + "loss": 1.5411, + "step": 3206 + }, + { + "epoch": 0.553264901233503, + "grad_norm": 0.59765625, + "learning_rate": 1.652969936791466e-05, + "loss": 1.4346, + "step": 3207 + }, + { + "epoch": 0.553437419132235, + "grad_norm": 0.62890625, + "learning_rate": 1.6527637618983443e-05, + "loss": 1.4768, + "step": 3208 + }, + { + "epoch": 0.553609937030967, + "grad_norm": 0.77734375, + "learning_rate": 1.6525575386444042e-05, + "loss": 1.4556, + "step": 3209 + }, + { + "epoch": 0.5537824549296989, + "grad_norm": 0.62109375, + "learning_rate": 1.6523512670449246e-05, + "loss": 1.4614, + "step": 3210 + }, + { + "epoch": 0.553954972828431, + "grad_norm": 0.57421875, + "learning_rate": 1.6521449471151867e-05, + "loss": 1.3232, + "step": 3211 + }, + { + "epoch": 0.5541274907271629, + "grad_norm": 0.65234375, + "learning_rate": 1.6519385788704766e-05, + "loss": 1.2933, + "step": 3212 + }, + { + "epoch": 0.5543000086258949, + "grad_norm": 0.6328125, + "learning_rate": 1.6517321623260828e-05, + "loss": 1.4747, + "step": 3213 + }, + { + "epoch": 0.554472526524627, + "grad_norm": 0.60546875, + "learning_rate": 1.6515256974972985e-05, + "loss": 1.4267, + "step": 3214 + }, + { + "epoch": 0.5546450444233589, + "grad_norm": 0.59375, + "learning_rate": 1.6513191843994195e-05, + "loss": 1.4805, + "step": 3215 + }, + { + "epoch": 0.5548175623220909, + "grad_norm": 0.65234375, + "learning_rate": 1.6511126230477458e-05, + "loss": 1.374, + "step": 3216 + }, + { + "epoch": 0.5549900802208229, + "grad_norm": 0.6640625, + "learning_rate": 1.6509060134575807e-05, + "loss": 1.4438, + "step": 3217 + }, + { + "epoch": 0.5551625981195549, + "grad_norm": 0.6875, + "learning_rate": 1.6506993556442307e-05, + "loss": 1.3694, + "step": 3218 + }, + { + "epoch": 0.5553351160182869, + "grad_norm": 0.6015625, + "learning_rate": 1.650492649623007e-05, + "loss": 1.4282, + "step": 3219 + }, + { + "epoch": 0.5555076339170189, + "grad_norm": 0.7265625, + "learning_rate": 1.6502858954092233e-05, + "loss": 1.4742, + "step": 3220 + }, + { + "epoch": 0.5556801518157509, + "grad_norm": 0.6640625, + "learning_rate": 1.6500790930181973e-05, + "loss": 1.4559, + "step": 3221 + }, + { + "epoch": 0.5558526697144829, + "grad_norm": 0.6484375, + "learning_rate": 1.64987224246525e-05, + "loss": 1.474, + "step": 3222 + }, + { + "epoch": 0.5560251876132148, + "grad_norm": 0.58203125, + "learning_rate": 1.6496653437657068e-05, + "loss": 1.5381, + "step": 3223 + }, + { + "epoch": 0.5561977055119469, + "grad_norm": 0.73828125, + "learning_rate": 1.6494583969348953e-05, + "loss": 1.4106, + "step": 3224 + }, + { + "epoch": 0.5563702234106789, + "grad_norm": 0.60546875, + "learning_rate": 1.6492514019881476e-05, + "loss": 1.5662, + "step": 3225 + }, + { + "epoch": 0.5565427413094108, + "grad_norm": 0.671875, + "learning_rate": 1.6490443589407996e-05, + "loss": 1.4847, + "step": 3226 + }, + { + "epoch": 0.5567152592081428, + "grad_norm": 0.625, + "learning_rate": 1.6488372678081902e-05, + "loss": 1.4211, + "step": 3227 + }, + { + "epoch": 0.5568877771068749, + "grad_norm": 0.86328125, + "learning_rate": 1.6486301286056617e-05, + "loss": 1.4936, + "step": 3228 + }, + { + "epoch": 0.5570602950056068, + "grad_norm": 0.640625, + "learning_rate": 1.64842294134856e-05, + "loss": 1.3648, + "step": 3229 + }, + { + "epoch": 0.5572328129043388, + "grad_norm": 0.60546875, + "learning_rate": 1.6482157060522363e-05, + "loss": 1.5151, + "step": 3230 + }, + { + "epoch": 0.5574053308030709, + "grad_norm": 0.57421875, + "learning_rate": 1.6480084227320422e-05, + "loss": 1.5417, + "step": 3231 + }, + { + "epoch": 0.5575778487018028, + "grad_norm": 0.6640625, + "learning_rate": 1.647801091403335e-05, + "loss": 1.4683, + "step": 3232 + }, + { + "epoch": 0.5577503666005348, + "grad_norm": 0.640625, + "learning_rate": 1.647593712081476e-05, + "loss": 1.5107, + "step": 3233 + }, + { + "epoch": 0.5579228844992667, + "grad_norm": 0.7109375, + "learning_rate": 1.647386284781828e-05, + "loss": 1.398, + "step": 3234 + }, + { + "epoch": 0.5580954023979988, + "grad_norm": 0.59375, + "learning_rate": 1.6471788095197587e-05, + "loss": 1.4902, + "step": 3235 + }, + { + "epoch": 0.5582679202967308, + "grad_norm": 0.61328125, + "learning_rate": 1.64697128631064e-05, + "loss": 1.457, + "step": 3236 + }, + { + "epoch": 0.5584404381954627, + "grad_norm": 0.65625, + "learning_rate": 1.6467637151698457e-05, + "loss": 1.6169, + "step": 3237 + }, + { + "epoch": 0.5586129560941948, + "grad_norm": 0.6171875, + "learning_rate": 1.646556096112754e-05, + "loss": 1.5738, + "step": 3238 + }, + { + "epoch": 0.5587854739929268, + "grad_norm": 0.5703125, + "learning_rate": 1.6463484291547472e-05, + "loss": 1.3692, + "step": 3239 + }, + { + "epoch": 0.5589579918916587, + "grad_norm": 0.62109375, + "learning_rate": 1.64614071431121e-05, + "loss": 1.4634, + "step": 3240 + }, + { + "epoch": 0.5591305097903908, + "grad_norm": 0.59765625, + "learning_rate": 1.6459329515975313e-05, + "loss": 1.4855, + "step": 3241 + }, + { + "epoch": 0.5593030276891228, + "grad_norm": 0.58203125, + "learning_rate": 1.645725141029104e-05, + "loss": 1.4872, + "step": 3242 + }, + { + "epoch": 0.5594755455878547, + "grad_norm": 0.671875, + "learning_rate": 1.6455172826213228e-05, + "loss": 1.5038, + "step": 3243 + }, + { + "epoch": 0.5596480634865867, + "grad_norm": 0.75, + "learning_rate": 1.6453093763895885e-05, + "loss": 1.4403, + "step": 3244 + }, + { + "epoch": 0.5598205813853188, + "grad_norm": 0.66796875, + "learning_rate": 1.6451014223493035e-05, + "loss": 1.3938, + "step": 3245 + }, + { + "epoch": 0.5599930992840507, + "grad_norm": 1.5078125, + "learning_rate": 1.6448934205158743e-05, + "loss": 1.4209, + "step": 3246 + }, + { + "epoch": 0.5601656171827827, + "grad_norm": 0.64453125, + "learning_rate": 1.644685370904711e-05, + "loss": 1.4142, + "step": 3247 + }, + { + "epoch": 0.5603381350815148, + "grad_norm": 0.5703125, + "learning_rate": 1.6444772735312272e-05, + "loss": 1.508, + "step": 3248 + }, + { + "epoch": 0.5605106529802467, + "grad_norm": 0.66796875, + "learning_rate": 1.6442691284108403e-05, + "loss": 1.4892, + "step": 3249 + }, + { + "epoch": 0.5606831708789787, + "grad_norm": 0.6953125, + "learning_rate": 1.6440609355589704e-05, + "loss": 1.4895, + "step": 3250 + }, + { + "epoch": 0.5608556887777106, + "grad_norm": 0.80859375, + "learning_rate": 1.6438526949910425e-05, + "loss": 1.4742, + "step": 3251 + }, + { + "epoch": 0.5610282066764427, + "grad_norm": 0.62890625, + "learning_rate": 1.643644406722484e-05, + "loss": 1.5226, + "step": 3252 + }, + { + "epoch": 0.5612007245751747, + "grad_norm": 0.69921875, + "learning_rate": 1.643436070768726e-05, + "loss": 1.4427, + "step": 3253 + }, + { + "epoch": 0.5613732424739066, + "grad_norm": 0.62890625, + "learning_rate": 1.6432276871452036e-05, + "loss": 1.4517, + "step": 3254 + }, + { + "epoch": 0.5615457603726387, + "grad_norm": 0.65625, + "learning_rate": 1.643019255867355e-05, + "loss": 1.4601, + "step": 3255 + }, + { + "epoch": 0.5617182782713707, + "grad_norm": 0.640625, + "learning_rate": 1.642810776950622e-05, + "loss": 1.5211, + "step": 3256 + }, + { + "epoch": 0.5618907961701026, + "grad_norm": 0.6796875, + "learning_rate": 1.6426022504104505e-05, + "loss": 1.5343, + "step": 3257 + }, + { + "epoch": 0.5620633140688347, + "grad_norm": 0.6328125, + "learning_rate": 1.6423936762622887e-05, + "loss": 1.4592, + "step": 3258 + }, + { + "epoch": 0.5622358319675667, + "grad_norm": 0.73828125, + "learning_rate": 1.6421850545215897e-05, + "loss": 1.4003, + "step": 3259 + }, + { + "epoch": 0.5624083498662986, + "grad_norm": 0.86328125, + "learning_rate": 1.6419763852038092e-05, + "loss": 1.4569, + "step": 3260 + }, + { + "epoch": 0.5625808677650306, + "grad_norm": 0.6328125, + "learning_rate": 1.641767668324407e-05, + "loss": 1.4049, + "step": 3261 + }, + { + "epoch": 0.5627533856637627, + "grad_norm": 0.80859375, + "learning_rate": 1.641558903898846e-05, + "loss": 1.4446, + "step": 3262 + }, + { + "epoch": 0.5629259035624946, + "grad_norm": 0.91015625, + "learning_rate": 1.6413500919425927e-05, + "loss": 1.481, + "step": 3263 + }, + { + "epoch": 0.5630984214612266, + "grad_norm": 0.66796875, + "learning_rate": 1.6411412324711174e-05, + "loss": 1.5068, + "step": 3264 + }, + { + "epoch": 0.5632709393599586, + "grad_norm": 0.79296875, + "learning_rate": 1.6409323254998932e-05, + "loss": 1.4457, + "step": 3265 + }, + { + "epoch": 0.5634434572586906, + "grad_norm": 0.7265625, + "learning_rate": 1.640723371044398e-05, + "loss": 1.4314, + "step": 3266 + }, + { + "epoch": 0.5636159751574226, + "grad_norm": 0.6484375, + "learning_rate": 1.640514369120112e-05, + "loss": 1.3798, + "step": 3267 + }, + { + "epoch": 0.5637884930561545, + "grad_norm": 0.79296875, + "learning_rate": 1.640305319742519e-05, + "loss": 1.5154, + "step": 3268 + }, + { + "epoch": 0.5639610109548866, + "grad_norm": 0.69140625, + "learning_rate": 1.6400962229271074e-05, + "loss": 1.3999, + "step": 3269 + }, + { + "epoch": 0.5641335288536186, + "grad_norm": 0.63671875, + "learning_rate": 1.639887078689368e-05, + "loss": 1.473, + "step": 3270 + }, + { + "epoch": 0.5643060467523505, + "grad_norm": 0.734375, + "learning_rate": 1.639677887044796e-05, + "loss": 1.3974, + "step": 3271 + }, + { + "epoch": 0.5644785646510826, + "grad_norm": 0.9453125, + "learning_rate": 1.639468648008889e-05, + "loss": 1.5242, + "step": 3272 + }, + { + "epoch": 0.5646510825498146, + "grad_norm": 0.765625, + "learning_rate": 1.6392593615971487e-05, + "loss": 1.5228, + "step": 3273 + }, + { + "epoch": 0.5648236004485465, + "grad_norm": 0.6171875, + "learning_rate": 1.6390500278250806e-05, + "loss": 1.4572, + "step": 3274 + }, + { + "epoch": 0.5649961183472785, + "grad_norm": 0.62109375, + "learning_rate": 1.6388406467081936e-05, + "loss": 1.5359, + "step": 3275 + }, + { + "epoch": 0.5651686362460105, + "grad_norm": 0.6328125, + "learning_rate": 1.6386312182619997e-05, + "loss": 1.4999, + "step": 3276 + }, + { + "epoch": 0.5653411541447425, + "grad_norm": 0.640625, + "learning_rate": 1.638421742502015e-05, + "loss": 1.4103, + "step": 3277 + }, + { + "epoch": 0.5655136720434745, + "grad_norm": 0.6796875, + "learning_rate": 1.638212219443759e-05, + "loss": 1.3701, + "step": 3278 + }, + { + "epoch": 0.5656861899422065, + "grad_norm": 0.67578125, + "learning_rate": 1.6380026491027537e-05, + "loss": 1.5241, + "step": 3279 + }, + { + "epoch": 0.5658587078409385, + "grad_norm": 0.6484375, + "learning_rate": 1.6377930314945254e-05, + "loss": 1.45, + "step": 3280 + }, + { + "epoch": 0.5660312257396705, + "grad_norm": 0.83984375, + "learning_rate": 1.6375833666346048e-05, + "loss": 1.4207, + "step": 3281 + }, + { + "epoch": 0.5662037436384025, + "grad_norm": 0.609375, + "learning_rate": 1.637373654538524e-05, + "loss": 1.5209, + "step": 3282 + }, + { + "epoch": 0.5663762615371345, + "grad_norm": 0.71875, + "learning_rate": 1.637163895221821e-05, + "loss": 1.5028, + "step": 3283 + }, + { + "epoch": 0.5665487794358665, + "grad_norm": 0.6484375, + "learning_rate": 1.636954088700035e-05, + "loss": 1.4265, + "step": 3284 + }, + { + "epoch": 0.5667212973345984, + "grad_norm": 0.578125, + "learning_rate": 1.63674423498871e-05, + "loss": 1.4944, + "step": 3285 + }, + { + "epoch": 0.5668938152333305, + "grad_norm": 0.72265625, + "learning_rate": 1.636534334103394e-05, + "loss": 1.4467, + "step": 3286 + }, + { + "epoch": 0.5670663331320625, + "grad_norm": 0.66015625, + "learning_rate": 1.636324386059637e-05, + "loss": 1.4922, + "step": 3287 + }, + { + "epoch": 0.5672388510307944, + "grad_norm": 0.578125, + "learning_rate": 1.636114390872994e-05, + "loss": 1.3776, + "step": 3288 + }, + { + "epoch": 0.5674113689295265, + "grad_norm": 0.82421875, + "learning_rate": 1.6359043485590217e-05, + "loss": 1.5595, + "step": 3289 + }, + { + "epoch": 0.5675838868282584, + "grad_norm": 0.8125, + "learning_rate": 1.635694259133282e-05, + "loss": 1.552, + "step": 3290 + }, + { + "epoch": 0.5677564047269904, + "grad_norm": 0.59765625, + "learning_rate": 1.63548412261134e-05, + "loss": 1.4487, + "step": 3291 + }, + { + "epoch": 0.5679289226257224, + "grad_norm": 0.6328125, + "learning_rate": 1.635273939008763e-05, + "loss": 1.4105, + "step": 3292 + }, + { + "epoch": 0.5681014405244544, + "grad_norm": 0.6875, + "learning_rate": 1.6350637083411238e-05, + "loss": 1.4208, + "step": 3293 + }, + { + "epoch": 0.5682739584231864, + "grad_norm": 0.58203125, + "learning_rate": 1.634853430623997e-05, + "loss": 1.4819, + "step": 3294 + }, + { + "epoch": 0.5684464763219184, + "grad_norm": 0.609375, + "learning_rate": 1.6346431058729607e-05, + "loss": 1.4907, + "step": 3295 + }, + { + "epoch": 0.5686189942206504, + "grad_norm": 0.88671875, + "learning_rate": 1.6344327341035982e-05, + "loss": 1.4593, + "step": 3296 + }, + { + "epoch": 0.5687915121193824, + "grad_norm": 0.59765625, + "learning_rate": 1.6342223153314946e-05, + "loss": 1.4974, + "step": 3297 + }, + { + "epoch": 0.5689640300181144, + "grad_norm": 0.62890625, + "learning_rate": 1.634011849572239e-05, + "loss": 1.52, + "step": 3298 + }, + { + "epoch": 0.5691365479168464, + "grad_norm": 0.73828125, + "learning_rate": 1.6338013368414237e-05, + "loss": 1.4899, + "step": 3299 + }, + { + "epoch": 0.5693090658155784, + "grad_norm": 0.6171875, + "learning_rate": 1.6335907771546458e-05, + "loss": 1.4335, + "step": 3300 + }, + { + "epoch": 0.5693090658155784, + "eval_loss": 1.4357802867889404, + "eval_runtime": 10.7789, + "eval_samples_per_second": 95.0, + "eval_steps_per_second": 23.75, + "step": 3300 + }, + { + "epoch": 0.5694815837143103, + "grad_norm": 0.6015625, + "learning_rate": 1.633380170527504e-05, + "loss": 1.4017, + "step": 3301 + }, + { + "epoch": 0.5696541016130423, + "grad_norm": 0.59375, + "learning_rate": 1.6331695169756015e-05, + "loss": 1.4695, + "step": 3302 + }, + { + "epoch": 0.5698266195117744, + "grad_norm": 0.6953125, + "learning_rate": 1.6329588165145452e-05, + "loss": 1.4122, + "step": 3303 + }, + { + "epoch": 0.5699991374105063, + "grad_norm": 0.58984375, + "learning_rate": 1.6327480691599448e-05, + "loss": 1.4928, + "step": 3304 + }, + { + "epoch": 0.5701716553092383, + "grad_norm": 0.60546875, + "learning_rate": 1.632537274927414e-05, + "loss": 1.3496, + "step": 3305 + }, + { + "epoch": 0.5703441732079704, + "grad_norm": 0.62890625, + "learning_rate": 1.632326433832569e-05, + "loss": 1.4946, + "step": 3306 + }, + { + "epoch": 0.5705166911067023, + "grad_norm": 0.6328125, + "learning_rate": 1.6321155458910314e-05, + "loss": 1.4751, + "step": 3307 + }, + { + "epoch": 0.5706892090054343, + "grad_norm": 0.73046875, + "learning_rate": 1.631904611118424e-05, + "loss": 1.5496, + "step": 3308 + }, + { + "epoch": 0.5708617269041663, + "grad_norm": 0.81640625, + "learning_rate": 1.6316936295303754e-05, + "loss": 1.4039, + "step": 3309 + }, + { + "epoch": 0.5710342448028983, + "grad_norm": 0.7578125, + "learning_rate": 1.6314826011425153e-05, + "loss": 1.5043, + "step": 3310 + }, + { + "epoch": 0.5712067627016303, + "grad_norm": 0.7265625, + "learning_rate": 1.6312715259704786e-05, + "loss": 1.5031, + "step": 3311 + }, + { + "epoch": 0.5713792806003622, + "grad_norm": 0.6171875, + "learning_rate": 1.6310604040299026e-05, + "loss": 1.4799, + "step": 3312 + }, + { + "epoch": 0.5715517984990943, + "grad_norm": 0.734375, + "learning_rate": 1.630849235336429e-05, + "loss": 1.4857, + "step": 3313 + }, + { + "epoch": 0.5717243163978263, + "grad_norm": 0.6640625, + "learning_rate": 1.6306380199057024e-05, + "loss": 1.4491, + "step": 3314 + }, + { + "epoch": 0.5718968342965582, + "grad_norm": 0.6015625, + "learning_rate": 1.6304267577533706e-05, + "loss": 1.4579, + "step": 3315 + }, + { + "epoch": 0.5720693521952903, + "grad_norm": 0.6015625, + "learning_rate": 1.630215448895086e-05, + "loss": 1.4994, + "step": 3316 + }, + { + "epoch": 0.5722418700940223, + "grad_norm": 0.72265625, + "learning_rate": 1.6300040933465025e-05, + "loss": 1.4655, + "step": 3317 + }, + { + "epoch": 0.5724143879927542, + "grad_norm": 0.8203125, + "learning_rate": 1.6297926911232796e-05, + "loss": 1.4915, + "step": 3318 + }, + { + "epoch": 0.5725869058914862, + "grad_norm": 0.58984375, + "learning_rate": 1.6295812422410794e-05, + "loss": 1.5393, + "step": 3319 + }, + { + "epoch": 0.5727594237902183, + "grad_norm": 0.69140625, + "learning_rate": 1.6293697467155667e-05, + "loss": 1.5414, + "step": 3320 + }, + { + "epoch": 0.5729319416889502, + "grad_norm": 0.6484375, + "learning_rate": 1.6291582045624107e-05, + "loss": 1.4912, + "step": 3321 + }, + { + "epoch": 0.5731044595876822, + "grad_norm": 0.703125, + "learning_rate": 1.6289466157972835e-05, + "loss": 1.3418, + "step": 3322 + }, + { + "epoch": 0.5732769774864143, + "grad_norm": 1.7109375, + "learning_rate": 1.628734980435861e-05, + "loss": 1.5682, + "step": 3323 + }, + { + "epoch": 0.5734494953851462, + "grad_norm": 0.671875, + "learning_rate": 1.6285232984938234e-05, + "loss": 1.4293, + "step": 3324 + }, + { + "epoch": 0.5736220132838782, + "grad_norm": 0.66015625, + "learning_rate": 1.6283115699868522e-05, + "loss": 1.4405, + "step": 3325 + }, + { + "epoch": 0.5737945311826101, + "grad_norm": 0.57421875, + "learning_rate": 1.6280997949306342e-05, + "loss": 1.3624, + "step": 3326 + }, + { + "epoch": 0.5739670490813422, + "grad_norm": 0.6640625, + "learning_rate": 1.6278879733408587e-05, + "loss": 1.4126, + "step": 3327 + }, + { + "epoch": 0.5741395669800742, + "grad_norm": 0.73046875, + "learning_rate": 1.627676105233219e-05, + "loss": 1.429, + "step": 3328 + }, + { + "epoch": 0.5743120848788061, + "grad_norm": 0.6015625, + "learning_rate": 1.6274641906234113e-05, + "loss": 1.4055, + "step": 3329 + }, + { + "epoch": 0.5744846027775382, + "grad_norm": 0.59375, + "learning_rate": 1.627252229527136e-05, + "loss": 1.4173, + "step": 3330 + }, + { + "epoch": 0.5746571206762702, + "grad_norm": 2.0, + "learning_rate": 1.627040221960096e-05, + "loss": 1.5033, + "step": 3331 + }, + { + "epoch": 0.5748296385750021, + "grad_norm": 0.6015625, + "learning_rate": 1.6268281679379983e-05, + "loss": 1.4267, + "step": 3332 + }, + { + "epoch": 0.5750021564737342, + "grad_norm": 0.65625, + "learning_rate": 1.6266160674765542e-05, + "loss": 1.4609, + "step": 3333 + }, + { + "epoch": 0.5751746743724662, + "grad_norm": 0.7734375, + "learning_rate": 1.626403920591476e-05, + "loss": 1.4585, + "step": 3334 + }, + { + "epoch": 0.5753471922711981, + "grad_norm": 0.734375, + "learning_rate": 1.626191727298481e-05, + "loss": 1.4806, + "step": 3335 + }, + { + "epoch": 0.5755197101699301, + "grad_norm": 0.69921875, + "learning_rate": 1.625979487613291e-05, + "loss": 1.4952, + "step": 3336 + }, + { + "epoch": 0.5756922280686622, + "grad_norm": 0.6875, + "learning_rate": 1.6257672015516287e-05, + "loss": 1.463, + "step": 3337 + }, + { + "epoch": 0.5758647459673941, + "grad_norm": 0.6640625, + "learning_rate": 1.6255548691292223e-05, + "loss": 1.4367, + "step": 3338 + }, + { + "epoch": 0.5760372638661261, + "grad_norm": 0.58984375, + "learning_rate": 1.6253424903618023e-05, + "loss": 1.4492, + "step": 3339 + }, + { + "epoch": 0.5762097817648582, + "grad_norm": 0.58984375, + "learning_rate": 1.6251300652651037e-05, + "loss": 1.4922, + "step": 3340 + }, + { + "epoch": 0.5763822996635901, + "grad_norm": 0.67578125, + "learning_rate": 1.6249175938548635e-05, + "loss": 1.4813, + "step": 3341 + }, + { + "epoch": 0.5765548175623221, + "grad_norm": 0.7890625, + "learning_rate": 1.6247050761468236e-05, + "loss": 1.4617, + "step": 3342 + }, + { + "epoch": 0.576727335461054, + "grad_norm": 0.609375, + "learning_rate": 1.6244925121567283e-05, + "loss": 1.4464, + "step": 3343 + }, + { + "epoch": 0.5768998533597861, + "grad_norm": 0.61328125, + "learning_rate": 1.6242799019003256e-05, + "loss": 1.4271, + "step": 3344 + }, + { + "epoch": 0.5770723712585181, + "grad_norm": 0.6484375, + "learning_rate": 1.6240672453933673e-05, + "loss": 1.4269, + "step": 3345 + }, + { + "epoch": 0.57724488915725, + "grad_norm": 0.6640625, + "learning_rate": 1.623854542651608e-05, + "loss": 1.326, + "step": 3346 + }, + { + "epoch": 0.5774174070559821, + "grad_norm": 0.65625, + "learning_rate": 1.6236417936908058e-05, + "loss": 1.4414, + "step": 3347 + }, + { + "epoch": 0.5775899249547141, + "grad_norm": 0.77734375, + "learning_rate": 1.623428998526723e-05, + "loss": 1.5371, + "step": 3348 + }, + { + "epoch": 0.577762442853446, + "grad_norm": 0.87890625, + "learning_rate": 1.6232161571751248e-05, + "loss": 1.5019, + "step": 3349 + }, + { + "epoch": 0.577934960752178, + "grad_norm": 0.83984375, + "learning_rate": 1.6230032696517794e-05, + "loss": 1.475, + "step": 3350 + }, + { + "epoch": 0.5781074786509101, + "grad_norm": 0.88671875, + "learning_rate": 1.6227903359724595e-05, + "loss": 1.4802, + "step": 3351 + }, + { + "epoch": 0.578279996549642, + "grad_norm": 0.59375, + "learning_rate": 1.62257735615294e-05, + "loss": 1.4384, + "step": 3352 + }, + { + "epoch": 0.578452514448374, + "grad_norm": 0.78515625, + "learning_rate": 1.6223643302089994e-05, + "loss": 1.4694, + "step": 3353 + }, + { + "epoch": 0.578625032347106, + "grad_norm": 0.71484375, + "learning_rate": 1.622151258156421e-05, + "loss": 1.4337, + "step": 3354 + }, + { + "epoch": 0.578797550245838, + "grad_norm": 0.58203125, + "learning_rate": 1.6219381400109898e-05, + "loss": 1.6329, + "step": 3355 + }, + { + "epoch": 0.57897006814457, + "grad_norm": 0.83203125, + "learning_rate": 1.6217249757884954e-05, + "loss": 1.5241, + "step": 3356 + }, + { + "epoch": 0.579142586043302, + "grad_norm": 0.6875, + "learning_rate": 1.62151176550473e-05, + "loss": 1.4375, + "step": 3357 + }, + { + "epoch": 0.579315103942034, + "grad_norm": 0.703125, + "learning_rate": 1.6212985091754893e-05, + "loss": 1.4399, + "step": 3358 + }, + { + "epoch": 0.579487621840766, + "grad_norm": 0.71484375, + "learning_rate": 1.621085206816573e-05, + "loss": 1.5598, + "step": 3359 + }, + { + "epoch": 0.5796601397394979, + "grad_norm": 2.375, + "learning_rate": 1.6208718584437845e-05, + "loss": 1.465, + "step": 3360 + }, + { + "epoch": 0.57983265763823, + "grad_norm": 0.5859375, + "learning_rate": 1.620658464072929e-05, + "loss": 1.4114, + "step": 3361 + }, + { + "epoch": 0.580005175536962, + "grad_norm": 0.62109375, + "learning_rate": 1.620445023719816e-05, + "loss": 1.4664, + "step": 3362 + }, + { + "epoch": 0.5801776934356939, + "grad_norm": 0.84375, + "learning_rate": 1.6202315374002594e-05, + "loss": 1.2958, + "step": 3363 + }, + { + "epoch": 0.580350211334426, + "grad_norm": 0.65234375, + "learning_rate": 1.620018005130075e-05, + "loss": 1.4864, + "step": 3364 + }, + { + "epoch": 0.580522729233158, + "grad_norm": 0.60546875, + "learning_rate": 1.619804426925083e-05, + "loss": 1.5509, + "step": 3365 + }, + { + "epoch": 0.5806952471318899, + "grad_norm": 0.609375, + "learning_rate": 1.619590802801106e-05, + "loss": 1.3632, + "step": 3366 + }, + { + "epoch": 0.5808677650306219, + "grad_norm": 0.671875, + "learning_rate": 1.6193771327739712e-05, + "loss": 1.4981, + "step": 3367 + }, + { + "epoch": 0.581040282929354, + "grad_norm": 0.640625, + "learning_rate": 1.619163416859508e-05, + "loss": 1.3906, + "step": 3368 + }, + { + "epoch": 0.5812128008280859, + "grad_norm": 0.63671875, + "learning_rate": 1.6189496550735503e-05, + "loss": 1.4839, + "step": 3369 + }, + { + "epoch": 0.5813853187268179, + "grad_norm": 0.9296875, + "learning_rate": 1.6187358474319347e-05, + "loss": 1.4431, + "step": 3370 + }, + { + "epoch": 0.5815578366255499, + "grad_norm": 0.6171875, + "learning_rate": 1.6185219939505016e-05, + "loss": 1.4361, + "step": 3371 + }, + { + "epoch": 0.5817303545242819, + "grad_norm": 0.66015625, + "learning_rate": 1.6183080946450945e-05, + "loss": 1.3862, + "step": 3372 + }, + { + "epoch": 0.5819028724230139, + "grad_norm": 0.56640625, + "learning_rate": 1.6180941495315606e-05, + "loss": 1.3674, + "step": 3373 + }, + { + "epoch": 0.5820753903217459, + "grad_norm": 0.66796875, + "learning_rate": 1.6178801586257494e-05, + "loss": 1.4896, + "step": 3374 + }, + { + "epoch": 0.5822479082204779, + "grad_norm": 0.68359375, + "learning_rate": 1.617666121943516e-05, + "loss": 1.3988, + "step": 3375 + }, + { + "epoch": 0.5824204261192099, + "grad_norm": 0.64453125, + "learning_rate": 1.6174520395007168e-05, + "loss": 1.3873, + "step": 3376 + }, + { + "epoch": 0.5825929440179418, + "grad_norm": 0.78515625, + "learning_rate": 1.6172379113132123e-05, + "loss": 1.52, + "step": 3377 + }, + { + "epoch": 0.5827654619166739, + "grad_norm": 0.6171875, + "learning_rate": 1.617023737396867e-05, + "loss": 1.46, + "step": 3378 + }, + { + "epoch": 0.5829379798154058, + "grad_norm": 0.6953125, + "learning_rate": 1.6168095177675476e-05, + "loss": 1.5046, + "step": 3379 + }, + { + "epoch": 0.5831104977141378, + "grad_norm": 0.625, + "learning_rate": 1.6165952524411253e-05, + "loss": 1.4513, + "step": 3380 + }, + { + "epoch": 0.5832830156128699, + "grad_norm": 0.65625, + "learning_rate": 1.616380941433474e-05, + "loss": 1.547, + "step": 3381 + }, + { + "epoch": 0.5834555335116018, + "grad_norm": 0.66796875, + "learning_rate": 1.616166584760471e-05, + "loss": 1.5419, + "step": 3382 + }, + { + "epoch": 0.5836280514103338, + "grad_norm": 0.77734375, + "learning_rate": 1.6159521824379977e-05, + "loss": 1.4538, + "step": 3383 + }, + { + "epoch": 0.5838005693090658, + "grad_norm": 0.6640625, + "learning_rate": 1.6157377344819377e-05, + "loss": 1.5394, + "step": 3384 + }, + { + "epoch": 0.5839730872077978, + "grad_norm": 0.65625, + "learning_rate": 1.6155232409081794e-05, + "loss": 1.4276, + "step": 3385 + }, + { + "epoch": 0.5841456051065298, + "grad_norm": 0.58203125, + "learning_rate": 1.615308701732613e-05, + "loss": 1.5032, + "step": 3386 + }, + { + "epoch": 0.5843181230052618, + "grad_norm": 0.6875, + "learning_rate": 1.615094116971134e-05, + "loss": 1.4876, + "step": 3387 + }, + { + "epoch": 0.5844906409039938, + "grad_norm": 0.7421875, + "learning_rate": 1.6148794866396392e-05, + "loss": 1.5329, + "step": 3388 + }, + { + "epoch": 0.5846631588027258, + "grad_norm": 0.74609375, + "learning_rate": 1.6146648107540303e-05, + "loss": 1.4204, + "step": 3389 + }, + { + "epoch": 0.5848356767014578, + "grad_norm": 0.578125, + "learning_rate": 1.614450089330211e-05, + "loss": 1.4151, + "step": 3390 + }, + { + "epoch": 0.5850081946001898, + "grad_norm": 0.69140625, + "learning_rate": 1.6142353223840902e-05, + "loss": 1.467, + "step": 3391 + }, + { + "epoch": 0.5851807124989218, + "grad_norm": 0.73828125, + "learning_rate": 1.6140205099315787e-05, + "loss": 1.4764, + "step": 3392 + }, + { + "epoch": 0.5853532303976537, + "grad_norm": 0.7421875, + "learning_rate": 1.6138056519885916e-05, + "loss": 1.4966, + "step": 3393 + }, + { + "epoch": 0.5855257482963857, + "grad_norm": 0.59765625, + "learning_rate": 1.6135907485710462e-05, + "loss": 1.4324, + "step": 3394 + }, + { + "epoch": 0.5856982661951178, + "grad_norm": 0.84765625, + "learning_rate": 1.6133757996948645e-05, + "loss": 1.4643, + "step": 3395 + }, + { + "epoch": 0.5858707840938497, + "grad_norm": 0.59765625, + "learning_rate": 1.613160805375971e-05, + "loss": 1.3835, + "step": 3396 + }, + { + "epoch": 0.5860433019925817, + "grad_norm": 0.6484375, + "learning_rate": 1.6129457656302935e-05, + "loss": 1.4762, + "step": 3397 + }, + { + "epoch": 0.5862158198913138, + "grad_norm": 0.6015625, + "learning_rate": 1.612730680473764e-05, + "loss": 1.4536, + "step": 3398 + }, + { + "epoch": 0.5863883377900457, + "grad_norm": 0.60546875, + "learning_rate": 1.6125155499223174e-05, + "loss": 1.5047, + "step": 3399 + }, + { + "epoch": 0.5865608556887777, + "grad_norm": 0.703125, + "learning_rate": 1.6123003739918915e-05, + "loss": 1.5773, + "step": 3400 + }, + { + "epoch": 0.5865608556887777, + "eval_loss": 1.433887004852295, + "eval_runtime": 10.8057, + "eval_samples_per_second": 94.765, + "eval_steps_per_second": 23.691, + "step": 3400 + }, + { + "epoch": 0.5867333735875097, + "grad_norm": 0.6015625, + "learning_rate": 1.6120851526984282e-05, + "loss": 1.5089, + "step": 3401 + }, + { + "epoch": 0.5869058914862417, + "grad_norm": 0.55859375, + "learning_rate": 1.611869886057872e-05, + "loss": 1.3859, + "step": 3402 + }, + { + "epoch": 0.5870784093849737, + "grad_norm": 0.91015625, + "learning_rate": 1.6116545740861718e-05, + "loss": 1.4056, + "step": 3403 + }, + { + "epoch": 0.5872509272837056, + "grad_norm": 0.625, + "learning_rate": 1.6114392167992784e-05, + "loss": 1.5003, + "step": 3404 + }, + { + "epoch": 0.5874234451824377, + "grad_norm": 0.6640625, + "learning_rate": 1.611223814213148e-05, + "loss": 1.5606, + "step": 3405 + }, + { + "epoch": 0.5875959630811697, + "grad_norm": 0.95703125, + "learning_rate": 1.611008366343738e-05, + "loss": 1.4371, + "step": 3406 + }, + { + "epoch": 0.5877684809799016, + "grad_norm": 0.59765625, + "learning_rate": 1.6107928732070107e-05, + "loss": 1.4536, + "step": 3407 + }, + { + "epoch": 0.5879409988786336, + "grad_norm": 0.7421875, + "learning_rate": 1.6105773348189312e-05, + "loss": 1.3856, + "step": 3408 + }, + { + "epoch": 0.5881135167773657, + "grad_norm": 0.83984375, + "learning_rate": 1.610361751195467e-05, + "loss": 1.4591, + "step": 3409 + }, + { + "epoch": 0.5882860346760976, + "grad_norm": 0.6484375, + "learning_rate": 1.6101461223525908e-05, + "loss": 1.4714, + "step": 3410 + }, + { + "epoch": 0.5884585525748296, + "grad_norm": 0.64453125, + "learning_rate": 1.6099304483062776e-05, + "loss": 1.5012, + "step": 3411 + }, + { + "epoch": 0.5886310704735617, + "grad_norm": 0.703125, + "learning_rate": 1.609714729072506e-05, + "loss": 1.4031, + "step": 3412 + }, + { + "epoch": 0.5888035883722936, + "grad_norm": 0.73828125, + "learning_rate": 1.6094989646672573e-05, + "loss": 1.3807, + "step": 3413 + }, + { + "epoch": 0.5889761062710256, + "grad_norm": 0.75390625, + "learning_rate": 1.609283155106517e-05, + "loss": 1.476, + "step": 3414 + }, + { + "epoch": 0.5891486241697577, + "grad_norm": 0.62109375, + "learning_rate": 1.6090673004062734e-05, + "loss": 1.4646, + "step": 3415 + }, + { + "epoch": 0.5893211420684896, + "grad_norm": 0.60546875, + "learning_rate": 1.608851400582519e-05, + "loss": 1.4054, + "step": 3416 + }, + { + "epoch": 0.5894936599672216, + "grad_norm": 0.6328125, + "learning_rate": 1.608635455651248e-05, + "loss": 1.4973, + "step": 3417 + }, + { + "epoch": 0.5896661778659535, + "grad_norm": 0.60546875, + "learning_rate": 1.6084194656284598e-05, + "loss": 1.3328, + "step": 3418 + }, + { + "epoch": 0.5898386957646856, + "grad_norm": 0.734375, + "learning_rate": 1.6082034305301563e-05, + "loss": 1.5572, + "step": 3419 + }, + { + "epoch": 0.5900112136634176, + "grad_norm": 0.63671875, + "learning_rate": 1.607987350372342e-05, + "loss": 1.4634, + "step": 3420 + }, + { + "epoch": 0.5901837315621495, + "grad_norm": 0.7734375, + "learning_rate": 1.607771225171026e-05, + "loss": 1.4979, + "step": 3421 + }, + { + "epoch": 0.5903562494608816, + "grad_norm": 0.70703125, + "learning_rate": 1.6075550549422204e-05, + "loss": 1.3774, + "step": 3422 + }, + { + "epoch": 0.5905287673596136, + "grad_norm": 0.6875, + "learning_rate": 1.6073388397019397e-05, + "loss": 1.4982, + "step": 3423 + }, + { + "epoch": 0.5907012852583455, + "grad_norm": 0.7890625, + "learning_rate": 1.6071225794662033e-05, + "loss": 1.6353, + "step": 3424 + }, + { + "epoch": 0.5908738031570775, + "grad_norm": 0.79296875, + "learning_rate": 1.6069062742510326e-05, + "loss": 1.4391, + "step": 3425 + }, + { + "epoch": 0.5910463210558096, + "grad_norm": 0.6875, + "learning_rate": 1.606689924072453e-05, + "loss": 1.3626, + "step": 3426 + }, + { + "epoch": 0.5912188389545415, + "grad_norm": 0.84765625, + "learning_rate": 1.606473528946493e-05, + "loss": 1.4261, + "step": 3427 + }, + { + "epoch": 0.5913913568532735, + "grad_norm": 0.78125, + "learning_rate": 1.6062570888891847e-05, + "loss": 1.4534, + "step": 3428 + }, + { + "epoch": 0.5915638747520056, + "grad_norm": 0.65625, + "learning_rate": 1.6060406039165627e-05, + "loss": 1.5243, + "step": 3429 + }, + { + "epoch": 0.5917363926507375, + "grad_norm": 0.69921875, + "learning_rate": 1.6058240740446666e-05, + "loss": 1.458, + "step": 3430 + }, + { + "epoch": 0.5919089105494695, + "grad_norm": 0.72265625, + "learning_rate": 1.6056074992895378e-05, + "loss": 1.4456, + "step": 3431 + }, + { + "epoch": 0.5920814284482016, + "grad_norm": 0.671875, + "learning_rate": 1.605390879667221e-05, + "loss": 1.4622, + "step": 3432 + }, + { + "epoch": 0.5922539463469335, + "grad_norm": 0.75390625, + "learning_rate": 1.6051742151937655e-05, + "loss": 1.4636, + "step": 3433 + }, + { + "epoch": 0.5924264642456655, + "grad_norm": 0.6328125, + "learning_rate": 1.6049575058852223e-05, + "loss": 1.3617, + "step": 3434 + }, + { + "epoch": 0.5925989821443974, + "grad_norm": 0.62890625, + "learning_rate": 1.6047407517576478e-05, + "loss": 1.5374, + "step": 3435 + }, + { + "epoch": 0.5927715000431295, + "grad_norm": 0.56640625, + "learning_rate": 1.6045239528270993e-05, + "loss": 1.3766, + "step": 3436 + }, + { + "epoch": 0.5929440179418615, + "grad_norm": 0.703125, + "learning_rate": 1.6043071091096397e-05, + "loss": 1.3774, + "step": 3437 + }, + { + "epoch": 0.5931165358405934, + "grad_norm": 0.7421875, + "learning_rate": 1.6040902206213332e-05, + "loss": 1.5118, + "step": 3438 + }, + { + "epoch": 0.5932890537393255, + "grad_norm": 0.73046875, + "learning_rate": 1.6038732873782485e-05, + "loss": 1.4876, + "step": 3439 + }, + { + "epoch": 0.5934615716380575, + "grad_norm": 0.58984375, + "learning_rate": 1.6036563093964577e-05, + "loss": 1.4963, + "step": 3440 + }, + { + "epoch": 0.5936340895367894, + "grad_norm": 0.68359375, + "learning_rate": 1.6034392866920354e-05, + "loss": 1.5198, + "step": 3441 + }, + { + "epoch": 0.5938066074355214, + "grad_norm": 0.6484375, + "learning_rate": 1.6032222192810607e-05, + "loss": 1.3917, + "step": 3442 + }, + { + "epoch": 0.5939791253342535, + "grad_norm": 0.6015625, + "learning_rate": 1.6030051071796146e-05, + "loss": 1.4103, + "step": 3443 + }, + { + "epoch": 0.5941516432329854, + "grad_norm": 0.69140625, + "learning_rate": 1.6027879504037826e-05, + "loss": 1.4817, + "step": 3444 + }, + { + "epoch": 0.5943241611317174, + "grad_norm": 0.59765625, + "learning_rate": 1.6025707489696527e-05, + "loss": 1.5543, + "step": 3445 + }, + { + "epoch": 0.5944966790304494, + "grad_norm": 0.72265625, + "learning_rate": 1.6023535028933167e-05, + "loss": 1.5289, + "step": 3446 + }, + { + "epoch": 0.5946691969291814, + "grad_norm": 0.73046875, + "learning_rate": 1.6021362121908697e-05, + "loss": 1.5147, + "step": 3447 + }, + { + "epoch": 0.5948417148279134, + "grad_norm": 0.6796875, + "learning_rate": 1.60191887687841e-05, + "loss": 1.5132, + "step": 3448 + }, + { + "epoch": 0.5950142327266454, + "grad_norm": 0.60546875, + "learning_rate": 1.6017014969720382e-05, + "loss": 1.5291, + "step": 3449 + }, + { + "epoch": 0.5951867506253774, + "grad_norm": 0.73046875, + "learning_rate": 1.6014840724878603e-05, + "loss": 1.3743, + "step": 3450 + }, + { + "epoch": 0.5953592685241094, + "grad_norm": 0.9765625, + "learning_rate": 1.6012666034419838e-05, + "loss": 1.3932, + "step": 3451 + }, + { + "epoch": 0.5955317864228413, + "grad_norm": 0.578125, + "learning_rate": 1.601049089850521e-05, + "loss": 1.4492, + "step": 3452 + }, + { + "epoch": 0.5957043043215734, + "grad_norm": 0.78515625, + "learning_rate": 1.6008315317295857e-05, + "loss": 1.4471, + "step": 3453 + }, + { + "epoch": 0.5958768222203054, + "grad_norm": 0.8359375, + "learning_rate": 1.6006139290952964e-05, + "loss": 1.453, + "step": 3454 + }, + { + "epoch": 0.5960493401190373, + "grad_norm": 0.6796875, + "learning_rate": 1.6003962819637744e-05, + "loss": 1.5166, + "step": 3455 + }, + { + "epoch": 0.5962218580177694, + "grad_norm": 0.65234375, + "learning_rate": 1.6001785903511446e-05, + "loss": 1.5024, + "step": 3456 + }, + { + "epoch": 0.5963943759165014, + "grad_norm": 0.6953125, + "learning_rate": 1.5999608542735345e-05, + "loss": 1.4908, + "step": 3457 + }, + { + "epoch": 0.5965668938152333, + "grad_norm": 0.65625, + "learning_rate": 1.599743073747076e-05, + "loss": 1.3899, + "step": 3458 + }, + { + "epoch": 0.5967394117139653, + "grad_norm": 0.59765625, + "learning_rate": 1.5995252487879025e-05, + "loss": 1.4486, + "step": 3459 + }, + { + "epoch": 0.5969119296126973, + "grad_norm": 0.7734375, + "learning_rate": 1.599307379412153e-05, + "loss": 1.5181, + "step": 3460 + }, + { + "epoch": 0.5970844475114293, + "grad_norm": 0.62890625, + "learning_rate": 1.599089465635968e-05, + "loss": 1.4457, + "step": 3461 + }, + { + "epoch": 0.5972569654101613, + "grad_norm": 0.59375, + "learning_rate": 1.5988715074754923e-05, + "loss": 1.3981, + "step": 3462 + }, + { + "epoch": 0.5974294833088933, + "grad_norm": 0.6015625, + "learning_rate": 1.5986535049468736e-05, + "loss": 1.4419, + "step": 3463 + }, + { + "epoch": 0.5976020012076253, + "grad_norm": 0.84375, + "learning_rate": 1.5984354580662623e-05, + "loss": 1.389, + "step": 3464 + }, + { + "epoch": 0.5977745191063573, + "grad_norm": 0.5859375, + "learning_rate": 1.5982173668498135e-05, + "loss": 1.4545, + "step": 3465 + }, + { + "epoch": 0.5979470370050893, + "grad_norm": 0.66796875, + "learning_rate": 1.5979992313136836e-05, + "loss": 1.5201, + "step": 3466 + }, + { + "epoch": 0.5981195549038213, + "grad_norm": 0.609375, + "learning_rate": 1.597781051474035e-05, + "loss": 1.4027, + "step": 3467 + }, + { + "epoch": 0.5982920728025533, + "grad_norm": 0.6640625, + "learning_rate": 1.5975628273470303e-05, + "loss": 1.439, + "step": 3468 + }, + { + "epoch": 0.5984645907012852, + "grad_norm": 0.71484375, + "learning_rate": 1.597344558948838e-05, + "loss": 1.5406, + "step": 3469 + }, + { + "epoch": 0.5986371086000173, + "grad_norm": 0.7265625, + "learning_rate": 1.5971262462956283e-05, + "loss": 1.425, + "step": 3470 + }, + { + "epoch": 0.5988096264987492, + "grad_norm": 0.62890625, + "learning_rate": 1.5969078894035753e-05, + "loss": 1.5376, + "step": 3471 + }, + { + "epoch": 0.5989821443974812, + "grad_norm": 0.71875, + "learning_rate": 1.596689488288856e-05, + "loss": 1.3337, + "step": 3472 + }, + { + "epoch": 0.5991546622962133, + "grad_norm": 0.76953125, + "learning_rate": 1.5964710429676514e-05, + "loss": 1.4247, + "step": 3473 + }, + { + "epoch": 0.5993271801949452, + "grad_norm": 0.77734375, + "learning_rate": 1.596252553456145e-05, + "loss": 1.4202, + "step": 3474 + }, + { + "epoch": 0.5994996980936772, + "grad_norm": 0.6640625, + "learning_rate": 1.5960340197705235e-05, + "loss": 1.4148, + "step": 3475 + }, + { + "epoch": 0.5996722159924092, + "grad_norm": 0.6484375, + "learning_rate": 1.5958154419269782e-05, + "loss": 1.4566, + "step": 3476 + }, + { + "epoch": 0.5998447338911412, + "grad_norm": 0.59375, + "learning_rate": 1.5955968199417018e-05, + "loss": 1.4067, + "step": 3477 + }, + { + "epoch": 0.6000172517898732, + "grad_norm": 0.62109375, + "learning_rate": 1.5953781538308914e-05, + "loss": 1.4475, + "step": 3478 + }, + { + "epoch": 0.6001897696886052, + "grad_norm": 0.57421875, + "learning_rate": 1.5951594436107475e-05, + "loss": 1.4135, + "step": 3479 + }, + { + "epoch": 0.6003622875873372, + "grad_norm": 0.640625, + "learning_rate": 1.5949406892974733e-05, + "loss": 1.459, + "step": 3480 + }, + { + "epoch": 0.6005348054860692, + "grad_norm": 0.81640625, + "learning_rate": 1.5947218909072755e-05, + "loss": 1.4852, + "step": 3481 + }, + { + "epoch": 0.6007073233848012, + "grad_norm": 0.63671875, + "learning_rate": 1.594503048456364e-05, + "loss": 1.3822, + "step": 3482 + }, + { + "epoch": 0.6008798412835331, + "grad_norm": 0.625, + "learning_rate": 1.5942841619609515e-05, + "loss": 1.4175, + "step": 3483 + }, + { + "epoch": 0.6010523591822652, + "grad_norm": 0.6171875, + "learning_rate": 1.5940652314372558e-05, + "loss": 1.4183, + "step": 3484 + }, + { + "epoch": 0.6012248770809971, + "grad_norm": 0.6640625, + "learning_rate": 1.5938462569014954e-05, + "loss": 1.4889, + "step": 3485 + }, + { + "epoch": 0.6013973949797291, + "grad_norm": 0.5703125, + "learning_rate": 1.5936272383698937e-05, + "loss": 1.473, + "step": 3486 + }, + { + "epoch": 0.6015699128784612, + "grad_norm": 0.67578125, + "learning_rate": 1.5934081758586775e-05, + "loss": 1.45, + "step": 3487 + }, + { + "epoch": 0.6017424307771931, + "grad_norm": 0.6875, + "learning_rate": 1.5931890693840755e-05, + "loss": 1.3931, + "step": 3488 + }, + { + "epoch": 0.6019149486759251, + "grad_norm": 0.6171875, + "learning_rate": 1.592969918962321e-05, + "loss": 1.5415, + "step": 3489 + }, + { + "epoch": 0.6020874665746572, + "grad_norm": 0.64453125, + "learning_rate": 1.5927507246096497e-05, + "loss": 1.4022, + "step": 3490 + }, + { + "epoch": 0.6022599844733891, + "grad_norm": 0.55859375, + "learning_rate": 1.592531486342301e-05, + "loss": 1.4198, + "step": 3491 + }, + { + "epoch": 0.6024325023721211, + "grad_norm": 0.625, + "learning_rate": 1.592312204176518e-05, + "loss": 1.3621, + "step": 3492 + }, + { + "epoch": 0.602605020270853, + "grad_norm": 0.58203125, + "learning_rate": 1.5920928781285456e-05, + "loss": 1.414, + "step": 3493 + }, + { + "epoch": 0.6027775381695851, + "grad_norm": 0.61328125, + "learning_rate": 1.591873508214633e-05, + "loss": 1.4441, + "step": 3494 + }, + { + "epoch": 0.6029500560683171, + "grad_norm": 0.6640625, + "learning_rate": 1.591654094451033e-05, + "loss": 1.4143, + "step": 3495 + }, + { + "epoch": 0.603122573967049, + "grad_norm": 0.63671875, + "learning_rate": 1.5914346368540007e-05, + "loss": 1.5217, + "step": 3496 + }, + { + "epoch": 0.6032950918657811, + "grad_norm": 0.66796875, + "learning_rate": 1.591215135439795e-05, + "loss": 1.4534, + "step": 3497 + }, + { + "epoch": 0.6034676097645131, + "grad_norm": 0.69140625, + "learning_rate": 1.5909955902246782e-05, + "loss": 1.4631, + "step": 3498 + }, + { + "epoch": 0.603640127663245, + "grad_norm": 0.73828125, + "learning_rate": 1.5907760012249148e-05, + "loss": 1.4084, + "step": 3499 + }, + { + "epoch": 0.603812645561977, + "grad_norm": 0.67578125, + "learning_rate": 1.590556368456775e-05, + "loss": 1.4021, + "step": 3500 + }, + { + "epoch": 0.603812645561977, + "eval_loss": 1.4319883584976196, + "eval_runtime": 10.882, + "eval_samples_per_second": 94.101, + "eval_steps_per_second": 23.525, + "step": 3500 + }, + { + "epoch": 0.6039851634607091, + "grad_norm": 0.64453125, + "learning_rate": 1.5903366919365283e-05, + "loss": 1.4274, + "step": 3501 + }, + { + "epoch": 0.604157681359441, + "grad_norm": 0.65625, + "learning_rate": 1.5901169716804516e-05, + "loss": 1.6006, + "step": 3502 + }, + { + "epoch": 0.604330199258173, + "grad_norm": 0.69140625, + "learning_rate": 1.589897207704822e-05, + "loss": 1.4748, + "step": 3503 + }, + { + "epoch": 0.6045027171569051, + "grad_norm": 0.64453125, + "learning_rate": 1.5896774000259218e-05, + "loss": 1.4845, + "step": 3504 + }, + { + "epoch": 0.604675235055637, + "grad_norm": 0.73828125, + "learning_rate": 1.5894575486600354e-05, + "loss": 1.5827, + "step": 3505 + }, + { + "epoch": 0.604847752954369, + "grad_norm": 0.7265625, + "learning_rate": 1.58923765362345e-05, + "loss": 1.4438, + "step": 3506 + }, + { + "epoch": 0.6050202708531011, + "grad_norm": 0.56640625, + "learning_rate": 1.5890177149324583e-05, + "loss": 1.363, + "step": 3507 + }, + { + "epoch": 0.605192788751833, + "grad_norm": 0.6171875, + "learning_rate": 1.5887977326033533e-05, + "loss": 1.4079, + "step": 3508 + }, + { + "epoch": 0.605365306650565, + "grad_norm": 0.66015625, + "learning_rate": 1.5885777066524335e-05, + "loss": 1.4491, + "step": 3509 + }, + { + "epoch": 0.6055378245492969, + "grad_norm": 0.80859375, + "learning_rate": 1.588357637096e-05, + "loss": 1.4404, + "step": 3510 + }, + { + "epoch": 0.605710342448029, + "grad_norm": 0.63671875, + "learning_rate": 1.5881375239503558e-05, + "loss": 1.4348, + "step": 3511 + }, + { + "epoch": 0.605882860346761, + "grad_norm": 0.609375, + "learning_rate": 1.5879173672318095e-05, + "loss": 1.3538, + "step": 3512 + }, + { + "epoch": 0.6060553782454929, + "grad_norm": 0.94140625, + "learning_rate": 1.587697166956671e-05, + "loss": 1.4551, + "step": 3513 + }, + { + "epoch": 0.606227896144225, + "grad_norm": 0.58203125, + "learning_rate": 1.587476923141254e-05, + "loss": 1.3779, + "step": 3514 + }, + { + "epoch": 0.606400414042957, + "grad_norm": 0.68359375, + "learning_rate": 1.587256635801876e-05, + "loss": 1.4017, + "step": 3515 + }, + { + "epoch": 0.6065729319416889, + "grad_norm": 0.6328125, + "learning_rate": 1.5870363049548573e-05, + "loss": 1.3796, + "step": 3516 + }, + { + "epoch": 0.6067454498404209, + "grad_norm": 0.67578125, + "learning_rate": 1.5868159306165208e-05, + "loss": 1.4247, + "step": 3517 + }, + { + "epoch": 0.606917967739153, + "grad_norm": 0.6328125, + "learning_rate": 1.5865955128031937e-05, + "loss": 1.4333, + "step": 3518 + }, + { + "epoch": 0.6070904856378849, + "grad_norm": 0.74609375, + "learning_rate": 1.586375051531206e-05, + "loss": 1.4746, + "step": 3519 + }, + { + "epoch": 0.6072630035366169, + "grad_norm": 0.6640625, + "learning_rate": 1.5861545468168907e-05, + "loss": 1.4033, + "step": 3520 + }, + { + "epoch": 0.607435521435349, + "grad_norm": 0.64453125, + "learning_rate": 1.5859339986765837e-05, + "loss": 1.4576, + "step": 3521 + }, + { + "epoch": 0.6076080393340809, + "grad_norm": 0.7265625, + "learning_rate": 1.5857134071266253e-05, + "loss": 1.4398, + "step": 3522 + }, + { + "epoch": 0.6077805572328129, + "grad_norm": 0.69140625, + "learning_rate": 1.5854927721833577e-05, + "loss": 1.4282, + "step": 3523 + }, + { + "epoch": 0.607953075131545, + "grad_norm": 0.828125, + "learning_rate": 1.5852720938631273e-05, + "loss": 1.3739, + "step": 3524 + }, + { + "epoch": 0.6081255930302769, + "grad_norm": 0.609375, + "learning_rate": 1.5850513721822832e-05, + "loss": 1.4447, + "step": 3525 + }, + { + "epoch": 0.6082981109290089, + "grad_norm": 0.6796875, + "learning_rate": 1.5848306071571782e-05, + "loss": 1.5001, + "step": 3526 + }, + { + "epoch": 0.6084706288277408, + "grad_norm": 0.62109375, + "learning_rate": 1.5846097988041672e-05, + "loss": 1.4358, + "step": 3527 + }, + { + "epoch": 0.6086431467264729, + "grad_norm": 0.63671875, + "learning_rate": 1.58438894713961e-05, + "loss": 1.3999, + "step": 3528 + }, + { + "epoch": 0.6088156646252049, + "grad_norm": 0.9140625, + "learning_rate": 1.584168052179868e-05, + "loss": 1.5327, + "step": 3529 + }, + { + "epoch": 0.6089881825239368, + "grad_norm": 0.703125, + "learning_rate": 1.5839471139413065e-05, + "loss": 1.4658, + "step": 3530 + }, + { + "epoch": 0.6091607004226689, + "grad_norm": 0.6171875, + "learning_rate": 1.583726132440294e-05, + "loss": 1.5247, + "step": 3531 + }, + { + "epoch": 0.6093332183214009, + "grad_norm": 0.71875, + "learning_rate": 1.5835051076932028e-05, + "loss": 1.528, + "step": 3532 + }, + { + "epoch": 0.6095057362201328, + "grad_norm": 0.62890625, + "learning_rate": 1.5832840397164067e-05, + "loss": 1.4172, + "step": 3533 + }, + { + "epoch": 0.6096782541188648, + "grad_norm": 0.63671875, + "learning_rate": 1.5830629285262848e-05, + "loss": 1.5032, + "step": 3534 + }, + { + "epoch": 0.6098507720175969, + "grad_norm": 0.77734375, + "learning_rate": 1.582841774139218e-05, + "loss": 1.4678, + "step": 3535 + }, + { + "epoch": 0.6100232899163288, + "grad_norm": 0.64453125, + "learning_rate": 1.5826205765715905e-05, + "loss": 1.4701, + "step": 3536 + }, + { + "epoch": 0.6101958078150608, + "grad_norm": 0.62890625, + "learning_rate": 1.5823993358397906e-05, + "loss": 1.4376, + "step": 3537 + }, + { + "epoch": 0.6103683257137928, + "grad_norm": 0.7109375, + "learning_rate": 1.582178051960209e-05, + "loss": 1.4783, + "step": 3538 + }, + { + "epoch": 0.6105408436125248, + "grad_norm": 0.6953125, + "learning_rate": 1.5819567249492392e-05, + "loss": 1.4107, + "step": 3539 + }, + { + "epoch": 0.6107133615112568, + "grad_norm": 0.58984375, + "learning_rate": 1.5817353548232795e-05, + "loss": 1.3483, + "step": 3540 + }, + { + "epoch": 0.6108858794099888, + "grad_norm": 0.6875, + "learning_rate": 1.5815139415987296e-05, + "loss": 1.3945, + "step": 3541 + }, + { + "epoch": 0.6110583973087208, + "grad_norm": 0.69140625, + "learning_rate": 1.5812924852919936e-05, + "loss": 1.3612, + "step": 3542 + }, + { + "epoch": 0.6112309152074528, + "grad_norm": 0.640625, + "learning_rate": 1.581070985919478e-05, + "loss": 1.4229, + "step": 3543 + }, + { + "epoch": 0.6114034331061847, + "grad_norm": 0.63671875, + "learning_rate": 1.580849443497593e-05, + "loss": 1.4903, + "step": 3544 + }, + { + "epoch": 0.6115759510049168, + "grad_norm": 0.73828125, + "learning_rate": 1.5806278580427523e-05, + "loss": 1.4872, + "step": 3545 + }, + { + "epoch": 0.6117484689036488, + "grad_norm": 0.6640625, + "learning_rate": 1.580406229571372e-05, + "loss": 1.4751, + "step": 3546 + }, + { + "epoch": 0.6119209868023807, + "grad_norm": 0.6796875, + "learning_rate": 1.5801845580998708e-05, + "loss": 1.3846, + "step": 3547 + }, + { + "epoch": 0.6120935047011128, + "grad_norm": 0.83984375, + "learning_rate": 1.5799628436446733e-05, + "loss": 1.5369, + "step": 3548 + }, + { + "epoch": 0.6122660225998448, + "grad_norm": 0.62890625, + "learning_rate": 1.5797410862222043e-05, + "loss": 1.4508, + "step": 3549 + }, + { + "epoch": 0.6124385404985767, + "grad_norm": 0.63671875, + "learning_rate": 1.579519285848893e-05, + "loss": 1.3334, + "step": 3550 + }, + { + "epoch": 0.6126110583973087, + "grad_norm": 0.73828125, + "learning_rate": 1.579297442541172e-05, + "loss": 1.4288, + "step": 3551 + }, + { + "epoch": 0.6127835762960407, + "grad_norm": 0.6796875, + "learning_rate": 1.5790755563154773e-05, + "loss": 1.4861, + "step": 3552 + }, + { + "epoch": 0.6129560941947727, + "grad_norm": 0.578125, + "learning_rate": 1.578853627188247e-05, + "loss": 1.5465, + "step": 3553 + }, + { + "epoch": 0.6131286120935047, + "grad_norm": 0.6328125, + "learning_rate": 1.5786316551759232e-05, + "loss": 1.3731, + "step": 3554 + }, + { + "epoch": 0.6133011299922367, + "grad_norm": 0.9296875, + "learning_rate": 1.5784096402949507e-05, + "loss": 1.4603, + "step": 3555 + }, + { + "epoch": 0.6134736478909687, + "grad_norm": 0.640625, + "learning_rate": 1.578187582561778e-05, + "loss": 1.3989, + "step": 3556 + }, + { + "epoch": 0.6136461657897007, + "grad_norm": 0.6015625, + "learning_rate": 1.5779654819928565e-05, + "loss": 1.5056, + "step": 3557 + }, + { + "epoch": 0.6138186836884326, + "grad_norm": 0.69140625, + "learning_rate": 1.577743338604641e-05, + "loss": 1.5337, + "step": 3558 + }, + { + "epoch": 0.6139912015871647, + "grad_norm": 0.62109375, + "learning_rate": 1.5775211524135888e-05, + "loss": 1.3694, + "step": 3559 + }, + { + "epoch": 0.6141637194858967, + "grad_norm": 0.58984375, + "learning_rate": 1.5772989234361612e-05, + "loss": 1.4417, + "step": 3560 + }, + { + "epoch": 0.6143362373846286, + "grad_norm": 0.609375, + "learning_rate": 1.5770766516888224e-05, + "loss": 1.5343, + "step": 3561 + }, + { + "epoch": 0.6145087552833607, + "grad_norm": 0.65625, + "learning_rate": 1.5768543371880394e-05, + "loss": 1.4724, + "step": 3562 + }, + { + "epoch": 0.6146812731820926, + "grad_norm": 0.5859375, + "learning_rate": 1.5766319799502822e-05, + "loss": 1.4003, + "step": 3563 + }, + { + "epoch": 0.6148537910808246, + "grad_norm": 0.62109375, + "learning_rate": 1.5764095799920256e-05, + "loss": 1.4674, + "step": 3564 + }, + { + "epoch": 0.6150263089795567, + "grad_norm": 0.64453125, + "learning_rate": 1.576187137329746e-05, + "loss": 1.4818, + "step": 3565 + }, + { + "epoch": 0.6151988268782886, + "grad_norm": 0.703125, + "learning_rate": 1.5759646519799227e-05, + "loss": 1.4898, + "step": 3566 + }, + { + "epoch": 0.6153713447770206, + "grad_norm": 0.609375, + "learning_rate": 1.5757421239590388e-05, + "loss": 1.4803, + "step": 3567 + }, + { + "epoch": 0.6155438626757526, + "grad_norm": 0.60546875, + "learning_rate": 1.5755195532835814e-05, + "loss": 1.5081, + "step": 3568 + }, + { + "epoch": 0.6157163805744846, + "grad_norm": 0.7265625, + "learning_rate": 1.5752969399700396e-05, + "loss": 1.4798, + "step": 3569 + }, + { + "epoch": 0.6158888984732166, + "grad_norm": 0.62109375, + "learning_rate": 1.5750742840349054e-05, + "loss": 1.4868, + "step": 3570 + }, + { + "epoch": 0.6160614163719486, + "grad_norm": 0.61328125, + "learning_rate": 1.5748515854946753e-05, + "loss": 1.4558, + "step": 3571 + }, + { + "epoch": 0.6162339342706806, + "grad_norm": 0.59375, + "learning_rate": 1.574628844365848e-05, + "loss": 1.4448, + "step": 3572 + }, + { + "epoch": 0.6164064521694126, + "grad_norm": 0.859375, + "learning_rate": 1.5744060606649253e-05, + "loss": 1.4433, + "step": 3573 + }, + { + "epoch": 0.6165789700681445, + "grad_norm": 0.66796875, + "learning_rate": 1.5741832344084126e-05, + "loss": 1.4628, + "step": 3574 + }, + { + "epoch": 0.6167514879668765, + "grad_norm": 0.93359375, + "learning_rate": 1.5739603656128183e-05, + "loss": 1.4946, + "step": 3575 + }, + { + "epoch": 0.6169240058656086, + "grad_norm": 0.79296875, + "learning_rate": 1.573737454294654e-05, + "loss": 1.5086, + "step": 3576 + }, + { + "epoch": 0.6170965237643405, + "grad_norm": 0.74609375, + "learning_rate": 1.573514500470434e-05, + "loss": 1.4041, + "step": 3577 + }, + { + "epoch": 0.6172690416630725, + "grad_norm": 0.6640625, + "learning_rate": 1.5732915041566764e-05, + "loss": 1.4129, + "step": 3578 + }, + { + "epoch": 0.6174415595618046, + "grad_norm": 0.765625, + "learning_rate": 1.5730684653699017e-05, + "loss": 1.4428, + "step": 3579 + }, + { + "epoch": 0.6176140774605365, + "grad_norm": 0.8203125, + "learning_rate": 1.5728453841266345e-05, + "loss": 1.4511, + "step": 3580 + }, + { + "epoch": 0.6177865953592685, + "grad_norm": 0.67578125, + "learning_rate": 1.5726222604434023e-05, + "loss": 1.4721, + "step": 3581 + }, + { + "epoch": 0.6179591132580006, + "grad_norm": 0.65625, + "learning_rate": 1.572399094336735e-05, + "loss": 1.4831, + "step": 3582 + }, + { + "epoch": 0.6181316311567325, + "grad_norm": 0.90625, + "learning_rate": 1.5721758858231662e-05, + "loss": 1.4647, + "step": 3583 + }, + { + "epoch": 0.6183041490554645, + "grad_norm": 0.67578125, + "learning_rate": 1.571952634919233e-05, + "loss": 1.4295, + "step": 3584 + }, + { + "epoch": 0.6184766669541965, + "grad_norm": 0.6875, + "learning_rate": 1.5717293416414743e-05, + "loss": 1.4365, + "step": 3585 + }, + { + "epoch": 0.6186491848529285, + "grad_norm": 0.671875, + "learning_rate": 1.571506006006434e-05, + "loss": 1.3541, + "step": 3586 + }, + { + "epoch": 0.6188217027516605, + "grad_norm": 0.7109375, + "learning_rate": 1.5712826280306578e-05, + "loss": 1.3913, + "step": 3587 + }, + { + "epoch": 0.6189942206503924, + "grad_norm": 0.7109375, + "learning_rate": 1.571059207730695e-05, + "loss": 1.4749, + "step": 3588 + }, + { + "epoch": 0.6191667385491245, + "grad_norm": 0.61328125, + "learning_rate": 1.5708357451230978e-05, + "loss": 1.5027, + "step": 3589 + }, + { + "epoch": 0.6193392564478565, + "grad_norm": 0.61328125, + "learning_rate": 1.570612240224422e-05, + "loss": 1.3575, + "step": 3590 + }, + { + "epoch": 0.6195117743465884, + "grad_norm": 0.75390625, + "learning_rate": 1.570388693051226e-05, + "loss": 1.4667, + "step": 3591 + }, + { + "epoch": 0.6196842922453204, + "grad_norm": 0.66015625, + "learning_rate": 1.570165103620072e-05, + "loss": 1.4951, + "step": 3592 + }, + { + "epoch": 0.6198568101440525, + "grad_norm": 0.6796875, + "learning_rate": 1.5699414719475243e-05, + "loss": 1.4742, + "step": 3593 + }, + { + "epoch": 0.6200293280427844, + "grad_norm": 0.62890625, + "learning_rate": 1.5697177980501507e-05, + "loss": 1.4339, + "step": 3594 + }, + { + "epoch": 0.6202018459415164, + "grad_norm": 0.609375, + "learning_rate": 1.5694940819445234e-05, + "loss": 1.505, + "step": 3595 + }, + { + "epoch": 0.6203743638402485, + "grad_norm": 0.5859375, + "learning_rate": 1.5692703236472162e-05, + "loss": 1.483, + "step": 3596 + }, + { + "epoch": 0.6205468817389804, + "grad_norm": 0.734375, + "learning_rate": 1.5690465231748064e-05, + "loss": 1.5035, + "step": 3597 + }, + { + "epoch": 0.6207193996377124, + "grad_norm": 0.73046875, + "learning_rate": 1.5688226805438745e-05, + "loss": 1.3519, + "step": 3598 + }, + { + "epoch": 0.6208919175364445, + "grad_norm": 0.8984375, + "learning_rate": 1.5685987957710043e-05, + "loss": 1.4842, + "step": 3599 + }, + { + "epoch": 0.6210644354351764, + "grad_norm": 0.72265625, + "learning_rate": 1.5683748688727827e-05, + "loss": 1.3299, + "step": 3600 + }, + { + "epoch": 0.6210644354351764, + "eval_loss": 1.4302067756652832, + "eval_runtime": 10.8016, + "eval_samples_per_second": 94.801, + "eval_steps_per_second": 23.7, + "step": 3600 + }, + { + "epoch": 0.6212369533339084, + "grad_norm": 0.89453125, + "learning_rate": 1.568150899865799e-05, + "loss": 1.4975, + "step": 3601 + }, + { + "epoch": 0.6214094712326403, + "grad_norm": 0.73828125, + "learning_rate": 1.567926888766647e-05, + "loss": 1.406, + "step": 3602 + }, + { + "epoch": 0.6215819891313724, + "grad_norm": 0.60546875, + "learning_rate": 1.5677028355919225e-05, + "loss": 1.4159, + "step": 3603 + }, + { + "epoch": 0.6217545070301044, + "grad_norm": 0.7421875, + "learning_rate": 1.5674787403582247e-05, + "loss": 1.3803, + "step": 3604 + }, + { + "epoch": 0.6219270249288363, + "grad_norm": 0.72265625, + "learning_rate": 1.567254603082156e-05, + "loss": 1.4789, + "step": 3605 + }, + { + "epoch": 0.6220995428275684, + "grad_norm": 0.65625, + "learning_rate": 1.567030423780322e-05, + "loss": 1.4876, + "step": 3606 + }, + { + "epoch": 0.6222720607263004, + "grad_norm": 0.609375, + "learning_rate": 1.5668062024693314e-05, + "loss": 1.4734, + "step": 3607 + }, + { + "epoch": 0.6224445786250323, + "grad_norm": 0.6328125, + "learning_rate": 1.5665819391657955e-05, + "loss": 1.3378, + "step": 3608 + }, + { + "epoch": 0.6226170965237643, + "grad_norm": 0.796875, + "learning_rate": 1.5663576338863294e-05, + "loss": 1.4462, + "step": 3609 + }, + { + "epoch": 0.6227896144224964, + "grad_norm": 0.66015625, + "learning_rate": 1.5661332866475514e-05, + "loss": 1.4788, + "step": 3610 + }, + { + "epoch": 0.6229621323212283, + "grad_norm": 0.6171875, + "learning_rate": 1.5659088974660813e-05, + "loss": 1.5331, + "step": 3611 + }, + { + "epoch": 0.6231346502199603, + "grad_norm": 0.73828125, + "learning_rate": 1.5656844663585448e-05, + "loss": 1.4245, + "step": 3612 + }, + { + "epoch": 0.6233071681186924, + "grad_norm": 0.6484375, + "learning_rate": 1.5654599933415683e-05, + "loss": 1.3844, + "step": 3613 + }, + { + "epoch": 0.6234796860174243, + "grad_norm": 0.640625, + "learning_rate": 1.565235478431783e-05, + "loss": 1.3807, + "step": 3614 + }, + { + "epoch": 0.6236522039161563, + "grad_norm": 0.62890625, + "learning_rate": 1.5650109216458208e-05, + "loss": 1.4553, + "step": 3615 + }, + { + "epoch": 0.6238247218148883, + "grad_norm": 0.7578125, + "learning_rate": 1.5647863230003193e-05, + "loss": 1.398, + "step": 3616 + }, + { + "epoch": 0.6239972397136203, + "grad_norm": 0.66796875, + "learning_rate": 1.564561682511918e-05, + "loss": 1.4887, + "step": 3617 + }, + { + "epoch": 0.6241697576123523, + "grad_norm": 0.67578125, + "learning_rate": 1.5643370001972602e-05, + "loss": 1.4707, + "step": 3618 + }, + { + "epoch": 0.6243422755110842, + "grad_norm": 0.8671875, + "learning_rate": 1.564112276072991e-05, + "loss": 1.4133, + "step": 3619 + }, + { + "epoch": 0.6245147934098163, + "grad_norm": 0.84375, + "learning_rate": 1.5638875101557596e-05, + "loss": 1.4796, + "step": 3620 + }, + { + "epoch": 0.6246873113085483, + "grad_norm": 0.7578125, + "learning_rate": 1.5636627024622183e-05, + "loss": 1.3614, + "step": 3621 + }, + { + "epoch": 0.6248598292072802, + "grad_norm": 0.6953125, + "learning_rate": 1.5634378530090217e-05, + "loss": 1.3653, + "step": 3622 + }, + { + "epoch": 0.6250323471060123, + "grad_norm": 0.8203125, + "learning_rate": 1.5632129618128285e-05, + "loss": 1.4674, + "step": 3623 + }, + { + "epoch": 0.6252048650047443, + "grad_norm": 0.68359375, + "learning_rate": 1.5629880288903002e-05, + "loss": 1.427, + "step": 3624 + }, + { + "epoch": 0.6253773829034762, + "grad_norm": 0.61328125, + "learning_rate": 1.5627630542581013e-05, + "loss": 1.4733, + "step": 3625 + }, + { + "epoch": 0.6255499008022082, + "grad_norm": 0.69140625, + "learning_rate": 1.5625380379328984e-05, + "loss": 1.5152, + "step": 3626 + }, + { + "epoch": 0.6257224187009403, + "grad_norm": 0.71875, + "learning_rate": 1.562312979931363e-05, + "loss": 1.482, + "step": 3627 + }, + { + "epoch": 0.6258949365996722, + "grad_norm": 0.80078125, + "learning_rate": 1.5620878802701687e-05, + "loss": 1.4838, + "step": 3628 + }, + { + "epoch": 0.6260674544984042, + "grad_norm": 0.65625, + "learning_rate": 1.561862738965992e-05, + "loss": 1.4425, + "step": 3629 + }, + { + "epoch": 0.6262399723971362, + "grad_norm": 0.76953125, + "learning_rate": 1.561637556035513e-05, + "loss": 1.5229, + "step": 3630 + }, + { + "epoch": 0.6264124902958682, + "grad_norm": 0.8984375, + "learning_rate": 1.5614123314954144e-05, + "loss": 1.457, + "step": 3631 + }, + { + "epoch": 0.6265850081946002, + "grad_norm": 0.9140625, + "learning_rate": 1.5611870653623826e-05, + "loss": 1.5144, + "step": 3632 + }, + { + "epoch": 0.6267575260933321, + "grad_norm": 0.734375, + "learning_rate": 1.5609617576531065e-05, + "loss": 1.4189, + "step": 3633 + }, + { + "epoch": 0.6269300439920642, + "grad_norm": 0.91796875, + "learning_rate": 1.560736408384278e-05, + "loss": 1.5909, + "step": 3634 + }, + { + "epoch": 0.6271025618907962, + "grad_norm": 0.60546875, + "learning_rate": 1.560511017572593e-05, + "loss": 1.4118, + "step": 3635 + }, + { + "epoch": 0.6272750797895281, + "grad_norm": 0.62109375, + "learning_rate": 1.56028558523475e-05, + "loss": 1.4832, + "step": 3636 + }, + { + "epoch": 0.6274475976882602, + "grad_norm": 0.73828125, + "learning_rate": 1.5600601113874497e-05, + "loss": 1.4742, + "step": 3637 + }, + { + "epoch": 0.6276201155869922, + "grad_norm": 0.69140625, + "learning_rate": 1.5598345960473965e-05, + "loss": 1.4938, + "step": 3638 + }, + { + "epoch": 0.6277926334857241, + "grad_norm": 0.84375, + "learning_rate": 1.559609039231299e-05, + "loss": 1.4624, + "step": 3639 + }, + { + "epoch": 0.6279651513844562, + "grad_norm": 0.6796875, + "learning_rate": 1.559383440955867e-05, + "loss": 1.4947, + "step": 3640 + }, + { + "epoch": 0.6281376692831881, + "grad_norm": 0.80078125, + "learning_rate": 1.5591578012378148e-05, + "loss": 1.4709, + "step": 3641 + }, + { + "epoch": 0.6283101871819201, + "grad_norm": 0.70703125, + "learning_rate": 1.5589321200938584e-05, + "loss": 1.4289, + "step": 3642 + }, + { + "epoch": 0.6284827050806521, + "grad_norm": 0.66015625, + "learning_rate": 1.5587063975407185e-05, + "loss": 1.4556, + "step": 3643 + }, + { + "epoch": 0.6286552229793841, + "grad_norm": 0.64453125, + "learning_rate": 1.5584806335951177e-05, + "loss": 1.5221, + "step": 3644 + }, + { + "epoch": 0.6288277408781161, + "grad_norm": 0.69921875, + "learning_rate": 1.558254828273782e-05, + "loss": 1.407, + "step": 3645 + }, + { + "epoch": 0.6290002587768481, + "grad_norm": 1.34375, + "learning_rate": 1.55802898159344e-05, + "loss": 1.5986, + "step": 3646 + }, + { + "epoch": 0.6291727766755801, + "grad_norm": 0.78515625, + "learning_rate": 1.557803093570825e-05, + "loss": 1.4878, + "step": 3647 + }, + { + "epoch": 0.6293452945743121, + "grad_norm": 0.58203125, + "learning_rate": 1.5575771642226715e-05, + "loss": 1.45, + "step": 3648 + }, + { + "epoch": 0.6295178124730441, + "grad_norm": 0.8125, + "learning_rate": 1.5573511935657174e-05, + "loss": 1.3846, + "step": 3649 + }, + { + "epoch": 0.629690330371776, + "grad_norm": 0.91015625, + "learning_rate": 1.5571251816167047e-05, + "loss": 1.4499, + "step": 3650 + }, + { + "epoch": 0.6298628482705081, + "grad_norm": 0.66796875, + "learning_rate": 1.5568991283923772e-05, + "loss": 1.5331, + "step": 3651 + }, + { + "epoch": 0.63003536616924, + "grad_norm": 0.6953125, + "learning_rate": 1.5566730339094827e-05, + "loss": 1.4142, + "step": 3652 + }, + { + "epoch": 0.630207884067972, + "grad_norm": 0.8828125, + "learning_rate": 1.5564468981847716e-05, + "loss": 1.3816, + "step": 3653 + }, + { + "epoch": 0.6303804019667041, + "grad_norm": 0.8671875, + "learning_rate": 1.5562207212349975e-05, + "loss": 1.3857, + "step": 3654 + }, + { + "epoch": 0.630552919865436, + "grad_norm": 0.6328125, + "learning_rate": 1.555994503076917e-05, + "loss": 1.5437, + "step": 3655 + }, + { + "epoch": 0.630725437764168, + "grad_norm": 0.68359375, + "learning_rate": 1.5557682437272898e-05, + "loss": 1.4104, + "step": 3656 + }, + { + "epoch": 0.6308979556629001, + "grad_norm": 0.69921875, + "learning_rate": 1.555541943202878e-05, + "loss": 1.54, + "step": 3657 + }, + { + "epoch": 0.631070473561632, + "grad_norm": 0.6484375, + "learning_rate": 1.555315601520448e-05, + "loss": 1.4845, + "step": 3658 + }, + { + "epoch": 0.631242991460364, + "grad_norm": 0.671875, + "learning_rate": 1.5550892186967685e-05, + "loss": 1.4748, + "step": 3659 + }, + { + "epoch": 0.631415509359096, + "grad_norm": 0.859375, + "learning_rate": 1.5548627947486114e-05, + "loss": 1.3876, + "step": 3660 + }, + { + "epoch": 0.631588027257828, + "grad_norm": 0.76171875, + "learning_rate": 1.5546363296927518e-05, + "loss": 1.4506, + "step": 3661 + }, + { + "epoch": 0.63176054515656, + "grad_norm": 0.8359375, + "learning_rate": 1.5544098235459673e-05, + "loss": 1.4053, + "step": 3662 + }, + { + "epoch": 0.631933063055292, + "grad_norm": 0.60546875, + "learning_rate": 1.5541832763250386e-05, + "loss": 1.4933, + "step": 3663 + }, + { + "epoch": 0.632105580954024, + "grad_norm": 1.1796875, + "learning_rate": 1.5539566880467502e-05, + "loss": 1.5495, + "step": 3664 + }, + { + "epoch": 0.632278098852756, + "grad_norm": 0.828125, + "learning_rate": 1.553730058727889e-05, + "loss": 1.3917, + "step": 3665 + }, + { + "epoch": 0.632450616751488, + "grad_norm": 0.8125, + "learning_rate": 1.5535033883852456e-05, + "loss": 1.521, + "step": 3666 + }, + { + "epoch": 0.6326231346502199, + "grad_norm": 0.734375, + "learning_rate": 1.5532766770356125e-05, + "loss": 1.5505, + "step": 3667 + }, + { + "epoch": 0.632795652548952, + "grad_norm": 0.99609375, + "learning_rate": 1.553049924695786e-05, + "loss": 1.563, + "step": 3668 + }, + { + "epoch": 0.6329681704476839, + "grad_norm": 0.8359375, + "learning_rate": 1.552823131382566e-05, + "loss": 1.4697, + "step": 3669 + }, + { + "epoch": 0.6331406883464159, + "grad_norm": 0.640625, + "learning_rate": 1.5525962971127536e-05, + "loss": 1.3944, + "step": 3670 + }, + { + "epoch": 0.633313206245148, + "grad_norm": 0.9296875, + "learning_rate": 1.5523694219031548e-05, + "loss": 1.4171, + "step": 3671 + }, + { + "epoch": 0.6334857241438799, + "grad_norm": 0.8125, + "learning_rate": 1.552142505770578e-05, + "loss": 1.4392, + "step": 3672 + }, + { + "epoch": 0.6336582420426119, + "grad_norm": 0.7109375, + "learning_rate": 1.5519155487318345e-05, + "loss": 1.4225, + "step": 3673 + }, + { + "epoch": 0.633830759941344, + "grad_norm": 0.796875, + "learning_rate": 1.5516885508037388e-05, + "loss": 1.438, + "step": 3674 + }, + { + "epoch": 0.6340032778400759, + "grad_norm": 0.81640625, + "learning_rate": 1.5514615120031077e-05, + "loss": 1.448, + "step": 3675 + }, + { + "epoch": 0.6341757957388079, + "grad_norm": 0.83203125, + "learning_rate": 1.551234432346762e-05, + "loss": 1.5713, + "step": 3676 + }, + { + "epoch": 0.6343483136375399, + "grad_norm": 1.7421875, + "learning_rate": 1.551007311851526e-05, + "loss": 1.4049, + "step": 3677 + }, + { + "epoch": 0.6345208315362719, + "grad_norm": 0.95703125, + "learning_rate": 1.550780150534225e-05, + "loss": 1.4776, + "step": 3678 + }, + { + "epoch": 0.6346933494350039, + "grad_norm": 0.72265625, + "learning_rate": 1.550552948411689e-05, + "loss": 1.3499, + "step": 3679 + }, + { + "epoch": 0.6348658673337358, + "grad_norm": 0.76953125, + "learning_rate": 1.5503257055007502e-05, + "loss": 1.4269, + "step": 3680 + }, + { + "epoch": 0.6350383852324679, + "grad_norm": 0.57421875, + "learning_rate": 1.5500984218182452e-05, + "loss": 1.5338, + "step": 3681 + }, + { + "epoch": 0.6352109031311999, + "grad_norm": 0.66796875, + "learning_rate": 1.549871097381012e-05, + "loss": 1.5733, + "step": 3682 + }, + { + "epoch": 0.6353834210299318, + "grad_norm": 0.734375, + "learning_rate": 1.5496437322058915e-05, + "loss": 1.518, + "step": 3683 + }, + { + "epoch": 0.6355559389286638, + "grad_norm": 0.53515625, + "learning_rate": 1.5494163263097294e-05, + "loss": 1.4033, + "step": 3684 + }, + { + "epoch": 0.6357284568273959, + "grad_norm": 0.6015625, + "learning_rate": 1.549188879709373e-05, + "loss": 1.5143, + "step": 3685 + }, + { + "epoch": 0.6359009747261278, + "grad_norm": 0.6015625, + "learning_rate": 1.5489613924216728e-05, + "loss": 1.4921, + "step": 3686 + }, + { + "epoch": 0.6360734926248598, + "grad_norm": 0.6640625, + "learning_rate": 1.5487338644634825e-05, + "loss": 1.5659, + "step": 3687 + }, + { + "epoch": 0.6362460105235919, + "grad_norm": 0.609375, + "learning_rate": 1.548506295851659e-05, + "loss": 1.5478, + "step": 3688 + }, + { + "epoch": 0.6364185284223238, + "grad_norm": 0.609375, + "learning_rate": 1.5482786866030618e-05, + "loss": 1.5006, + "step": 3689 + }, + { + "epoch": 0.6365910463210558, + "grad_norm": 1.0546875, + "learning_rate": 1.5480510367345537e-05, + "loss": 1.4564, + "step": 3690 + }, + { + "epoch": 0.6367635642197877, + "grad_norm": 0.73828125, + "learning_rate": 1.547823346263001e-05, + "loss": 1.4481, + "step": 3691 + }, + { + "epoch": 0.6369360821185198, + "grad_norm": 0.59375, + "learning_rate": 1.5475956152052706e-05, + "loss": 1.5064, + "step": 3692 + }, + { + "epoch": 0.6371086000172518, + "grad_norm": 0.61328125, + "learning_rate": 1.5473678435782365e-05, + "loss": 1.4855, + "step": 3693 + }, + { + "epoch": 0.6372811179159837, + "grad_norm": 0.59765625, + "learning_rate": 1.547140031398772e-05, + "loss": 1.3907, + "step": 3694 + }, + { + "epoch": 0.6374536358147158, + "grad_norm": 0.7109375, + "learning_rate": 1.5469121786837553e-05, + "loss": 1.3983, + "step": 3695 + }, + { + "epoch": 0.6376261537134478, + "grad_norm": 0.6953125, + "learning_rate": 1.546684285450067e-05, + "loss": 1.3785, + "step": 3696 + }, + { + "epoch": 0.6377986716121797, + "grad_norm": 0.6015625, + "learning_rate": 1.5464563517145916e-05, + "loss": 1.5285, + "step": 3697 + }, + { + "epoch": 0.6379711895109118, + "grad_norm": 0.70703125, + "learning_rate": 1.546228377494215e-05, + "loss": 1.4713, + "step": 3698 + }, + { + "epoch": 0.6381437074096438, + "grad_norm": 0.64453125, + "learning_rate": 1.546000362805827e-05, + "loss": 1.5182, + "step": 3699 + }, + { + "epoch": 0.6383162253083757, + "grad_norm": 0.6328125, + "learning_rate": 1.5457723076663206e-05, + "loss": 1.5226, + "step": 3700 + }, + { + "epoch": 0.6383162253083757, + "eval_loss": 1.4285202026367188, + "eval_runtime": 10.826, + "eval_samples_per_second": 94.587, + "eval_steps_per_second": 23.647, + "step": 3700 + }, + { + "epoch": 0.6384887432071077, + "grad_norm": 0.58203125, + "learning_rate": 1.5455442120925916e-05, + "loss": 1.393, + "step": 3701 + }, + { + "epoch": 0.6386612611058398, + "grad_norm": 0.59765625, + "learning_rate": 1.5453160761015386e-05, + "loss": 1.4783, + "step": 3702 + }, + { + "epoch": 0.6388337790045717, + "grad_norm": 0.6171875, + "learning_rate": 1.5450878997100634e-05, + "loss": 1.4916, + "step": 3703 + }, + { + "epoch": 0.6390062969033037, + "grad_norm": 0.66015625, + "learning_rate": 1.5448596829350706e-05, + "loss": 1.4923, + "step": 3704 + }, + { + "epoch": 0.6391788148020358, + "grad_norm": 0.625, + "learning_rate": 1.544631425793468e-05, + "loss": 1.4529, + "step": 3705 + }, + { + "epoch": 0.6393513327007677, + "grad_norm": 2.890625, + "learning_rate": 1.5444031283021668e-05, + "loss": 1.4464, + "step": 3706 + }, + { + "epoch": 0.6395238505994997, + "grad_norm": 0.609375, + "learning_rate": 1.54417479047808e-05, + "loss": 1.4512, + "step": 3707 + }, + { + "epoch": 0.6396963684982316, + "grad_norm": 0.61328125, + "learning_rate": 1.5439464123381243e-05, + "loss": 1.4565, + "step": 3708 + }, + { + "epoch": 0.6398688863969637, + "grad_norm": 0.5859375, + "learning_rate": 1.54371799389922e-05, + "loss": 1.4812, + "step": 3709 + }, + { + "epoch": 0.6400414042956957, + "grad_norm": 0.62109375, + "learning_rate": 1.543489535178289e-05, + "loss": 1.4413, + "step": 3710 + }, + { + "epoch": 0.6402139221944276, + "grad_norm": 0.6015625, + "learning_rate": 1.5432610361922578e-05, + "loss": 1.3908, + "step": 3711 + }, + { + "epoch": 0.6403864400931597, + "grad_norm": 0.73828125, + "learning_rate": 1.5430324969580545e-05, + "loss": 1.4154, + "step": 3712 + }, + { + "epoch": 0.6405589579918917, + "grad_norm": 0.5625, + "learning_rate": 1.5428039174926106e-05, + "loss": 1.4293, + "step": 3713 + }, + { + "epoch": 0.6407314758906236, + "grad_norm": 0.72265625, + "learning_rate": 1.5425752978128612e-05, + "loss": 1.4786, + "step": 3714 + }, + { + "epoch": 0.6409039937893557, + "grad_norm": 0.58984375, + "learning_rate": 1.5423466379357433e-05, + "loss": 1.4136, + "step": 3715 + }, + { + "epoch": 0.6410765116880877, + "grad_norm": 0.69140625, + "learning_rate": 1.542117937878198e-05, + "loss": 1.3674, + "step": 3716 + }, + { + "epoch": 0.6412490295868196, + "grad_norm": 0.73046875, + "learning_rate": 1.5418891976571682e-05, + "loss": 1.3893, + "step": 3717 + }, + { + "epoch": 0.6414215474855516, + "grad_norm": 4.875, + "learning_rate": 1.541660417289601e-05, + "loss": 1.524, + "step": 3718 + }, + { + "epoch": 0.6415940653842837, + "grad_norm": 0.73046875, + "learning_rate": 1.5414315967924454e-05, + "loss": 1.422, + "step": 3719 + }, + { + "epoch": 0.6417665832830156, + "grad_norm": 0.72265625, + "learning_rate": 1.5412027361826544e-05, + "loss": 1.486, + "step": 3720 + }, + { + "epoch": 0.6419391011817476, + "grad_norm": 0.64453125, + "learning_rate": 1.5409738354771832e-05, + "loss": 1.443, + "step": 3721 + }, + { + "epoch": 0.6421116190804796, + "grad_norm": 0.59375, + "learning_rate": 1.5407448946929898e-05, + "loss": 1.4336, + "step": 3722 + }, + { + "epoch": 0.6422841369792116, + "grad_norm": 0.7265625, + "learning_rate": 1.540515913847036e-05, + "loss": 1.4494, + "step": 3723 + }, + { + "epoch": 0.6424566548779436, + "grad_norm": 0.640625, + "learning_rate": 1.5402868929562857e-05, + "loss": 1.4959, + "step": 3724 + }, + { + "epoch": 0.6426291727766755, + "grad_norm": 0.6953125, + "learning_rate": 1.540057832037707e-05, + "loss": 1.3744, + "step": 3725 + }, + { + "epoch": 0.6428016906754076, + "grad_norm": 0.6796875, + "learning_rate": 1.539828731108269e-05, + "loss": 1.4017, + "step": 3726 + }, + { + "epoch": 0.6429742085741396, + "grad_norm": 0.60546875, + "learning_rate": 1.539599590184946e-05, + "loss": 1.5099, + "step": 3727 + }, + { + "epoch": 0.6431467264728715, + "grad_norm": 0.7265625, + "learning_rate": 1.5393704092847143e-05, + "loss": 1.4387, + "step": 3728 + }, + { + "epoch": 0.6433192443716036, + "grad_norm": 0.59765625, + "learning_rate": 1.5391411884245517e-05, + "loss": 1.3447, + "step": 3729 + }, + { + "epoch": 0.6434917622703356, + "grad_norm": 0.6953125, + "learning_rate": 1.5389119276214415e-05, + "loss": 1.5693, + "step": 3730 + }, + { + "epoch": 0.6436642801690675, + "grad_norm": 0.59375, + "learning_rate": 1.5386826268923685e-05, + "loss": 1.4577, + "step": 3731 + }, + { + "epoch": 0.6438367980677996, + "grad_norm": 0.63671875, + "learning_rate": 1.5384532862543207e-05, + "loss": 1.4509, + "step": 3732 + }, + { + "epoch": 0.6440093159665315, + "grad_norm": 0.6796875, + "learning_rate": 1.538223905724289e-05, + "loss": 1.3712, + "step": 3733 + }, + { + "epoch": 0.6441818338652635, + "grad_norm": 0.609375, + "learning_rate": 1.537994485319267e-05, + "loss": 1.4282, + "step": 3734 + }, + { + "epoch": 0.6443543517639955, + "grad_norm": 0.62109375, + "learning_rate": 1.537765025056252e-05, + "loss": 1.4045, + "step": 3735 + }, + { + "epoch": 0.6445268696627275, + "grad_norm": 0.56640625, + "learning_rate": 1.5375355249522444e-05, + "loss": 1.4643, + "step": 3736 + }, + { + "epoch": 0.6446993875614595, + "grad_norm": 0.58984375, + "learning_rate": 1.537305985024246e-05, + "loss": 1.5169, + "step": 3737 + }, + { + "epoch": 0.6448719054601915, + "grad_norm": 0.64453125, + "learning_rate": 1.5370764052892634e-05, + "loss": 1.4313, + "step": 3738 + }, + { + "epoch": 0.6450444233589235, + "grad_norm": 0.640625, + "learning_rate": 1.5368467857643045e-05, + "loss": 1.534, + "step": 3739 + }, + { + "epoch": 0.6452169412576555, + "grad_norm": 0.6484375, + "learning_rate": 1.5366171264663816e-05, + "loss": 1.4798, + "step": 3740 + }, + { + "epoch": 0.6453894591563875, + "grad_norm": 0.61328125, + "learning_rate": 1.5363874274125086e-05, + "loss": 1.4276, + "step": 3741 + }, + { + "epoch": 0.6455619770551194, + "grad_norm": 0.640625, + "learning_rate": 1.5361576886197037e-05, + "loss": 1.478, + "step": 3742 + }, + { + "epoch": 0.6457344949538515, + "grad_norm": 0.66796875, + "learning_rate": 1.535927910104987e-05, + "loss": 1.516, + "step": 3743 + }, + { + "epoch": 0.6459070128525835, + "grad_norm": 0.63671875, + "learning_rate": 1.5356980918853827e-05, + "loss": 1.4182, + "step": 3744 + }, + { + "epoch": 0.6460795307513154, + "grad_norm": 0.66015625, + "learning_rate": 1.535468233977916e-05, + "loss": 1.5294, + "step": 3745 + }, + { + "epoch": 0.6462520486500475, + "grad_norm": 0.6796875, + "learning_rate": 1.5352383363996172e-05, + "loss": 1.518, + "step": 3746 + }, + { + "epoch": 0.6464245665487794, + "grad_norm": 0.73046875, + "learning_rate": 1.5350083991675177e-05, + "loss": 1.4986, + "step": 3747 + }, + { + "epoch": 0.6465970844475114, + "grad_norm": 0.67578125, + "learning_rate": 1.534778422298653e-05, + "loss": 1.4611, + "step": 3748 + }, + { + "epoch": 0.6467696023462435, + "grad_norm": 0.6875, + "learning_rate": 1.534548405810062e-05, + "loss": 1.5083, + "step": 3749 + }, + { + "epoch": 0.6469421202449754, + "grad_norm": 0.6875, + "learning_rate": 1.5343183497187844e-05, + "loss": 1.5194, + "step": 3750 + }, + { + "epoch": 0.6471146381437074, + "grad_norm": 0.71875, + "learning_rate": 1.534088254041865e-05, + "loss": 1.4361, + "step": 3751 + }, + { + "epoch": 0.6472871560424394, + "grad_norm": 0.6953125, + "learning_rate": 1.533858118796351e-05, + "loss": 1.4349, + "step": 3752 + }, + { + "epoch": 0.6474596739411714, + "grad_norm": 0.703125, + "learning_rate": 1.5336279439992918e-05, + "loss": 1.4821, + "step": 3753 + }, + { + "epoch": 0.6476321918399034, + "grad_norm": 0.65625, + "learning_rate": 1.53339772966774e-05, + "loss": 1.3555, + "step": 3754 + }, + { + "epoch": 0.6478047097386354, + "grad_norm": 0.6328125, + "learning_rate": 1.5331674758187516e-05, + "loss": 1.3945, + "step": 3755 + }, + { + "epoch": 0.6479772276373674, + "grad_norm": 0.69140625, + "learning_rate": 1.532937182469385e-05, + "loss": 1.4914, + "step": 3756 + }, + { + "epoch": 0.6481497455360994, + "grad_norm": 0.6484375, + "learning_rate": 1.5327068496367023e-05, + "loss": 1.5468, + "step": 3757 + }, + { + "epoch": 0.6483222634348313, + "grad_norm": 0.6171875, + "learning_rate": 1.5324764773377677e-05, + "loss": 1.471, + "step": 3758 + }, + { + "epoch": 0.6484947813335633, + "grad_norm": 0.5859375, + "learning_rate": 1.532246065589648e-05, + "loss": 1.468, + "step": 3759 + }, + { + "epoch": 0.6486672992322954, + "grad_norm": 0.6484375, + "learning_rate": 1.532015614409415e-05, + "loss": 1.4026, + "step": 3760 + }, + { + "epoch": 0.6488398171310273, + "grad_norm": 0.78125, + "learning_rate": 1.5317851238141406e-05, + "loss": 1.5021, + "step": 3761 + }, + { + "epoch": 0.6490123350297593, + "grad_norm": 0.69140625, + "learning_rate": 1.5315545938209016e-05, + "loss": 1.3502, + "step": 3762 + }, + { + "epoch": 0.6491848529284914, + "grad_norm": 0.5703125, + "learning_rate": 1.531324024446777e-05, + "loss": 1.4444, + "step": 3763 + }, + { + "epoch": 0.6493573708272233, + "grad_norm": 0.59375, + "learning_rate": 1.531093415708849e-05, + "loss": 1.4432, + "step": 3764 + }, + { + "epoch": 0.6495298887259553, + "grad_norm": 0.6328125, + "learning_rate": 1.530862767624202e-05, + "loss": 1.4107, + "step": 3765 + }, + { + "epoch": 0.6497024066246873, + "grad_norm": 0.6328125, + "learning_rate": 1.5306320802099243e-05, + "loss": 1.4785, + "step": 3766 + }, + { + "epoch": 0.6498749245234193, + "grad_norm": 0.57421875, + "learning_rate": 1.5304013534831064e-05, + "loss": 1.4472, + "step": 3767 + }, + { + "epoch": 0.6500474424221513, + "grad_norm": 0.66796875, + "learning_rate": 1.5301705874608423e-05, + "loss": 1.4185, + "step": 3768 + }, + { + "epoch": 0.6502199603208833, + "grad_norm": 0.68359375, + "learning_rate": 1.5299397821602284e-05, + "loss": 1.5577, + "step": 3769 + }, + { + "epoch": 0.6503924782196153, + "grad_norm": 0.5859375, + "learning_rate": 1.5297089375983644e-05, + "loss": 1.4669, + "step": 3770 + }, + { + "epoch": 0.6505649961183473, + "grad_norm": 0.6015625, + "learning_rate": 1.5294780537923523e-05, + "loss": 1.5208, + "step": 3771 + }, + { + "epoch": 0.6507375140170792, + "grad_norm": 0.70703125, + "learning_rate": 1.5292471307592975e-05, + "loss": 1.4643, + "step": 3772 + }, + { + "epoch": 0.6509100319158113, + "grad_norm": 0.62109375, + "learning_rate": 1.5290161685163086e-05, + "loss": 1.4022, + "step": 3773 + }, + { + "epoch": 0.6510825498145433, + "grad_norm": 0.796875, + "learning_rate": 1.5287851670804963e-05, + "loss": 1.4932, + "step": 3774 + }, + { + "epoch": 0.6512550677132752, + "grad_norm": 0.71484375, + "learning_rate": 1.528554126468975e-05, + "loss": 1.4705, + "step": 3775 + }, + { + "epoch": 0.6514275856120072, + "grad_norm": 0.67578125, + "learning_rate": 1.5283230466988615e-05, + "loss": 1.3752, + "step": 3776 + }, + { + "epoch": 0.6516001035107393, + "grad_norm": 0.640625, + "learning_rate": 1.5280919277872753e-05, + "loss": 1.4108, + "step": 3777 + }, + { + "epoch": 0.6517726214094712, + "grad_norm": 0.72265625, + "learning_rate": 1.5278607697513396e-05, + "loss": 1.4798, + "step": 3778 + }, + { + "epoch": 0.6519451393082032, + "grad_norm": 0.6171875, + "learning_rate": 1.52762957260818e-05, + "loss": 1.4351, + "step": 3779 + }, + { + "epoch": 0.6521176572069353, + "grad_norm": 0.76171875, + "learning_rate": 1.5273983363749246e-05, + "loss": 1.4795, + "step": 3780 + }, + { + "epoch": 0.6522901751056672, + "grad_norm": 0.5703125, + "learning_rate": 1.5271670610687058e-05, + "loss": 1.4506, + "step": 3781 + }, + { + "epoch": 0.6524626930043992, + "grad_norm": 0.7578125, + "learning_rate": 1.5269357467066566e-05, + "loss": 1.4128, + "step": 3782 + }, + { + "epoch": 0.6526352109031311, + "grad_norm": 0.6640625, + "learning_rate": 1.5267043933059147e-05, + "loss": 1.5003, + "step": 3783 + }, + { + "epoch": 0.6528077288018632, + "grad_norm": 0.58984375, + "learning_rate": 1.5264730008836205e-05, + "loss": 1.4195, + "step": 3784 + }, + { + "epoch": 0.6529802467005952, + "grad_norm": 0.6015625, + "learning_rate": 1.526241569456917e-05, + "loss": 1.4756, + "step": 3785 + }, + { + "epoch": 0.6531527645993271, + "grad_norm": 0.625, + "learning_rate": 1.52601009904295e-05, + "loss": 1.4688, + "step": 3786 + }, + { + "epoch": 0.6533252824980592, + "grad_norm": 0.69921875, + "learning_rate": 1.525778589658868e-05, + "loss": 1.4437, + "step": 3787 + }, + { + "epoch": 0.6534978003967912, + "grad_norm": 0.6796875, + "learning_rate": 1.5255470413218228e-05, + "loss": 1.485, + "step": 3788 + }, + { + "epoch": 0.6536703182955231, + "grad_norm": 0.85546875, + "learning_rate": 1.525315454048969e-05, + "loss": 1.3897, + "step": 3789 + }, + { + "epoch": 0.6538428361942552, + "grad_norm": 0.61328125, + "learning_rate": 1.525083827857464e-05, + "loss": 1.5565, + "step": 3790 + }, + { + "epoch": 0.6540153540929872, + "grad_norm": 0.640625, + "learning_rate": 1.5248521627644684e-05, + "loss": 1.3609, + "step": 3791 + }, + { + "epoch": 0.6541878719917191, + "grad_norm": 0.6328125, + "learning_rate": 1.524620458787145e-05, + "loss": 1.4696, + "step": 3792 + }, + { + "epoch": 0.6543603898904511, + "grad_norm": 0.5859375, + "learning_rate": 1.5243887159426603e-05, + "loss": 1.4728, + "step": 3793 + }, + { + "epoch": 0.6545329077891832, + "grad_norm": 0.59765625, + "learning_rate": 1.5241569342481826e-05, + "loss": 1.4916, + "step": 3794 + }, + { + "epoch": 0.6547054256879151, + "grad_norm": 0.65625, + "learning_rate": 1.5239251137208844e-05, + "loss": 1.3874, + "step": 3795 + }, + { + "epoch": 0.6548779435866471, + "grad_norm": 0.70703125, + "learning_rate": 1.52369325437794e-05, + "loss": 1.4444, + "step": 3796 + }, + { + "epoch": 0.6550504614853792, + "grad_norm": 0.55859375, + "learning_rate": 1.5234613562365272e-05, + "loss": 1.3956, + "step": 3797 + }, + { + "epoch": 0.6552229793841111, + "grad_norm": 0.61328125, + "learning_rate": 1.5232294193138264e-05, + "loss": 1.357, + "step": 3798 + }, + { + "epoch": 0.6553954972828431, + "grad_norm": 0.578125, + "learning_rate": 1.5229974436270207e-05, + "loss": 1.4616, + "step": 3799 + }, + { + "epoch": 0.655568015181575, + "grad_norm": 0.75390625, + "learning_rate": 1.5227654291932967e-05, + "loss": 1.4532, + "step": 3800 + }, + { + "epoch": 0.655568015181575, + "eval_loss": 1.4269640445709229, + "eval_runtime": 10.8642, + "eval_samples_per_second": 94.254, + "eval_steps_per_second": 23.564, + "step": 3800 + }, + { + "epoch": 0.6557405330803071, + "grad_norm": 0.6953125, + "learning_rate": 1.5225333760298435e-05, + "loss": 1.5473, + "step": 3801 + }, + { + "epoch": 0.6559130509790391, + "grad_norm": 0.59765625, + "learning_rate": 1.5223012841538527e-05, + "loss": 1.5213, + "step": 3802 + }, + { + "epoch": 0.656085568877771, + "grad_norm": 0.58984375, + "learning_rate": 1.5220691535825194e-05, + "loss": 1.4789, + "step": 3803 + }, + { + "epoch": 0.6562580867765031, + "grad_norm": 0.6640625, + "learning_rate": 1.5218369843330409e-05, + "loss": 1.5483, + "step": 3804 + }, + { + "epoch": 0.6564306046752351, + "grad_norm": 0.59375, + "learning_rate": 1.5216047764226183e-05, + "loss": 1.3859, + "step": 3805 + }, + { + "epoch": 0.656603122573967, + "grad_norm": 0.67578125, + "learning_rate": 1.5213725298684546e-05, + "loss": 1.4533, + "step": 3806 + }, + { + "epoch": 0.6567756404726991, + "grad_norm": 0.6171875, + "learning_rate": 1.5211402446877561e-05, + "loss": 1.417, + "step": 3807 + }, + { + "epoch": 0.6569481583714311, + "grad_norm": 0.63671875, + "learning_rate": 1.5209079208977322e-05, + "loss": 1.5, + "step": 3808 + }, + { + "epoch": 0.657120676270163, + "grad_norm": 0.64453125, + "learning_rate": 1.5206755585155946e-05, + "loss": 1.494, + "step": 3809 + }, + { + "epoch": 0.657293194168895, + "grad_norm": 0.640625, + "learning_rate": 1.5204431575585586e-05, + "loss": 1.4712, + "step": 3810 + }, + { + "epoch": 0.657465712067627, + "grad_norm": 0.73046875, + "learning_rate": 1.520210718043841e-05, + "loss": 1.355, + "step": 3811 + }, + { + "epoch": 0.657638229966359, + "grad_norm": 0.61328125, + "learning_rate": 1.5199782399886633e-05, + "loss": 1.4093, + "step": 3812 + }, + { + "epoch": 0.657810747865091, + "grad_norm": 0.72265625, + "learning_rate": 1.519745723410249e-05, + "loss": 1.4952, + "step": 3813 + }, + { + "epoch": 0.657983265763823, + "grad_norm": 0.90625, + "learning_rate": 1.5195131683258235e-05, + "loss": 1.5056, + "step": 3814 + }, + { + "epoch": 0.658155783662555, + "grad_norm": 0.8203125, + "learning_rate": 1.5192805747526168e-05, + "loss": 1.4134, + "step": 3815 + }, + { + "epoch": 0.658328301561287, + "grad_norm": 0.64453125, + "learning_rate": 1.5190479427078602e-05, + "loss": 1.5335, + "step": 3816 + }, + { + "epoch": 0.6585008194600189, + "grad_norm": 0.85546875, + "learning_rate": 1.518815272208789e-05, + "loss": 1.4615, + "step": 3817 + }, + { + "epoch": 0.658673337358751, + "grad_norm": 0.69921875, + "learning_rate": 1.5185825632726405e-05, + "loss": 1.3429, + "step": 3818 + }, + { + "epoch": 0.658845855257483, + "grad_norm": 0.60546875, + "learning_rate": 1.5183498159166557e-05, + "loss": 1.4464, + "step": 3819 + }, + { + "epoch": 0.6590183731562149, + "grad_norm": 0.6328125, + "learning_rate": 1.5181170301580776e-05, + "loss": 1.5144, + "step": 3820 + }, + { + "epoch": 0.659190891054947, + "grad_norm": 0.6484375, + "learning_rate": 1.5178842060141526e-05, + "loss": 1.4413, + "step": 3821 + }, + { + "epoch": 0.659363408953679, + "grad_norm": 0.59765625, + "learning_rate": 1.5176513435021297e-05, + "loss": 1.4433, + "step": 3822 + }, + { + "epoch": 0.6595359268524109, + "grad_norm": 0.640625, + "learning_rate": 1.5174184426392609e-05, + "loss": 1.5579, + "step": 3823 + }, + { + "epoch": 0.659708444751143, + "grad_norm": 0.8125, + "learning_rate": 1.5171855034428006e-05, + "loss": 1.2912, + "step": 3824 + }, + { + "epoch": 0.659880962649875, + "grad_norm": 0.6328125, + "learning_rate": 1.5169525259300071e-05, + "loss": 1.518, + "step": 3825 + }, + { + "epoch": 0.6600534805486069, + "grad_norm": 0.62890625, + "learning_rate": 1.5167195101181405e-05, + "loss": 1.4642, + "step": 3826 + }, + { + "epoch": 0.6602259984473389, + "grad_norm": 0.640625, + "learning_rate": 1.5164864560244636e-05, + "loss": 1.3841, + "step": 3827 + }, + { + "epoch": 0.6603985163460709, + "grad_norm": 0.76171875, + "learning_rate": 1.516253363666243e-05, + "loss": 1.5328, + "step": 3828 + }, + { + "epoch": 0.6605710342448029, + "grad_norm": 0.625, + "learning_rate": 1.5160202330607476e-05, + "loss": 1.4713, + "step": 3829 + }, + { + "epoch": 0.6607435521435349, + "grad_norm": 0.71875, + "learning_rate": 1.515787064225249e-05, + "loss": 1.4119, + "step": 3830 + }, + { + "epoch": 0.6609160700422669, + "grad_norm": 0.71484375, + "learning_rate": 1.515553857177022e-05, + "loss": 1.5059, + "step": 3831 + }, + { + "epoch": 0.6610885879409989, + "grad_norm": 0.671875, + "learning_rate": 1.5153206119333436e-05, + "loss": 1.4441, + "step": 3832 + }, + { + "epoch": 0.6612611058397309, + "grad_norm": 1.21875, + "learning_rate": 1.5150873285114948e-05, + "loss": 1.4158, + "step": 3833 + }, + { + "epoch": 0.6614336237384628, + "grad_norm": 0.828125, + "learning_rate": 1.5148540069287583e-05, + "loss": 1.437, + "step": 3834 + }, + { + "epoch": 0.6616061416371949, + "grad_norm": 0.61328125, + "learning_rate": 1.5146206472024196e-05, + "loss": 1.4809, + "step": 3835 + }, + { + "epoch": 0.6617786595359268, + "grad_norm": 0.67578125, + "learning_rate": 1.5143872493497683e-05, + "loss": 1.4009, + "step": 3836 + }, + { + "epoch": 0.6619511774346588, + "grad_norm": 0.6015625, + "learning_rate": 1.5141538133880951e-05, + "loss": 1.4531, + "step": 3837 + }, + { + "epoch": 0.6621236953333909, + "grad_norm": 0.6640625, + "learning_rate": 1.5139203393346953e-05, + "loss": 1.4045, + "step": 3838 + }, + { + "epoch": 0.6622962132321228, + "grad_norm": 1.203125, + "learning_rate": 1.5136868272068653e-05, + "loss": 1.4653, + "step": 3839 + }, + { + "epoch": 0.6624687311308548, + "grad_norm": 0.671875, + "learning_rate": 1.5134532770219054e-05, + "loss": 1.5117, + "step": 3840 + }, + { + "epoch": 0.6626412490295868, + "grad_norm": 0.6015625, + "learning_rate": 1.513219688797119e-05, + "loss": 1.4574, + "step": 3841 + }, + { + "epoch": 0.6628137669283188, + "grad_norm": 0.578125, + "learning_rate": 1.512986062549811e-05, + "loss": 1.4325, + "step": 3842 + }, + { + "epoch": 0.6629862848270508, + "grad_norm": 0.73828125, + "learning_rate": 1.51275239829729e-05, + "loss": 1.6238, + "step": 3843 + }, + { + "epoch": 0.6631588027257828, + "grad_norm": 0.71484375, + "learning_rate": 1.5125186960568678e-05, + "loss": 1.4831, + "step": 3844 + }, + { + "epoch": 0.6633313206245148, + "grad_norm": 0.71484375, + "learning_rate": 1.5122849558458583e-05, + "loss": 1.3717, + "step": 3845 + }, + { + "epoch": 0.6635038385232468, + "grad_norm": 0.6171875, + "learning_rate": 1.512051177681578e-05, + "loss": 1.3688, + "step": 3846 + }, + { + "epoch": 0.6636763564219788, + "grad_norm": 0.6953125, + "learning_rate": 1.5118173615813474e-05, + "loss": 1.4871, + "step": 3847 + }, + { + "epoch": 0.6638488743207108, + "grad_norm": 0.6796875, + "learning_rate": 1.5115835075624885e-05, + "loss": 1.4088, + "step": 3848 + }, + { + "epoch": 0.6640213922194428, + "grad_norm": 0.640625, + "learning_rate": 1.5113496156423271e-05, + "loss": 1.3805, + "step": 3849 + }, + { + "epoch": 0.6641939101181747, + "grad_norm": 0.671875, + "learning_rate": 1.5111156858381906e-05, + "loss": 1.531, + "step": 3850 + }, + { + "epoch": 0.6643664280169067, + "grad_norm": 0.7109375, + "learning_rate": 1.510881718167411e-05, + "loss": 1.494, + "step": 3851 + }, + { + "epoch": 0.6645389459156388, + "grad_norm": 0.6015625, + "learning_rate": 1.5106477126473209e-05, + "loss": 1.4728, + "step": 3852 + }, + { + "epoch": 0.6647114638143707, + "grad_norm": 0.6484375, + "learning_rate": 1.5104136692952582e-05, + "loss": 1.4782, + "step": 3853 + }, + { + "epoch": 0.6648839817131027, + "grad_norm": 0.62109375, + "learning_rate": 1.5101795881285614e-05, + "loss": 1.4778, + "step": 3854 + }, + { + "epoch": 0.6650564996118348, + "grad_norm": 0.5625, + "learning_rate": 1.5099454691645731e-05, + "loss": 1.4344, + "step": 3855 + }, + { + "epoch": 0.6652290175105667, + "grad_norm": 0.6875, + "learning_rate": 1.509711312420638e-05, + "loss": 1.4528, + "step": 3856 + }, + { + "epoch": 0.6654015354092987, + "grad_norm": 0.63671875, + "learning_rate": 1.5094771179141043e-05, + "loss": 1.4104, + "step": 3857 + }, + { + "epoch": 0.6655740533080307, + "grad_norm": 0.66015625, + "learning_rate": 1.5092428856623222e-05, + "loss": 1.4662, + "step": 3858 + }, + { + "epoch": 0.6657465712067627, + "grad_norm": 0.64453125, + "learning_rate": 1.509008615682645e-05, + "loss": 1.4011, + "step": 3859 + }, + { + "epoch": 0.6659190891054947, + "grad_norm": 0.58203125, + "learning_rate": 1.5087743079924293e-05, + "loss": 1.3801, + "step": 3860 + }, + { + "epoch": 0.6660916070042266, + "grad_norm": 0.7109375, + "learning_rate": 1.5085399626090343e-05, + "loss": 1.478, + "step": 3861 + }, + { + "epoch": 0.6662641249029587, + "grad_norm": 0.625, + "learning_rate": 1.5083055795498209e-05, + "loss": 1.5616, + "step": 3862 + }, + { + "epoch": 0.6664366428016907, + "grad_norm": 0.66796875, + "learning_rate": 1.5080711588321544e-05, + "loss": 1.4164, + "step": 3863 + }, + { + "epoch": 0.6666091607004226, + "grad_norm": 0.8828125, + "learning_rate": 1.5078367004734014e-05, + "loss": 1.4833, + "step": 3864 + }, + { + "epoch": 0.6667816785991547, + "grad_norm": 0.59375, + "learning_rate": 1.507602204490933e-05, + "loss": 1.4232, + "step": 3865 + }, + { + "epoch": 0.6669541964978867, + "grad_norm": 0.68359375, + "learning_rate": 1.5073676709021214e-05, + "loss": 1.4294, + "step": 3866 + }, + { + "epoch": 0.6671267143966186, + "grad_norm": 0.60546875, + "learning_rate": 1.5071330997243427e-05, + "loss": 1.4871, + "step": 3867 + }, + { + "epoch": 0.6672992322953506, + "grad_norm": 0.59375, + "learning_rate": 1.506898490974975e-05, + "loss": 1.3962, + "step": 3868 + }, + { + "epoch": 0.6674717501940827, + "grad_norm": 0.68359375, + "learning_rate": 1.5066638446714002e-05, + "loss": 1.452, + "step": 3869 + }, + { + "epoch": 0.6676442680928146, + "grad_norm": 0.6640625, + "learning_rate": 1.5064291608310017e-05, + "loss": 1.5522, + "step": 3870 + }, + { + "epoch": 0.6678167859915466, + "grad_norm": 0.609375, + "learning_rate": 1.5061944394711669e-05, + "loss": 1.5069, + "step": 3871 + }, + { + "epoch": 0.6679893038902787, + "grad_norm": 0.59765625, + "learning_rate": 1.505959680609285e-05, + "loss": 1.4598, + "step": 3872 + }, + { + "epoch": 0.6681618217890106, + "grad_norm": 0.66796875, + "learning_rate": 1.5057248842627488e-05, + "loss": 1.4333, + "step": 3873 + }, + { + "epoch": 0.6683343396877426, + "grad_norm": 0.58984375, + "learning_rate": 1.505490050448953e-05, + "loss": 1.4226, + "step": 3874 + }, + { + "epoch": 0.6685068575864745, + "grad_norm": 0.62890625, + "learning_rate": 1.505255179185296e-05, + "loss": 1.4309, + "step": 3875 + }, + { + "epoch": 0.6686793754852066, + "grad_norm": 0.890625, + "learning_rate": 1.5050202704891783e-05, + "loss": 1.4525, + "step": 3876 + }, + { + "epoch": 0.6688518933839386, + "grad_norm": 0.703125, + "learning_rate": 1.5047853243780033e-05, + "loss": 1.4938, + "step": 3877 + }, + { + "epoch": 0.6690244112826705, + "grad_norm": 0.6328125, + "learning_rate": 1.5045503408691776e-05, + "loss": 1.3831, + "step": 3878 + }, + { + "epoch": 0.6691969291814026, + "grad_norm": 0.5703125, + "learning_rate": 1.5043153199801101e-05, + "loss": 1.5183, + "step": 3879 + }, + { + "epoch": 0.6693694470801346, + "grad_norm": 0.6171875, + "learning_rate": 1.5040802617282124e-05, + "loss": 1.4216, + "step": 3880 + }, + { + "epoch": 0.6695419649788665, + "grad_norm": 0.71875, + "learning_rate": 1.5038451661308994e-05, + "loss": 1.3953, + "step": 3881 + }, + { + "epoch": 0.6697144828775986, + "grad_norm": 0.69921875, + "learning_rate": 1.5036100332055884e-05, + "loss": 1.5554, + "step": 3882 + }, + { + "epoch": 0.6698870007763306, + "grad_norm": 0.7109375, + "learning_rate": 1.5033748629696994e-05, + "loss": 1.4274, + "step": 3883 + }, + { + "epoch": 0.6700595186750625, + "grad_norm": 0.6328125, + "learning_rate": 1.5031396554406548e-05, + "loss": 1.4097, + "step": 3884 + }, + { + "epoch": 0.6702320365737945, + "grad_norm": 0.6796875, + "learning_rate": 1.5029044106358815e-05, + "loss": 1.4944, + "step": 3885 + }, + { + "epoch": 0.6704045544725266, + "grad_norm": 0.7421875, + "learning_rate": 1.5026691285728067e-05, + "loss": 1.4382, + "step": 3886 + }, + { + "epoch": 0.6705770723712585, + "grad_norm": 0.65234375, + "learning_rate": 1.5024338092688622e-05, + "loss": 1.3788, + "step": 3887 + }, + { + "epoch": 0.6707495902699905, + "grad_norm": 0.59765625, + "learning_rate": 1.502198452741482e-05, + "loss": 1.4075, + "step": 3888 + }, + { + "epoch": 0.6709221081687226, + "grad_norm": 0.70703125, + "learning_rate": 1.5019630590081025e-05, + "loss": 1.5182, + "step": 3889 + }, + { + "epoch": 0.6710946260674545, + "grad_norm": 0.640625, + "learning_rate": 1.5017276280861628e-05, + "loss": 1.5196, + "step": 3890 + }, + { + "epoch": 0.6712671439661865, + "grad_norm": 0.6015625, + "learning_rate": 1.5014921599931056e-05, + "loss": 1.3461, + "step": 3891 + }, + { + "epoch": 0.6714396618649184, + "grad_norm": 0.59765625, + "learning_rate": 1.5012566547463758e-05, + "loss": 1.4824, + "step": 3892 + }, + { + "epoch": 0.6716121797636505, + "grad_norm": 0.6484375, + "learning_rate": 1.501021112363421e-05, + "loss": 1.5494, + "step": 3893 + }, + { + "epoch": 0.6717846976623825, + "grad_norm": 0.6640625, + "learning_rate": 1.5007855328616917e-05, + "loss": 1.4709, + "step": 3894 + }, + { + "epoch": 0.6719572155611144, + "grad_norm": 0.6171875, + "learning_rate": 1.500549916258641e-05, + "loss": 1.3718, + "step": 3895 + }, + { + "epoch": 0.6721297334598465, + "grad_norm": 0.59765625, + "learning_rate": 1.500314262571725e-05, + "loss": 1.347, + "step": 3896 + }, + { + "epoch": 0.6723022513585785, + "grad_norm": 0.75, + "learning_rate": 1.5000785718184026e-05, + "loss": 1.525, + "step": 3897 + }, + { + "epoch": 0.6724747692573104, + "grad_norm": 0.7421875, + "learning_rate": 1.4998428440161345e-05, + "loss": 1.4657, + "step": 3898 + }, + { + "epoch": 0.6726472871560425, + "grad_norm": 0.6328125, + "learning_rate": 1.4996070791823856e-05, + "loss": 1.4022, + "step": 3899 + }, + { + "epoch": 0.6728198050547745, + "grad_norm": 0.69140625, + "learning_rate": 1.4993712773346225e-05, + "loss": 1.3463, + "step": 3900 + }, + { + "epoch": 0.6728198050547745, + "eval_loss": 1.4253987073898315, + "eval_runtime": 10.893, + "eval_samples_per_second": 94.005, + "eval_steps_per_second": 23.501, + "step": 3900 + }, + { + "epoch": 0.6729923229535064, + "grad_norm": 1.6171875, + "learning_rate": 1.4991354384903148e-05, + "loss": 1.4465, + "step": 3901 + }, + { + "epoch": 0.6731648408522384, + "grad_norm": 0.66796875, + "learning_rate": 1.4988995626669352e-05, + "loss": 1.4454, + "step": 3902 + }, + { + "epoch": 0.6733373587509704, + "grad_norm": 0.66796875, + "learning_rate": 1.4986636498819586e-05, + "loss": 1.3895, + "step": 3903 + }, + { + "epoch": 0.6735098766497024, + "grad_norm": 0.66015625, + "learning_rate": 1.4984277001528634e-05, + "loss": 1.4343, + "step": 3904 + }, + { + "epoch": 0.6736823945484344, + "grad_norm": 0.609375, + "learning_rate": 1.4981917134971298e-05, + "loss": 1.5163, + "step": 3905 + }, + { + "epoch": 0.6738549124471664, + "grad_norm": 0.7578125, + "learning_rate": 1.497955689932241e-05, + "loss": 1.4312, + "step": 3906 + }, + { + "epoch": 0.6740274303458984, + "grad_norm": 0.75, + "learning_rate": 1.4977196294756832e-05, + "loss": 1.4852, + "step": 3907 + }, + { + "epoch": 0.6741999482446304, + "grad_norm": 0.62109375, + "learning_rate": 1.4974835321449454e-05, + "loss": 1.3581, + "step": 3908 + }, + { + "epoch": 0.6743724661433623, + "grad_norm": 0.68359375, + "learning_rate": 1.4972473979575195e-05, + "loss": 1.4458, + "step": 3909 + }, + { + "epoch": 0.6745449840420944, + "grad_norm": 0.640625, + "learning_rate": 1.4970112269308991e-05, + "loss": 1.5073, + "step": 3910 + }, + { + "epoch": 0.6747175019408264, + "grad_norm": 0.6875, + "learning_rate": 1.4967750190825816e-05, + "loss": 1.4186, + "step": 3911 + }, + { + "epoch": 0.6748900198395583, + "grad_norm": 1.078125, + "learning_rate": 1.4965387744300665e-05, + "loss": 1.498, + "step": 3912 + }, + { + "epoch": 0.6750625377382904, + "grad_norm": 0.640625, + "learning_rate": 1.4963024929908568e-05, + "loss": 1.541, + "step": 3913 + }, + { + "epoch": 0.6752350556370224, + "grad_norm": 0.5625, + "learning_rate": 1.4960661747824575e-05, + "loss": 1.451, + "step": 3914 + }, + { + "epoch": 0.6754075735357543, + "grad_norm": 0.6796875, + "learning_rate": 1.495829819822376e-05, + "loss": 1.4844, + "step": 3915 + }, + { + "epoch": 0.6755800914344863, + "grad_norm": 0.64453125, + "learning_rate": 1.4955934281281234e-05, + "loss": 1.3985, + "step": 3916 + }, + { + "epoch": 0.6757526093332183, + "grad_norm": 0.66015625, + "learning_rate": 1.4953569997172133e-05, + "loss": 1.4535, + "step": 3917 + }, + { + "epoch": 0.6759251272319503, + "grad_norm": 0.6953125, + "learning_rate": 1.4951205346071613e-05, + "loss": 1.4962, + "step": 3918 + }, + { + "epoch": 0.6760976451306823, + "grad_norm": 0.6328125, + "learning_rate": 1.4948840328154863e-05, + "loss": 1.4271, + "step": 3919 + }, + { + "epoch": 0.6762701630294143, + "grad_norm": 0.65625, + "learning_rate": 1.4946474943597103e-05, + "loss": 1.3688, + "step": 3920 + }, + { + "epoch": 0.6764426809281463, + "grad_norm": 0.67578125, + "learning_rate": 1.4944109192573572e-05, + "loss": 1.4197, + "step": 3921 + }, + { + "epoch": 0.6766151988268783, + "grad_norm": 0.578125, + "learning_rate": 1.4941743075259539e-05, + "loss": 1.4688, + "step": 3922 + }, + { + "epoch": 0.6767877167256103, + "grad_norm": 0.6640625, + "learning_rate": 1.4939376591830301e-05, + "loss": 1.4973, + "step": 3923 + }, + { + "epoch": 0.6769602346243423, + "grad_norm": 0.60546875, + "learning_rate": 1.4937009742461181e-05, + "loss": 1.3811, + "step": 3924 + }, + { + "epoch": 0.6771327525230743, + "grad_norm": 0.58984375, + "learning_rate": 1.4934642527327537e-05, + "loss": 1.477, + "step": 3925 + }, + { + "epoch": 0.6773052704218062, + "grad_norm": 0.58203125, + "learning_rate": 1.4932274946604736e-05, + "loss": 1.3482, + "step": 3926 + }, + { + "epoch": 0.6774777883205383, + "grad_norm": 0.609375, + "learning_rate": 1.492990700046819e-05, + "loss": 1.4613, + "step": 3927 + }, + { + "epoch": 0.6776503062192702, + "grad_norm": 0.625, + "learning_rate": 1.492753868909333e-05, + "loss": 1.4796, + "step": 3928 + }, + { + "epoch": 0.6778228241180022, + "grad_norm": 0.5859375, + "learning_rate": 1.4925170012655614e-05, + "loss": 1.4654, + "step": 3929 + }, + { + "epoch": 0.6779953420167343, + "grad_norm": 0.59765625, + "learning_rate": 1.4922800971330533e-05, + "loss": 1.4711, + "step": 3930 + }, + { + "epoch": 0.6781678599154662, + "grad_norm": 0.63671875, + "learning_rate": 1.4920431565293596e-05, + "loss": 1.5027, + "step": 3931 + }, + { + "epoch": 0.6783403778141982, + "grad_norm": 0.59765625, + "learning_rate": 1.4918061794720342e-05, + "loss": 1.4228, + "step": 3932 + }, + { + "epoch": 0.6785128957129302, + "grad_norm": 0.58203125, + "learning_rate": 1.4915691659786342e-05, + "loss": 1.4788, + "step": 3933 + }, + { + "epoch": 0.6786854136116622, + "grad_norm": 0.7109375, + "learning_rate": 1.4913321160667189e-05, + "loss": 1.5247, + "step": 3934 + }, + { + "epoch": 0.6788579315103942, + "grad_norm": 0.734375, + "learning_rate": 1.4910950297538505e-05, + "loss": 1.4611, + "step": 3935 + }, + { + "epoch": 0.6790304494091262, + "grad_norm": 0.625, + "learning_rate": 1.4908579070575936e-05, + "loss": 1.5271, + "step": 3936 + }, + { + "epoch": 0.6792029673078582, + "grad_norm": 0.796875, + "learning_rate": 1.4906207479955164e-05, + "loss": 1.4239, + "step": 3937 + }, + { + "epoch": 0.6793754852065902, + "grad_norm": 0.921875, + "learning_rate": 1.4903835525851884e-05, + "loss": 1.413, + "step": 3938 + }, + { + "epoch": 0.6795480031053222, + "grad_norm": 0.69140625, + "learning_rate": 1.4901463208441827e-05, + "loss": 1.4711, + "step": 3939 + }, + { + "epoch": 0.6797205210040542, + "grad_norm": 0.9609375, + "learning_rate": 1.489909052790075e-05, + "loss": 1.4645, + "step": 3940 + }, + { + "epoch": 0.6798930389027862, + "grad_norm": 0.75, + "learning_rate": 1.4896717484404437e-05, + "loss": 1.4735, + "step": 3941 + }, + { + "epoch": 0.6800655568015181, + "grad_norm": 0.65234375, + "learning_rate": 1.4894344078128696e-05, + "loss": 1.4367, + "step": 3942 + }, + { + "epoch": 0.6802380747002501, + "grad_norm": 0.66796875, + "learning_rate": 1.4891970309249361e-05, + "loss": 1.5428, + "step": 3943 + }, + { + "epoch": 0.6804105925989822, + "grad_norm": 0.796875, + "learning_rate": 1.4889596177942303e-05, + "loss": 1.3409, + "step": 3944 + }, + { + "epoch": 0.6805831104977141, + "grad_norm": 0.6875, + "learning_rate": 1.4887221684383407e-05, + "loss": 1.5084, + "step": 3945 + }, + { + "epoch": 0.6807556283964461, + "grad_norm": 0.578125, + "learning_rate": 1.4884846828748594e-05, + "loss": 1.4532, + "step": 3946 + }, + { + "epoch": 0.6809281462951782, + "grad_norm": 0.66796875, + "learning_rate": 1.4882471611213802e-05, + "loss": 1.4367, + "step": 3947 + }, + { + "epoch": 0.6811006641939101, + "grad_norm": 0.8359375, + "learning_rate": 1.488009603195501e-05, + "loss": 1.4749, + "step": 3948 + }, + { + "epoch": 0.6812731820926421, + "grad_norm": 0.6875, + "learning_rate": 1.4877720091148209e-05, + "loss": 1.4333, + "step": 3949 + }, + { + "epoch": 0.681445699991374, + "grad_norm": 0.64453125, + "learning_rate": 1.4875343788969426e-05, + "loss": 1.3904, + "step": 3950 + }, + { + "epoch": 0.6816182178901061, + "grad_norm": 0.609375, + "learning_rate": 1.4872967125594713e-05, + "loss": 1.4519, + "step": 3951 + }, + { + "epoch": 0.6817907357888381, + "grad_norm": 0.58203125, + "learning_rate": 1.4870590101200148e-05, + "loss": 1.4636, + "step": 3952 + }, + { + "epoch": 0.68196325368757, + "grad_norm": 0.73828125, + "learning_rate": 1.4868212715961838e-05, + "loss": 1.5242, + "step": 3953 + }, + { + "epoch": 0.6821357715863021, + "grad_norm": 0.66015625, + "learning_rate": 1.486583497005591e-05, + "loss": 1.4555, + "step": 3954 + }, + { + "epoch": 0.6823082894850341, + "grad_norm": 0.609375, + "learning_rate": 1.4863456863658522e-05, + "loss": 1.5198, + "step": 3955 + }, + { + "epoch": 0.682480807383766, + "grad_norm": 0.6640625, + "learning_rate": 1.4861078396945865e-05, + "loss": 1.3825, + "step": 3956 + }, + { + "epoch": 0.6826533252824981, + "grad_norm": 0.66796875, + "learning_rate": 1.4858699570094144e-05, + "loss": 1.3163, + "step": 3957 + }, + { + "epoch": 0.6828258431812301, + "grad_norm": 0.7265625, + "learning_rate": 1.4856320383279603e-05, + "loss": 1.3736, + "step": 3958 + }, + { + "epoch": 0.682998361079962, + "grad_norm": 0.65625, + "learning_rate": 1.4853940836678504e-05, + "loss": 1.361, + "step": 3959 + }, + { + "epoch": 0.683170878978694, + "grad_norm": 0.734375, + "learning_rate": 1.4851560930467137e-05, + "loss": 1.4556, + "step": 3960 + }, + { + "epoch": 0.6833433968774261, + "grad_norm": 0.73828125, + "learning_rate": 1.4849180664821822e-05, + "loss": 1.5069, + "step": 3961 + }, + { + "epoch": 0.683515914776158, + "grad_norm": 0.60546875, + "learning_rate": 1.4846800039918908e-05, + "loss": 1.4745, + "step": 3962 + }, + { + "epoch": 0.68368843267489, + "grad_norm": 0.77734375, + "learning_rate": 1.4844419055934761e-05, + "loss": 1.415, + "step": 3963 + }, + { + "epoch": 0.6838609505736221, + "grad_norm": 0.6015625, + "learning_rate": 1.4842037713045778e-05, + "loss": 1.3378, + "step": 3964 + }, + { + "epoch": 0.684033468472354, + "grad_norm": 0.66796875, + "learning_rate": 1.483965601142839e-05, + "loss": 1.5044, + "step": 3965 + }, + { + "epoch": 0.684205986371086, + "grad_norm": 0.64453125, + "learning_rate": 1.4837273951259044e-05, + "loss": 1.4319, + "step": 3966 + }, + { + "epoch": 0.6843785042698179, + "grad_norm": 0.67578125, + "learning_rate": 1.4834891532714218e-05, + "loss": 1.3895, + "step": 3967 + }, + { + "epoch": 0.68455102216855, + "grad_norm": 0.6484375, + "learning_rate": 1.4832508755970418e-05, + "loss": 1.3703, + "step": 3968 + }, + { + "epoch": 0.684723540067282, + "grad_norm": 0.5859375, + "learning_rate": 1.4830125621204177e-05, + "loss": 1.4502, + "step": 3969 + }, + { + "epoch": 0.6848960579660139, + "grad_norm": 0.57421875, + "learning_rate": 1.4827742128592046e-05, + "loss": 1.4776, + "step": 3970 + }, + { + "epoch": 0.685068575864746, + "grad_norm": 0.6328125, + "learning_rate": 1.4825358278310615e-05, + "loss": 1.4527, + "step": 3971 + }, + { + "epoch": 0.685241093763478, + "grad_norm": 0.63671875, + "learning_rate": 1.4822974070536493e-05, + "loss": 1.4456, + "step": 3972 + }, + { + "epoch": 0.6854136116622099, + "grad_norm": 0.61328125, + "learning_rate": 1.4820589505446316e-05, + "loss": 1.4968, + "step": 3973 + }, + { + "epoch": 0.6855861295609419, + "grad_norm": 0.60546875, + "learning_rate": 1.4818204583216749e-05, + "loss": 1.4718, + "step": 3974 + }, + { + "epoch": 0.685758647459674, + "grad_norm": 0.66796875, + "learning_rate": 1.4815819304024482e-05, + "loss": 1.4568, + "step": 3975 + }, + { + "epoch": 0.6859311653584059, + "grad_norm": 0.63671875, + "learning_rate": 1.4813433668046227e-05, + "loss": 1.4359, + "step": 3976 + }, + { + "epoch": 0.6861036832571379, + "grad_norm": 0.640625, + "learning_rate": 1.4811047675458729e-05, + "loss": 1.4681, + "step": 3977 + }, + { + "epoch": 0.68627620115587, + "grad_norm": 0.59765625, + "learning_rate": 1.4808661326438765e-05, + "loss": 1.4685, + "step": 3978 + }, + { + "epoch": 0.6864487190546019, + "grad_norm": 0.75390625, + "learning_rate": 1.480627462116312e-05, + "loss": 1.4094, + "step": 3979 + }, + { + "epoch": 0.6866212369533339, + "grad_norm": 0.5859375, + "learning_rate": 1.4803887559808618e-05, + "loss": 1.4911, + "step": 3980 + }, + { + "epoch": 0.686793754852066, + "grad_norm": 0.63671875, + "learning_rate": 1.4801500142552112e-05, + "loss": 1.4285, + "step": 3981 + }, + { + "epoch": 0.6869662727507979, + "grad_norm": 0.6640625, + "learning_rate": 1.4799112369570475e-05, + "loss": 1.4582, + "step": 3982 + }, + { + "epoch": 0.6871387906495299, + "grad_norm": 0.703125, + "learning_rate": 1.4796724241040604e-05, + "loss": 1.4502, + "step": 3983 + }, + { + "epoch": 0.6873113085482618, + "grad_norm": 0.62109375, + "learning_rate": 1.479433575713943e-05, + "loss": 1.5991, + "step": 3984 + }, + { + "epoch": 0.6874838264469939, + "grad_norm": 0.7109375, + "learning_rate": 1.4791946918043911e-05, + "loss": 1.5316, + "step": 3985 + }, + { + "epoch": 0.6876563443457259, + "grad_norm": 0.6015625, + "learning_rate": 1.478955772393102e-05, + "loss": 1.4841, + "step": 3986 + }, + { + "epoch": 0.6878288622444578, + "grad_norm": 0.60546875, + "learning_rate": 1.4787168174977768e-05, + "loss": 1.4967, + "step": 3987 + }, + { + "epoch": 0.6880013801431899, + "grad_norm": 0.6484375, + "learning_rate": 1.4784778271361185e-05, + "loss": 1.4086, + "step": 3988 + }, + { + "epoch": 0.6881738980419219, + "grad_norm": 0.671875, + "learning_rate": 1.478238801325833e-05, + "loss": 1.4017, + "step": 3989 + }, + { + "epoch": 0.6883464159406538, + "grad_norm": 0.7109375, + "learning_rate": 1.4779997400846292e-05, + "loss": 1.3941, + "step": 3990 + }, + { + "epoch": 0.6885189338393858, + "grad_norm": 0.9296875, + "learning_rate": 1.4777606434302176e-05, + "loss": 1.3313, + "step": 3991 + }, + { + "epoch": 0.6886914517381179, + "grad_norm": 0.8359375, + "learning_rate": 1.4775215113803127e-05, + "loss": 1.3779, + "step": 3992 + }, + { + "epoch": 0.6888639696368498, + "grad_norm": 0.703125, + "learning_rate": 1.4772823439526302e-05, + "loss": 1.4789, + "step": 3993 + }, + { + "epoch": 0.6890364875355818, + "grad_norm": 0.7265625, + "learning_rate": 1.4770431411648898e-05, + "loss": 1.5234, + "step": 3994 + }, + { + "epoch": 0.6892090054343138, + "grad_norm": 0.80078125, + "learning_rate": 1.4768039030348127e-05, + "loss": 1.5431, + "step": 3995 + }, + { + "epoch": 0.6893815233330458, + "grad_norm": 0.609375, + "learning_rate": 1.4765646295801231e-05, + "loss": 1.395, + "step": 3996 + }, + { + "epoch": 0.6895540412317778, + "grad_norm": 0.66015625, + "learning_rate": 1.4763253208185482e-05, + "loss": 1.391, + "step": 3997 + }, + { + "epoch": 0.6897265591305098, + "grad_norm": 0.8046875, + "learning_rate": 1.476085976767817e-05, + "loss": 1.4651, + "step": 3998 + }, + { + "epoch": 0.6898990770292418, + "grad_norm": 0.64453125, + "learning_rate": 1.4758465974456622e-05, + "loss": 1.5271, + "step": 3999 + }, + { + "epoch": 0.6900715949279738, + "grad_norm": 0.8515625, + "learning_rate": 1.4756071828698182e-05, + "loss": 1.4793, + "step": 4000 + }, + { + "epoch": 0.6900715949279738, + "eval_loss": 1.4241275787353516, + "eval_runtime": 10.9175, + "eval_samples_per_second": 93.794, + "eval_steps_per_second": 23.449, + "step": 4000 + }, + { + "epoch": 0.6902441128267057, + "grad_norm": 0.640625, + "learning_rate": 1.4753677330580223e-05, + "loss": 1.4217, + "step": 4001 + }, + { + "epoch": 0.6904166307254378, + "grad_norm": 0.6796875, + "learning_rate": 1.4751282480280147e-05, + "loss": 1.4645, + "step": 4002 + }, + { + "epoch": 0.6905891486241698, + "grad_norm": 0.578125, + "learning_rate": 1.4748887277975376e-05, + "loss": 1.4786, + "step": 4003 + }, + { + "epoch": 0.6907616665229017, + "grad_norm": 0.6953125, + "learning_rate": 1.4746491723843364e-05, + "loss": 1.4642, + "step": 4004 + }, + { + "epoch": 0.6909341844216338, + "grad_norm": 0.59765625, + "learning_rate": 1.4744095818061586e-05, + "loss": 1.4822, + "step": 4005 + }, + { + "epoch": 0.6911067023203658, + "grad_norm": 0.6640625, + "learning_rate": 1.474169956080755e-05, + "loss": 1.4705, + "step": 4006 + }, + { + "epoch": 0.6912792202190977, + "grad_norm": 0.890625, + "learning_rate": 1.473930295225878e-05, + "loss": 1.512, + "step": 4007 + }, + { + "epoch": 0.6914517381178297, + "grad_norm": 0.59375, + "learning_rate": 1.4736905992592837e-05, + "loss": 1.5389, + "step": 4008 + }, + { + "epoch": 0.6916242560165617, + "grad_norm": 0.5703125, + "learning_rate": 1.4734508681987296e-05, + "loss": 1.306, + "step": 4009 + }, + { + "epoch": 0.6917967739152937, + "grad_norm": 0.625, + "learning_rate": 1.4732111020619775e-05, + "loss": 1.4237, + "step": 4010 + }, + { + "epoch": 0.6919692918140257, + "grad_norm": 0.78515625, + "learning_rate": 1.4729713008667901e-05, + "loss": 1.4676, + "step": 4011 + }, + { + "epoch": 0.6921418097127577, + "grad_norm": 0.625, + "learning_rate": 1.4727314646309333e-05, + "loss": 1.5024, + "step": 4012 + }, + { + "epoch": 0.6923143276114897, + "grad_norm": 0.859375, + "learning_rate": 1.4724915933721758e-05, + "loss": 1.3662, + "step": 4013 + }, + { + "epoch": 0.6924868455102217, + "grad_norm": 0.63671875, + "learning_rate": 1.4722516871082886e-05, + "loss": 1.4298, + "step": 4014 + }, + { + "epoch": 0.6926593634089537, + "grad_norm": 0.6953125, + "learning_rate": 1.472011745857046e-05, + "loss": 1.3987, + "step": 4015 + }, + { + "epoch": 0.6928318813076857, + "grad_norm": 0.70703125, + "learning_rate": 1.471771769636224e-05, + "loss": 1.4815, + "step": 4016 + }, + { + "epoch": 0.6930043992064177, + "grad_norm": 3.40625, + "learning_rate": 1.4715317584636012e-05, + "loss": 1.524, + "step": 4017 + }, + { + "epoch": 0.6931769171051496, + "grad_norm": 0.76171875, + "learning_rate": 1.47129171235696e-05, + "loss": 1.4881, + "step": 4018 + }, + { + "epoch": 0.6933494350038817, + "grad_norm": 0.65234375, + "learning_rate": 1.4710516313340836e-05, + "loss": 1.5024, + "step": 4019 + }, + { + "epoch": 0.6935219529026136, + "grad_norm": 0.94140625, + "learning_rate": 1.470811515412759e-05, + "loss": 1.4709, + "step": 4020 + }, + { + "epoch": 0.6936944708013456, + "grad_norm": 0.6953125, + "learning_rate": 1.470571364610776e-05, + "loss": 1.4479, + "step": 4021 + }, + { + "epoch": 0.6938669887000777, + "grad_norm": 0.75, + "learning_rate": 1.4703311789459261e-05, + "loss": 1.4529, + "step": 4022 + }, + { + "epoch": 0.6940395065988096, + "grad_norm": 0.61328125, + "learning_rate": 1.4700909584360032e-05, + "loss": 1.3779, + "step": 4023 + }, + { + "epoch": 0.6942120244975416, + "grad_norm": 0.7578125, + "learning_rate": 1.4698507030988049e-05, + "loss": 1.482, + "step": 4024 + }, + { + "epoch": 0.6943845423962736, + "grad_norm": 0.734375, + "learning_rate": 1.469610412952131e-05, + "loss": 1.5018, + "step": 4025 + }, + { + "epoch": 0.6945570602950056, + "grad_norm": 0.6328125, + "learning_rate": 1.4693700880137835e-05, + "loss": 1.4349, + "step": 4026 + }, + { + "epoch": 0.6947295781937376, + "grad_norm": 0.85546875, + "learning_rate": 1.4691297283015669e-05, + "loss": 1.4857, + "step": 4027 + }, + { + "epoch": 0.6949020960924696, + "grad_norm": 0.83203125, + "learning_rate": 1.4688893338332888e-05, + "loss": 1.4458, + "step": 4028 + }, + { + "epoch": 0.6950746139912016, + "grad_norm": 0.5625, + "learning_rate": 1.4686489046267591e-05, + "loss": 1.3835, + "step": 4029 + }, + { + "epoch": 0.6952471318899336, + "grad_norm": 0.6875, + "learning_rate": 1.4684084406997903e-05, + "loss": 1.4679, + "step": 4030 + }, + { + "epoch": 0.6954196497886656, + "grad_norm": 0.71484375, + "learning_rate": 1.4681679420701972e-05, + "loss": 1.4534, + "step": 4031 + }, + { + "epoch": 0.6955921676873976, + "grad_norm": 0.82421875, + "learning_rate": 1.4679274087557981e-05, + "loss": 1.4716, + "step": 4032 + }, + { + "epoch": 0.6957646855861296, + "grad_norm": 0.60546875, + "learning_rate": 1.4676868407744126e-05, + "loss": 1.4279, + "step": 4033 + }, + { + "epoch": 0.6959372034848615, + "grad_norm": 0.94921875, + "learning_rate": 1.467446238143864e-05, + "loss": 1.3526, + "step": 4034 + }, + { + "epoch": 0.6961097213835935, + "grad_norm": 0.7265625, + "learning_rate": 1.467205600881977e-05, + "loss": 1.4274, + "step": 4035 + }, + { + "epoch": 0.6962822392823256, + "grad_norm": 0.5859375, + "learning_rate": 1.4669649290065801e-05, + "loss": 1.3703, + "step": 4036 + }, + { + "epoch": 0.6964547571810575, + "grad_norm": 0.70703125, + "learning_rate": 1.4667242225355034e-05, + "loss": 1.4281, + "step": 4037 + }, + { + "epoch": 0.6966272750797895, + "grad_norm": 0.66796875, + "learning_rate": 1.4664834814865802e-05, + "loss": 1.5386, + "step": 4038 + }, + { + "epoch": 0.6967997929785216, + "grad_norm": 0.59375, + "learning_rate": 1.4662427058776459e-05, + "loss": 1.5138, + "step": 4039 + }, + { + "epoch": 0.6969723108772535, + "grad_norm": 0.55859375, + "learning_rate": 1.4660018957265386e-05, + "loss": 1.4275, + "step": 4040 + }, + { + "epoch": 0.6971448287759855, + "grad_norm": 0.75390625, + "learning_rate": 1.4657610510510992e-05, + "loss": 1.415, + "step": 4041 + }, + { + "epoch": 0.6973173466747175, + "grad_norm": 0.6328125, + "learning_rate": 1.4655201718691712e-05, + "loss": 1.5508, + "step": 4042 + }, + { + "epoch": 0.6974898645734495, + "grad_norm": 0.66796875, + "learning_rate": 1.4652792581985997e-05, + "loss": 1.4285, + "step": 4043 + }, + { + "epoch": 0.6976623824721815, + "grad_norm": 0.640625, + "learning_rate": 1.4650383100572338e-05, + "loss": 1.4697, + "step": 4044 + }, + { + "epoch": 0.6978349003709134, + "grad_norm": 0.65625, + "learning_rate": 1.464797327462924e-05, + "loss": 1.4522, + "step": 4045 + }, + { + "epoch": 0.6980074182696455, + "grad_norm": 0.60546875, + "learning_rate": 1.4645563104335243e-05, + "loss": 1.5441, + "step": 4046 + }, + { + "epoch": 0.6981799361683775, + "grad_norm": 0.6484375, + "learning_rate": 1.4643152589868904e-05, + "loss": 1.499, + "step": 4047 + }, + { + "epoch": 0.6983524540671094, + "grad_norm": 0.7734375, + "learning_rate": 1.4640741731408805e-05, + "loss": 1.4008, + "step": 4048 + }, + { + "epoch": 0.6985249719658414, + "grad_norm": 0.69140625, + "learning_rate": 1.4638330529133566e-05, + "loss": 1.3955, + "step": 4049 + }, + { + "epoch": 0.6986974898645735, + "grad_norm": 0.7890625, + "learning_rate": 1.4635918983221823e-05, + "loss": 1.4477, + "step": 4050 + }, + { + "epoch": 0.6988700077633054, + "grad_norm": 0.73046875, + "learning_rate": 1.4633507093852229e-05, + "loss": 1.5475, + "step": 4051 + }, + { + "epoch": 0.6990425256620374, + "grad_norm": 0.57421875, + "learning_rate": 1.4631094861203478e-05, + "loss": 1.3691, + "step": 4052 + }, + { + "epoch": 0.6992150435607695, + "grad_norm": 0.78125, + "learning_rate": 1.4628682285454288e-05, + "loss": 1.4156, + "step": 4053 + }, + { + "epoch": 0.6993875614595014, + "grad_norm": 0.65234375, + "learning_rate": 1.462626936678339e-05, + "loss": 1.4583, + "step": 4054 + }, + { + "epoch": 0.6995600793582334, + "grad_norm": 0.68359375, + "learning_rate": 1.4623856105369552e-05, + "loss": 1.4267, + "step": 4055 + }, + { + "epoch": 0.6997325972569655, + "grad_norm": 0.58984375, + "learning_rate": 1.462144250139156e-05, + "loss": 1.4379, + "step": 4056 + }, + { + "epoch": 0.6999051151556974, + "grad_norm": 0.734375, + "learning_rate": 1.4619028555028234e-05, + "loss": 1.4115, + "step": 4057 + }, + { + "epoch": 0.7000776330544294, + "grad_norm": 0.7421875, + "learning_rate": 1.4616614266458413e-05, + "loss": 1.4709, + "step": 4058 + }, + { + "epoch": 0.7002501509531613, + "grad_norm": 0.7578125, + "learning_rate": 1.4614199635860958e-05, + "loss": 1.4998, + "step": 4059 + }, + { + "epoch": 0.7004226688518934, + "grad_norm": 0.6953125, + "learning_rate": 1.4611784663414765e-05, + "loss": 1.5, + "step": 4060 + }, + { + "epoch": 0.7005951867506254, + "grad_norm": 0.73828125, + "learning_rate": 1.4609369349298745e-05, + "loss": 1.4265, + "step": 4061 + }, + { + "epoch": 0.7007677046493573, + "grad_norm": 0.69140625, + "learning_rate": 1.4606953693691848e-05, + "loss": 1.4047, + "step": 4062 + }, + { + "epoch": 0.7009402225480894, + "grad_norm": 0.66796875, + "learning_rate": 1.4604537696773031e-05, + "loss": 1.4074, + "step": 4063 + }, + { + "epoch": 0.7011127404468214, + "grad_norm": 0.7109375, + "learning_rate": 1.4602121358721295e-05, + "loss": 1.5996, + "step": 4064 + }, + { + "epoch": 0.7012852583455533, + "grad_norm": 0.76171875, + "learning_rate": 1.4599704679715649e-05, + "loss": 1.448, + "step": 4065 + }, + { + "epoch": 0.7014577762442853, + "grad_norm": 0.62890625, + "learning_rate": 1.4597287659935146e-05, + "loss": 1.46, + "step": 4066 + }, + { + "epoch": 0.7016302941430174, + "grad_norm": 0.6015625, + "learning_rate": 1.4594870299558842e-05, + "loss": 1.5046, + "step": 4067 + }, + { + "epoch": 0.7018028120417493, + "grad_norm": 0.66796875, + "learning_rate": 1.459245259876584e-05, + "loss": 1.5037, + "step": 4068 + }, + { + "epoch": 0.7019753299404813, + "grad_norm": 0.7265625, + "learning_rate": 1.4590034557735253e-05, + "loss": 1.3754, + "step": 4069 + }, + { + "epoch": 0.7021478478392134, + "grad_norm": 0.66796875, + "learning_rate": 1.4587616176646229e-05, + "loss": 1.3533, + "step": 4070 + }, + { + "epoch": 0.7023203657379453, + "grad_norm": 0.65234375, + "learning_rate": 1.458519745567793e-05, + "loss": 1.4342, + "step": 4071 + }, + { + "epoch": 0.7024928836366773, + "grad_norm": 0.7265625, + "learning_rate": 1.4582778395009556e-05, + "loss": 1.408, + "step": 4072 + }, + { + "epoch": 0.7026654015354094, + "grad_norm": 0.5625, + "learning_rate": 1.4580358994820322e-05, + "loss": 1.423, + "step": 4073 + }, + { + "epoch": 0.7028379194341413, + "grad_norm": 0.69921875, + "learning_rate": 1.4577939255289477e-05, + "loss": 1.4098, + "step": 4074 + }, + { + "epoch": 0.7030104373328733, + "grad_norm": 0.75390625, + "learning_rate": 1.4575519176596286e-05, + "loss": 1.4431, + "step": 4075 + }, + { + "epoch": 0.7031829552316052, + "grad_norm": 0.64453125, + "learning_rate": 1.4573098758920046e-05, + "loss": 1.408, + "step": 4076 + }, + { + "epoch": 0.7033554731303373, + "grad_norm": 0.75390625, + "learning_rate": 1.457067800244007e-05, + "loss": 1.4398, + "step": 4077 + }, + { + "epoch": 0.7035279910290693, + "grad_norm": 0.7734375, + "learning_rate": 1.4568256907335717e-05, + "loss": 1.4929, + "step": 4078 + }, + { + "epoch": 0.7037005089278012, + "grad_norm": 1.03125, + "learning_rate": 1.4565835473786345e-05, + "loss": 1.4999, + "step": 4079 + }, + { + "epoch": 0.7038730268265333, + "grad_norm": 0.71875, + "learning_rate": 1.4563413701971354e-05, + "loss": 1.432, + "step": 4080 + }, + { + "epoch": 0.7040455447252653, + "grad_norm": 0.62109375, + "learning_rate": 1.4560991592070159e-05, + "loss": 1.4448, + "step": 4081 + }, + { + "epoch": 0.7042180626239972, + "grad_norm": 0.65234375, + "learning_rate": 1.4558569144262213e-05, + "loss": 1.3809, + "step": 4082 + }, + { + "epoch": 0.7043905805227292, + "grad_norm": 0.71875, + "learning_rate": 1.455614635872698e-05, + "loss": 1.4151, + "step": 4083 + }, + { + "epoch": 0.7045630984214613, + "grad_norm": 0.625, + "learning_rate": 1.4553723235643955e-05, + "loss": 1.4452, + "step": 4084 + }, + { + "epoch": 0.7047356163201932, + "grad_norm": 0.671875, + "learning_rate": 1.455129977519266e-05, + "loss": 1.3651, + "step": 4085 + }, + { + "epoch": 0.7049081342189252, + "grad_norm": 0.65234375, + "learning_rate": 1.4548875977552643e-05, + "loss": 1.3227, + "step": 4086 + }, + { + "epoch": 0.7050806521176572, + "grad_norm": 0.609375, + "learning_rate": 1.4546451842903468e-05, + "loss": 1.4218, + "step": 4087 + }, + { + "epoch": 0.7052531700163892, + "grad_norm": 0.7890625, + "learning_rate": 1.4544027371424732e-05, + "loss": 1.4026, + "step": 4088 + }, + { + "epoch": 0.7054256879151212, + "grad_norm": 0.73828125, + "learning_rate": 1.4541602563296058e-05, + "loss": 1.3546, + "step": 4089 + }, + { + "epoch": 0.7055982058138532, + "grad_norm": 0.6484375, + "learning_rate": 1.453917741869709e-05, + "loss": 1.4975, + "step": 4090 + }, + { + "epoch": 0.7057707237125852, + "grad_norm": 0.69140625, + "learning_rate": 1.4536751937807493e-05, + "loss": 1.4868, + "step": 4091 + }, + { + "epoch": 0.7059432416113172, + "grad_norm": 0.71875, + "learning_rate": 1.4534326120806968e-05, + "loss": 1.4914, + "step": 4092 + }, + { + "epoch": 0.7061157595100491, + "grad_norm": 0.69140625, + "learning_rate": 1.4531899967875229e-05, + "loss": 1.4802, + "step": 4093 + }, + { + "epoch": 0.7062882774087812, + "grad_norm": 0.6953125, + "learning_rate": 1.4529473479192027e-05, + "loss": 1.442, + "step": 4094 + }, + { + "epoch": 0.7064607953075132, + "grad_norm": 0.84765625, + "learning_rate": 1.4527046654937128e-05, + "loss": 1.5061, + "step": 4095 + }, + { + "epoch": 0.7066333132062451, + "grad_norm": 0.65234375, + "learning_rate": 1.4524619495290324e-05, + "loss": 1.3328, + "step": 4096 + }, + { + "epoch": 0.7068058311049772, + "grad_norm": 0.58984375, + "learning_rate": 1.4522192000431439e-05, + "loss": 1.5061, + "step": 4097 + }, + { + "epoch": 0.7069783490037092, + "grad_norm": 0.71875, + "learning_rate": 1.4519764170540315e-05, + "loss": 1.4898, + "step": 4098 + }, + { + "epoch": 0.7071508669024411, + "grad_norm": 0.89453125, + "learning_rate": 1.451733600579682e-05, + "loss": 1.4641, + "step": 4099 + }, + { + "epoch": 0.7073233848011731, + "grad_norm": 0.62109375, + "learning_rate": 1.451490750638085e-05, + "loss": 1.3865, + "step": 4100 + }, + { + "epoch": 0.7073233848011731, + "eval_loss": 1.4227409362792969, + "eval_runtime": 10.9937, + "eval_samples_per_second": 93.145, + "eval_steps_per_second": 23.286, + "step": 4100 + }, + { + "epoch": 0.7074959026999051, + "grad_norm": 0.62890625, + "learning_rate": 1.4512478672472318e-05, + "loss": 1.4121, + "step": 4101 + }, + { + "epoch": 0.7076684205986371, + "grad_norm": 0.76171875, + "learning_rate": 1.4510049504251174e-05, + "loss": 1.3894, + "step": 4102 + }, + { + "epoch": 0.7078409384973691, + "grad_norm": 0.62890625, + "learning_rate": 1.450762000189738e-05, + "loss": 1.4696, + "step": 4103 + }, + { + "epoch": 0.7080134563961011, + "grad_norm": 0.83203125, + "learning_rate": 1.4505190165590932e-05, + "loss": 1.4553, + "step": 4104 + }, + { + "epoch": 0.7081859742948331, + "grad_norm": 0.80078125, + "learning_rate": 1.4502759995511845e-05, + "loss": 1.4934, + "step": 4105 + }, + { + "epoch": 0.7083584921935651, + "grad_norm": 0.69140625, + "learning_rate": 1.4500329491840165e-05, + "loss": 1.5194, + "step": 4106 + }, + { + "epoch": 0.7085310100922971, + "grad_norm": 0.64453125, + "learning_rate": 1.4497898654755957e-05, + "loss": 1.4484, + "step": 4107 + }, + { + "epoch": 0.7087035279910291, + "grad_norm": 0.6015625, + "learning_rate": 1.4495467484439312e-05, + "loss": 1.5005, + "step": 4108 + }, + { + "epoch": 0.708876045889761, + "grad_norm": 0.66015625, + "learning_rate": 1.4493035981070347e-05, + "loss": 1.4317, + "step": 4109 + }, + { + "epoch": 0.709048563788493, + "grad_norm": 0.66796875, + "learning_rate": 1.4490604144829204e-05, + "loss": 1.4461, + "step": 4110 + }, + { + "epoch": 0.7092210816872251, + "grad_norm": 0.8046875, + "learning_rate": 1.4488171975896044e-05, + "loss": 1.5005, + "step": 4111 + }, + { + "epoch": 0.709393599585957, + "grad_norm": 0.64453125, + "learning_rate": 1.4485739474451061e-05, + "loss": 1.5018, + "step": 4112 + }, + { + "epoch": 0.709566117484689, + "grad_norm": 0.63671875, + "learning_rate": 1.4483306640674473e-05, + "loss": 1.4708, + "step": 4113 + }, + { + "epoch": 0.7097386353834211, + "grad_norm": 0.56640625, + "learning_rate": 1.4480873474746515e-05, + "loss": 1.3709, + "step": 4114 + }, + { + "epoch": 0.709911153282153, + "grad_norm": 0.74609375, + "learning_rate": 1.4478439976847452e-05, + "loss": 1.4258, + "step": 4115 + }, + { + "epoch": 0.710083671180885, + "grad_norm": 0.609375, + "learning_rate": 1.4476006147157571e-05, + "loss": 1.4294, + "step": 4116 + }, + { + "epoch": 0.710256189079617, + "grad_norm": 0.59765625, + "learning_rate": 1.4473571985857189e-05, + "loss": 1.4314, + "step": 4117 + }, + { + "epoch": 0.710428706978349, + "grad_norm": 0.6640625, + "learning_rate": 1.4471137493126643e-05, + "loss": 1.4131, + "step": 4118 + }, + { + "epoch": 0.710601224877081, + "grad_norm": 0.62109375, + "learning_rate": 1.4468702669146292e-05, + "loss": 1.3848, + "step": 4119 + }, + { + "epoch": 0.710773742775813, + "grad_norm": 0.6171875, + "learning_rate": 1.4466267514096527e-05, + "loss": 1.4524, + "step": 4120 + }, + { + "epoch": 0.710946260674545, + "grad_norm": 0.65625, + "learning_rate": 1.4463832028157758e-05, + "loss": 1.5744, + "step": 4121 + }, + { + "epoch": 0.711118778573277, + "grad_norm": 0.671875, + "learning_rate": 1.4461396211510421e-05, + "loss": 1.5232, + "step": 4122 + }, + { + "epoch": 0.711291296472009, + "grad_norm": 0.61328125, + "learning_rate": 1.4458960064334977e-05, + "loss": 1.4307, + "step": 4123 + }, + { + "epoch": 0.7114638143707409, + "grad_norm": 0.6484375, + "learning_rate": 1.4456523586811911e-05, + "loss": 1.4285, + "step": 4124 + }, + { + "epoch": 0.711636332269473, + "grad_norm": 0.625, + "learning_rate": 1.445408677912173e-05, + "loss": 1.3565, + "step": 4125 + }, + { + "epoch": 0.7118088501682049, + "grad_norm": 0.6328125, + "learning_rate": 1.445164964144497e-05, + "loss": 1.4898, + "step": 4126 + }, + { + "epoch": 0.7119813680669369, + "grad_norm": 0.59765625, + "learning_rate": 1.444921217396219e-05, + "loss": 1.4365, + "step": 4127 + }, + { + "epoch": 0.712153885965669, + "grad_norm": 0.59765625, + "learning_rate": 1.4446774376853973e-05, + "loss": 1.4543, + "step": 4128 + }, + { + "epoch": 0.7123264038644009, + "grad_norm": 0.63671875, + "learning_rate": 1.4444336250300926e-05, + "loss": 1.4361, + "step": 4129 + }, + { + "epoch": 0.7124989217631329, + "grad_norm": 0.859375, + "learning_rate": 1.4441897794483679e-05, + "loss": 1.4084, + "step": 4130 + }, + { + "epoch": 0.712671439661865, + "grad_norm": 0.578125, + "learning_rate": 1.443945900958289e-05, + "loss": 1.4301, + "step": 4131 + }, + { + "epoch": 0.7128439575605969, + "grad_norm": 0.62890625, + "learning_rate": 1.4437019895779235e-05, + "loss": 1.6045, + "step": 4132 + }, + { + "epoch": 0.7130164754593289, + "grad_norm": 0.66796875, + "learning_rate": 1.4434580453253426e-05, + "loss": 1.4778, + "step": 4133 + }, + { + "epoch": 0.7131889933580609, + "grad_norm": 0.6796875, + "learning_rate": 1.4432140682186192e-05, + "loss": 1.4129, + "step": 4134 + }, + { + "epoch": 0.7133615112567929, + "grad_norm": 0.6484375, + "learning_rate": 1.4429700582758276e-05, + "loss": 1.4951, + "step": 4135 + }, + { + "epoch": 0.7135340291555249, + "grad_norm": 1.1640625, + "learning_rate": 1.4427260155150466e-05, + "loss": 1.4966, + "step": 4136 + }, + { + "epoch": 0.7137065470542568, + "grad_norm": 0.62890625, + "learning_rate": 1.4424819399543559e-05, + "loss": 1.3954, + "step": 4137 + }, + { + "epoch": 0.7138790649529889, + "grad_norm": 0.58984375, + "learning_rate": 1.4422378316118384e-05, + "loss": 1.4021, + "step": 4138 + }, + { + "epoch": 0.7140515828517209, + "grad_norm": 0.640625, + "learning_rate": 1.4419936905055794e-05, + "loss": 1.4526, + "step": 4139 + }, + { + "epoch": 0.7142241007504528, + "grad_norm": 0.62890625, + "learning_rate": 1.4417495166536659e-05, + "loss": 1.3207, + "step": 4140 + }, + { + "epoch": 0.7143966186491848, + "grad_norm": 0.59375, + "learning_rate": 1.4415053100741879e-05, + "loss": 1.4194, + "step": 4141 + }, + { + "epoch": 0.7145691365479169, + "grad_norm": 0.73046875, + "learning_rate": 1.4412610707852378e-05, + "loss": 1.4431, + "step": 4142 + }, + { + "epoch": 0.7147416544466488, + "grad_norm": 0.64453125, + "learning_rate": 1.4410167988049106e-05, + "loss": 1.5113, + "step": 4143 + }, + { + "epoch": 0.7149141723453808, + "grad_norm": 0.82421875, + "learning_rate": 1.4407724941513035e-05, + "loss": 1.5396, + "step": 4144 + }, + { + "epoch": 0.7150866902441129, + "grad_norm": 0.84375, + "learning_rate": 1.4405281568425158e-05, + "loss": 1.5188, + "step": 4145 + }, + { + "epoch": 0.7152592081428448, + "grad_norm": 0.58984375, + "learning_rate": 1.4402837868966498e-05, + "loss": 1.3558, + "step": 4146 + }, + { + "epoch": 0.7154317260415768, + "grad_norm": 0.59375, + "learning_rate": 1.4400393843318097e-05, + "loss": 1.4856, + "step": 4147 + }, + { + "epoch": 0.7156042439403089, + "grad_norm": 0.61328125, + "learning_rate": 1.4397949491661027e-05, + "loss": 1.3808, + "step": 4148 + }, + { + "epoch": 0.7157767618390408, + "grad_norm": 0.60546875, + "learning_rate": 1.4395504814176376e-05, + "loss": 1.3694, + "step": 4149 + }, + { + "epoch": 0.7159492797377728, + "grad_norm": 0.6171875, + "learning_rate": 1.4393059811045267e-05, + "loss": 1.3789, + "step": 4150 + }, + { + "epoch": 0.7161217976365047, + "grad_norm": 0.6640625, + "learning_rate": 1.4390614482448837e-05, + "loss": 1.4717, + "step": 4151 + }, + { + "epoch": 0.7162943155352368, + "grad_norm": 0.5859375, + "learning_rate": 1.4388168828568252e-05, + "loss": 1.4533, + "step": 4152 + }, + { + "epoch": 0.7164668334339688, + "grad_norm": 0.66015625, + "learning_rate": 1.4385722849584702e-05, + "loss": 1.4272, + "step": 4153 + }, + { + "epoch": 0.7166393513327007, + "grad_norm": 2.0625, + "learning_rate": 1.4383276545679398e-05, + "loss": 1.4254, + "step": 4154 + }, + { + "epoch": 0.7168118692314328, + "grad_norm": 0.6640625, + "learning_rate": 1.4380829917033585e-05, + "loss": 1.5077, + "step": 4155 + }, + { + "epoch": 0.7169843871301648, + "grad_norm": 0.58984375, + "learning_rate": 1.4378382963828515e-05, + "loss": 1.3822, + "step": 4156 + }, + { + "epoch": 0.7171569050288967, + "grad_norm": 0.6484375, + "learning_rate": 1.4375935686245477e-05, + "loss": 1.5273, + "step": 4157 + }, + { + "epoch": 0.7173294229276287, + "grad_norm": 0.66015625, + "learning_rate": 1.4373488084465783e-05, + "loss": 1.4555, + "step": 4158 + }, + { + "epoch": 0.7175019408263608, + "grad_norm": 0.60546875, + "learning_rate": 1.4371040158670763e-05, + "loss": 1.5219, + "step": 4159 + }, + { + "epoch": 0.7176744587250927, + "grad_norm": 0.59765625, + "learning_rate": 1.4368591909041778e-05, + "loss": 1.4713, + "step": 4160 + }, + { + "epoch": 0.7178469766238247, + "grad_norm": 0.58984375, + "learning_rate": 1.4366143335760208e-05, + "loss": 1.5836, + "step": 4161 + }, + { + "epoch": 0.7180194945225568, + "grad_norm": 0.7890625, + "learning_rate": 1.436369443900746e-05, + "loss": 1.4414, + "step": 4162 + }, + { + "epoch": 0.7181920124212887, + "grad_norm": 0.66796875, + "learning_rate": 1.4361245218964961e-05, + "loss": 1.4173, + "step": 4163 + }, + { + "epoch": 0.7183645303200207, + "grad_norm": 0.625, + "learning_rate": 1.4358795675814165e-05, + "loss": 1.3769, + "step": 4164 + }, + { + "epoch": 0.7185370482187527, + "grad_norm": 0.640625, + "learning_rate": 1.4356345809736552e-05, + "loss": 1.4559, + "step": 4165 + }, + { + "epoch": 0.7187095661174847, + "grad_norm": 0.578125, + "learning_rate": 1.4353895620913619e-05, + "loss": 1.4834, + "step": 4166 + }, + { + "epoch": 0.7188820840162167, + "grad_norm": 0.60546875, + "learning_rate": 1.4351445109526897e-05, + "loss": 1.4671, + "step": 4167 + }, + { + "epoch": 0.7190546019149486, + "grad_norm": 0.609375, + "learning_rate": 1.4348994275757933e-05, + "loss": 1.4191, + "step": 4168 + }, + { + "epoch": 0.7192271198136807, + "grad_norm": 0.64453125, + "learning_rate": 1.4346543119788297e-05, + "loss": 1.4187, + "step": 4169 + }, + { + "epoch": 0.7193996377124127, + "grad_norm": 1.0234375, + "learning_rate": 1.4344091641799587e-05, + "loss": 1.4377, + "step": 4170 + }, + { + "epoch": 0.7195721556111446, + "grad_norm": 0.62109375, + "learning_rate": 1.434163984197343e-05, + "loss": 1.512, + "step": 4171 + }, + { + "epoch": 0.7197446735098767, + "grad_norm": 0.625, + "learning_rate": 1.433918772049146e-05, + "loss": 1.5385, + "step": 4172 + }, + { + "epoch": 0.7199171914086087, + "grad_norm": 0.640625, + "learning_rate": 1.4336735277535355e-05, + "loss": 1.4165, + "step": 4173 + }, + { + "epoch": 0.7200897093073406, + "grad_norm": 0.57421875, + "learning_rate": 1.4334282513286799e-05, + "loss": 1.5712, + "step": 4174 + }, + { + "epoch": 0.7202622272060726, + "grad_norm": 0.62109375, + "learning_rate": 1.4331829427927518e-05, + "loss": 1.4407, + "step": 4175 + }, + { + "epoch": 0.7204347451048047, + "grad_norm": 0.66796875, + "learning_rate": 1.4329376021639244e-05, + "loss": 1.5025, + "step": 4176 + }, + { + "epoch": 0.7206072630035366, + "grad_norm": 0.6328125, + "learning_rate": 1.4326922294603743e-05, + "loss": 1.4318, + "step": 4177 + }, + { + "epoch": 0.7207797809022686, + "grad_norm": 0.640625, + "learning_rate": 1.4324468247002802e-05, + "loss": 1.3621, + "step": 4178 + }, + { + "epoch": 0.7209522988010006, + "grad_norm": 0.73046875, + "learning_rate": 1.4322013879018233e-05, + "loss": 1.4792, + "step": 4179 + }, + { + "epoch": 0.7211248166997326, + "grad_norm": 0.60546875, + "learning_rate": 1.4319559190831872e-05, + "loss": 1.3502, + "step": 4180 + }, + { + "epoch": 0.7212973345984646, + "grad_norm": 0.6015625, + "learning_rate": 1.4317104182625573e-05, + "loss": 1.5503, + "step": 4181 + }, + { + "epoch": 0.7214698524971965, + "grad_norm": 0.65625, + "learning_rate": 1.4314648854581225e-05, + "loss": 1.429, + "step": 4182 + }, + { + "epoch": 0.7216423703959286, + "grad_norm": 0.625, + "learning_rate": 1.431219320688073e-05, + "loss": 1.3568, + "step": 4183 + }, + { + "epoch": 0.7218148882946606, + "grad_norm": 0.640625, + "learning_rate": 1.4309737239706019e-05, + "loss": 1.4368, + "step": 4184 + }, + { + "epoch": 0.7219874061933925, + "grad_norm": 0.703125, + "learning_rate": 1.4307280953239044e-05, + "loss": 1.4313, + "step": 4185 + }, + { + "epoch": 0.7221599240921246, + "grad_norm": 0.71484375, + "learning_rate": 1.4304824347661783e-05, + "loss": 1.399, + "step": 4186 + }, + { + "epoch": 0.7223324419908566, + "grad_norm": 0.5859375, + "learning_rate": 1.4302367423156236e-05, + "loss": 1.4066, + "step": 4187 + }, + { + "epoch": 0.7225049598895885, + "grad_norm": 0.64453125, + "learning_rate": 1.4299910179904429e-05, + "loss": 1.3609, + "step": 4188 + }, + { + "epoch": 0.7226774777883206, + "grad_norm": 0.90234375, + "learning_rate": 1.4297452618088407e-05, + "loss": 1.4241, + "step": 4189 + }, + { + "epoch": 0.7228499956870525, + "grad_norm": 0.58984375, + "learning_rate": 1.4294994737890245e-05, + "loss": 1.3455, + "step": 4190 + }, + { + "epoch": 0.7230225135857845, + "grad_norm": 0.671875, + "learning_rate": 1.4292536539492034e-05, + "loss": 1.5229, + "step": 4191 + }, + { + "epoch": 0.7231950314845165, + "grad_norm": 0.8984375, + "learning_rate": 1.4290078023075897e-05, + "loss": 1.5029, + "step": 4192 + }, + { + "epoch": 0.7233675493832485, + "grad_norm": 0.703125, + "learning_rate": 1.4287619188823975e-05, + "loss": 1.4714, + "step": 4193 + }, + { + "epoch": 0.7235400672819805, + "grad_norm": 0.75390625, + "learning_rate": 1.4285160036918431e-05, + "loss": 1.4747, + "step": 4194 + }, + { + "epoch": 0.7237125851807125, + "grad_norm": 0.8203125, + "learning_rate": 1.4282700567541461e-05, + "loss": 1.4821, + "step": 4195 + }, + { + "epoch": 0.7238851030794445, + "grad_norm": 0.71875, + "learning_rate": 1.428024078087527e-05, + "loss": 1.4791, + "step": 4196 + }, + { + "epoch": 0.7240576209781765, + "grad_norm": 0.58203125, + "learning_rate": 1.4277780677102098e-05, + "loss": 1.3904, + "step": 4197 + }, + { + "epoch": 0.7242301388769085, + "grad_norm": 0.9609375, + "learning_rate": 1.4275320256404203e-05, + "loss": 1.5428, + "step": 4198 + }, + { + "epoch": 0.7244026567756404, + "grad_norm": 0.74609375, + "learning_rate": 1.4272859518963874e-05, + "loss": 1.4717, + "step": 4199 + }, + { + "epoch": 0.7245751746743725, + "grad_norm": 0.66796875, + "learning_rate": 1.4270398464963411e-05, + "loss": 1.5452, + "step": 4200 + }, + { + "epoch": 0.7245751746743725, + "eval_loss": 1.4216216802597046, + "eval_runtime": 10.8689, + "eval_samples_per_second": 94.213, + "eval_steps_per_second": 23.553, + "step": 4200 + }, + { + "epoch": 0.7247476925731045, + "grad_norm": 0.734375, + "learning_rate": 1.4267937094585148e-05, + "loss": 1.445, + "step": 4201 + }, + { + "epoch": 0.7249202104718364, + "grad_norm": 0.71875, + "learning_rate": 1.4265475408011438e-05, + "loss": 1.5107, + "step": 4202 + }, + { + "epoch": 0.7250927283705685, + "grad_norm": 0.671875, + "learning_rate": 1.4263013405424657e-05, + "loss": 1.4799, + "step": 4203 + }, + { + "epoch": 0.7252652462693004, + "grad_norm": 0.62890625, + "learning_rate": 1.4260551087007205e-05, + "loss": 1.4503, + "step": 4204 + }, + { + "epoch": 0.7254377641680324, + "grad_norm": 0.75, + "learning_rate": 1.425808845294151e-05, + "loss": 1.4823, + "step": 4205 + }, + { + "epoch": 0.7256102820667645, + "grad_norm": 0.69921875, + "learning_rate": 1.4255625503410015e-05, + "loss": 1.3246, + "step": 4206 + }, + { + "epoch": 0.7257827999654964, + "grad_norm": 0.578125, + "learning_rate": 1.4253162238595192e-05, + "loss": 1.4137, + "step": 4207 + }, + { + "epoch": 0.7259553178642284, + "grad_norm": 0.578125, + "learning_rate": 1.4250698658679535e-05, + "loss": 1.359, + "step": 4208 + }, + { + "epoch": 0.7261278357629604, + "grad_norm": 0.7109375, + "learning_rate": 1.4248234763845565e-05, + "loss": 1.4221, + "step": 4209 + }, + { + "epoch": 0.7263003536616924, + "grad_norm": 0.59765625, + "learning_rate": 1.4245770554275817e-05, + "loss": 1.4439, + "step": 4210 + }, + { + "epoch": 0.7264728715604244, + "grad_norm": 0.67578125, + "learning_rate": 1.424330603015286e-05, + "loss": 1.3822, + "step": 4211 + }, + { + "epoch": 0.7266453894591564, + "grad_norm": 0.62109375, + "learning_rate": 1.4240841191659276e-05, + "loss": 1.4447, + "step": 4212 + }, + { + "epoch": 0.7268179073578884, + "grad_norm": 0.67578125, + "learning_rate": 1.423837603897768e-05, + "loss": 1.4547, + "step": 4213 + }, + { + "epoch": 0.7269904252566204, + "grad_norm": 0.58984375, + "learning_rate": 1.4235910572290704e-05, + "loss": 1.5104, + "step": 4214 + }, + { + "epoch": 0.7271629431553523, + "grad_norm": 0.65234375, + "learning_rate": 1.4233444791781005e-05, + "loss": 1.4798, + "step": 4215 + }, + { + "epoch": 0.7273354610540843, + "grad_norm": 0.61328125, + "learning_rate": 1.4230978697631266e-05, + "loss": 1.4985, + "step": 4216 + }, + { + "epoch": 0.7275079789528164, + "grad_norm": 0.63671875, + "learning_rate": 1.4228512290024185e-05, + "loss": 1.3978, + "step": 4217 + }, + { + "epoch": 0.7276804968515483, + "grad_norm": 0.69921875, + "learning_rate": 1.4226045569142497e-05, + "loss": 1.4935, + "step": 4218 + }, + { + "epoch": 0.7278530147502803, + "grad_norm": 0.640625, + "learning_rate": 1.4223578535168947e-05, + "loss": 1.4479, + "step": 4219 + }, + { + "epoch": 0.7280255326490124, + "grad_norm": 0.6484375, + "learning_rate": 1.4221111188286307e-05, + "loss": 1.4613, + "step": 4220 + }, + { + "epoch": 0.7281980505477443, + "grad_norm": 0.59375, + "learning_rate": 1.4218643528677377e-05, + "loss": 1.4544, + "step": 4221 + }, + { + "epoch": 0.7283705684464763, + "grad_norm": 0.60546875, + "learning_rate": 1.421617555652497e-05, + "loss": 1.4306, + "step": 4222 + }, + { + "epoch": 0.7285430863452084, + "grad_norm": 0.609375, + "learning_rate": 1.421370727201194e-05, + "loss": 1.5617, + "step": 4223 + }, + { + "epoch": 0.7287156042439403, + "grad_norm": 0.625, + "learning_rate": 1.4211238675321143e-05, + "loss": 1.3514, + "step": 4224 + }, + { + "epoch": 0.7288881221426723, + "grad_norm": 0.5859375, + "learning_rate": 1.4208769766635475e-05, + "loss": 1.5241, + "step": 4225 + }, + { + "epoch": 0.7290606400414043, + "grad_norm": 0.59375, + "learning_rate": 1.4206300546137844e-05, + "loss": 1.554, + "step": 4226 + }, + { + "epoch": 0.7292331579401363, + "grad_norm": 0.8046875, + "learning_rate": 1.4203831014011186e-05, + "loss": 1.4732, + "step": 4227 + }, + { + "epoch": 0.7294056758388683, + "grad_norm": 0.703125, + "learning_rate": 1.4201361170438461e-05, + "loss": 1.4524, + "step": 4228 + }, + { + "epoch": 0.7295781937376002, + "grad_norm": 0.67578125, + "learning_rate": 1.4198891015602648e-05, + "loss": 1.4721, + "step": 4229 + }, + { + "epoch": 0.7297507116363323, + "grad_norm": 0.640625, + "learning_rate": 1.419642054968675e-05, + "loss": 1.4939, + "step": 4230 + }, + { + "epoch": 0.7299232295350643, + "grad_norm": 0.578125, + "learning_rate": 1.4193949772873804e-05, + "loss": 1.4559, + "step": 4231 + }, + { + "epoch": 0.7300957474337962, + "grad_norm": 0.6640625, + "learning_rate": 1.4191478685346849e-05, + "loss": 1.3952, + "step": 4232 + }, + { + "epoch": 0.7302682653325282, + "grad_norm": 0.578125, + "learning_rate": 1.4189007287288963e-05, + "loss": 1.4461, + "step": 4233 + }, + { + "epoch": 0.7304407832312603, + "grad_norm": 0.61328125, + "learning_rate": 1.4186535578883244e-05, + "loss": 1.4821, + "step": 4234 + }, + { + "epoch": 0.7306133011299922, + "grad_norm": 0.703125, + "learning_rate": 1.4184063560312814e-05, + "loss": 1.4381, + "step": 4235 + }, + { + "epoch": 0.7307858190287242, + "grad_norm": 0.59765625, + "learning_rate": 1.4181591231760807e-05, + "loss": 1.4034, + "step": 4236 + }, + { + "epoch": 0.7309583369274563, + "grad_norm": 0.56640625, + "learning_rate": 1.4179118593410395e-05, + "loss": 1.3891, + "step": 4237 + }, + { + "epoch": 0.7311308548261882, + "grad_norm": 0.64453125, + "learning_rate": 1.4176645645444765e-05, + "loss": 1.523, + "step": 4238 + }, + { + "epoch": 0.7313033727249202, + "grad_norm": 0.58203125, + "learning_rate": 1.4174172388047132e-05, + "loss": 1.4271, + "step": 4239 + }, + { + "epoch": 0.7314758906236523, + "grad_norm": 0.75390625, + "learning_rate": 1.4171698821400724e-05, + "loss": 1.4633, + "step": 4240 + }, + { + "epoch": 0.7316484085223842, + "grad_norm": 0.63671875, + "learning_rate": 1.4169224945688801e-05, + "loss": 1.4301, + "step": 4241 + }, + { + "epoch": 0.7318209264211162, + "grad_norm": 0.5625, + "learning_rate": 1.4166750761094646e-05, + "loss": 1.3832, + "step": 4242 + }, + { + "epoch": 0.7319934443198481, + "grad_norm": 0.6171875, + "learning_rate": 1.416427626780156e-05, + "loss": 1.4208, + "step": 4243 + }, + { + "epoch": 0.7321659622185802, + "grad_norm": 0.59375, + "learning_rate": 1.4161801465992867e-05, + "loss": 1.4036, + "step": 4244 + }, + { + "epoch": 0.7323384801173122, + "grad_norm": 0.58984375, + "learning_rate": 1.4159326355851915e-05, + "loss": 1.3705, + "step": 4245 + }, + { + "epoch": 0.7325109980160441, + "grad_norm": 0.6171875, + "learning_rate": 1.4156850937562079e-05, + "loss": 1.4796, + "step": 4246 + }, + { + "epoch": 0.7326835159147762, + "grad_norm": 3.203125, + "learning_rate": 1.4154375211306754e-05, + "loss": 1.4343, + "step": 4247 + }, + { + "epoch": 0.7328560338135082, + "grad_norm": 0.61328125, + "learning_rate": 1.4151899177269357e-05, + "loss": 1.3781, + "step": 4248 + }, + { + "epoch": 0.7330285517122401, + "grad_norm": 0.69140625, + "learning_rate": 1.4149422835633324e-05, + "loss": 1.3809, + "step": 4249 + }, + { + "epoch": 0.7332010696109721, + "grad_norm": 0.578125, + "learning_rate": 1.414694618658212e-05, + "loss": 1.5156, + "step": 4250 + }, + { + "epoch": 0.7333735875097042, + "grad_norm": 0.63671875, + "learning_rate": 1.4144469230299234e-05, + "loss": 1.454, + "step": 4251 + }, + { + "epoch": 0.7335461054084361, + "grad_norm": 0.65625, + "learning_rate": 1.4141991966968169e-05, + "loss": 1.4892, + "step": 4252 + }, + { + "epoch": 0.7337186233071681, + "grad_norm": 0.6171875, + "learning_rate": 1.413951439677246e-05, + "loss": 1.3443, + "step": 4253 + }, + { + "epoch": 0.7338911412059002, + "grad_norm": 0.71484375, + "learning_rate": 1.4137036519895656e-05, + "loss": 1.5462, + "step": 4254 + }, + { + "epoch": 0.7340636591046321, + "grad_norm": 0.609375, + "learning_rate": 1.4134558336521342e-05, + "loss": 1.4905, + "step": 4255 + }, + { + "epoch": 0.7342361770033641, + "grad_norm": 0.60546875, + "learning_rate": 1.413207984683311e-05, + "loss": 1.4553, + "step": 4256 + }, + { + "epoch": 0.734408694902096, + "grad_norm": 0.61328125, + "learning_rate": 1.4129601051014586e-05, + "loss": 1.4901, + "step": 4257 + }, + { + "epoch": 0.7345812128008281, + "grad_norm": 0.7109375, + "learning_rate": 1.4127121949249412e-05, + "loss": 1.4201, + "step": 4258 + }, + { + "epoch": 0.7347537306995601, + "grad_norm": 0.59765625, + "learning_rate": 1.4124642541721258e-05, + "loss": 1.5189, + "step": 4259 + }, + { + "epoch": 0.734926248598292, + "grad_norm": 0.5859375, + "learning_rate": 1.4122162828613812e-05, + "loss": 1.4489, + "step": 4260 + }, + { + "epoch": 0.7350987664970241, + "grad_norm": 0.77734375, + "learning_rate": 1.4119682810110787e-05, + "loss": 1.4045, + "step": 4261 + }, + { + "epoch": 0.7352712843957561, + "grad_norm": 0.65625, + "learning_rate": 1.4117202486395918e-05, + "loss": 1.5362, + "step": 4262 + }, + { + "epoch": 0.735443802294488, + "grad_norm": 0.6328125, + "learning_rate": 1.4114721857652965e-05, + "loss": 1.4357, + "step": 4263 + }, + { + "epoch": 0.7356163201932201, + "grad_norm": 0.6171875, + "learning_rate": 1.4112240924065706e-05, + "loss": 1.4519, + "step": 4264 + }, + { + "epoch": 0.7357888380919521, + "grad_norm": 0.75390625, + "learning_rate": 1.4109759685817943e-05, + "loss": 1.4781, + "step": 4265 + }, + { + "epoch": 0.735961355990684, + "grad_norm": 0.6328125, + "learning_rate": 1.4107278143093505e-05, + "loss": 1.505, + "step": 4266 + }, + { + "epoch": 0.736133873889416, + "grad_norm": 0.59375, + "learning_rate": 1.410479629607624e-05, + "loss": 1.3641, + "step": 4267 + }, + { + "epoch": 0.736306391788148, + "grad_norm": 0.828125, + "learning_rate": 1.4102314144950016e-05, + "loss": 1.451, + "step": 4268 + }, + { + "epoch": 0.73647890968688, + "grad_norm": 0.8359375, + "learning_rate": 1.4099831689898728e-05, + "loss": 1.441, + "step": 4269 + }, + { + "epoch": 0.736651427585612, + "grad_norm": 0.67578125, + "learning_rate": 1.409734893110629e-05, + "loss": 1.4513, + "step": 4270 + }, + { + "epoch": 0.736823945484344, + "grad_norm": 0.671875, + "learning_rate": 1.4094865868756644e-05, + "loss": 1.4197, + "step": 4271 + }, + { + "epoch": 0.736996463383076, + "grad_norm": 0.6640625, + "learning_rate": 1.4092382503033746e-05, + "loss": 1.3671, + "step": 4272 + }, + { + "epoch": 0.737168981281808, + "grad_norm": 0.57421875, + "learning_rate": 1.4089898834121583e-05, + "loss": 1.3355, + "step": 4273 + }, + { + "epoch": 0.7373414991805399, + "grad_norm": 0.59765625, + "learning_rate": 1.4087414862204161e-05, + "loss": 1.4562, + "step": 4274 + }, + { + "epoch": 0.737514017079272, + "grad_norm": 0.6875, + "learning_rate": 1.4084930587465506e-05, + "loss": 1.4651, + "step": 4275 + }, + { + "epoch": 0.737686534978004, + "grad_norm": 0.61328125, + "learning_rate": 1.4082446010089667e-05, + "loss": 1.4966, + "step": 4276 + }, + { + "epoch": 0.7378590528767359, + "grad_norm": 0.63671875, + "learning_rate": 1.4079961130260722e-05, + "loss": 1.3972, + "step": 4277 + }, + { + "epoch": 0.738031570775468, + "grad_norm": 0.6328125, + "learning_rate": 1.4077475948162762e-05, + "loss": 1.4247, + "step": 4278 + }, + { + "epoch": 0.7382040886742, + "grad_norm": 0.72265625, + "learning_rate": 1.407499046397991e-05, + "loss": 1.4448, + "step": 4279 + }, + { + "epoch": 0.7383766065729319, + "grad_norm": 0.8359375, + "learning_rate": 1.40725046778963e-05, + "loss": 1.4223, + "step": 4280 + }, + { + "epoch": 0.738549124471664, + "grad_norm": 0.578125, + "learning_rate": 1.4070018590096096e-05, + "loss": 1.3966, + "step": 4281 + }, + { + "epoch": 0.738721642370396, + "grad_norm": 0.6953125, + "learning_rate": 1.4067532200763484e-05, + "loss": 1.4662, + "step": 4282 + }, + { + "epoch": 0.7388941602691279, + "grad_norm": 0.60546875, + "learning_rate": 1.4065045510082673e-05, + "loss": 1.5419, + "step": 4283 + }, + { + "epoch": 0.7390666781678599, + "grad_norm": 0.73046875, + "learning_rate": 1.4062558518237893e-05, + "loss": 1.4291, + "step": 4284 + }, + { + "epoch": 0.7392391960665919, + "grad_norm": 0.6328125, + "learning_rate": 1.4060071225413392e-05, + "loss": 1.4024, + "step": 4285 + }, + { + "epoch": 0.7394117139653239, + "grad_norm": 0.5703125, + "learning_rate": 1.4057583631793443e-05, + "loss": 1.4185, + "step": 4286 + }, + { + "epoch": 0.7395842318640559, + "grad_norm": 0.65625, + "learning_rate": 1.405509573756235e-05, + "loss": 1.4524, + "step": 4287 + }, + { + "epoch": 0.7397567497627879, + "grad_norm": 0.8046875, + "learning_rate": 1.4052607542904427e-05, + "loss": 1.5358, + "step": 4288 + }, + { + "epoch": 0.7399292676615199, + "grad_norm": 0.609375, + "learning_rate": 1.4050119048004012e-05, + "loss": 1.4615, + "step": 4289 + }, + { + "epoch": 0.7401017855602519, + "grad_norm": 0.63671875, + "learning_rate": 1.4047630253045475e-05, + "loss": 1.3796, + "step": 4290 + }, + { + "epoch": 0.7402743034589838, + "grad_norm": 0.625, + "learning_rate": 1.4045141158213197e-05, + "loss": 1.5141, + "step": 4291 + }, + { + "epoch": 0.7404468213577159, + "grad_norm": 0.640625, + "learning_rate": 1.4042651763691586e-05, + "loss": 1.4644, + "step": 4292 + }, + { + "epoch": 0.7406193392564479, + "grad_norm": 0.609375, + "learning_rate": 1.4040162069665076e-05, + "loss": 1.3469, + "step": 4293 + }, + { + "epoch": 0.7407918571551798, + "grad_norm": 0.83203125, + "learning_rate": 1.4037672076318112e-05, + "loss": 1.4975, + "step": 4294 + }, + { + "epoch": 0.7409643750539119, + "grad_norm": 0.63671875, + "learning_rate": 1.4035181783835174e-05, + "loss": 1.4385, + "step": 4295 + }, + { + "epoch": 0.7411368929526438, + "grad_norm": 0.78515625, + "learning_rate": 1.4032691192400756e-05, + "loss": 1.4352, + "step": 4296 + }, + { + "epoch": 0.7413094108513758, + "grad_norm": 0.578125, + "learning_rate": 1.4030200302199375e-05, + "loss": 1.4521, + "step": 4297 + }, + { + "epoch": 0.7414819287501079, + "grad_norm": 0.62890625, + "learning_rate": 1.4027709113415578e-05, + "loss": 1.4853, + "step": 4298 + }, + { + "epoch": 0.7416544466488398, + "grad_norm": 0.63671875, + "learning_rate": 1.4025217626233919e-05, + "loss": 1.4171, + "step": 4299 + }, + { + "epoch": 0.7418269645475718, + "grad_norm": 0.578125, + "learning_rate": 1.402272584083899e-05, + "loss": 1.4167, + "step": 4300 + }, + { + "epoch": 0.7418269645475718, + "eval_loss": 1.4205090999603271, + "eval_runtime": 11.0817, + "eval_samples_per_second": 92.405, + "eval_steps_per_second": 23.101, + "step": 4300 + }, + { + "epoch": 0.7419994824463038, + "grad_norm": 0.6640625, + "learning_rate": 1.4020233757415396e-05, + "loss": 1.4844, + "step": 4301 + }, + { + "epoch": 0.7421720003450358, + "grad_norm": 0.703125, + "learning_rate": 1.4017741376147762e-05, + "loss": 1.4134, + "step": 4302 + }, + { + "epoch": 0.7423445182437678, + "grad_norm": 0.6328125, + "learning_rate": 1.4015248697220746e-05, + "loss": 1.4863, + "step": 4303 + }, + { + "epoch": 0.7425170361424998, + "grad_norm": 1.5625, + "learning_rate": 1.401275572081902e-05, + "loss": 1.4767, + "step": 4304 + }, + { + "epoch": 0.7426895540412318, + "grad_norm": 0.7109375, + "learning_rate": 1.4010262447127275e-05, + "loss": 1.4214, + "step": 4305 + }, + { + "epoch": 0.7428620719399638, + "grad_norm": 0.7890625, + "learning_rate": 1.400776887633023e-05, + "loss": 1.4982, + "step": 4306 + }, + { + "epoch": 0.7430345898386957, + "grad_norm": 0.6328125, + "learning_rate": 1.4005275008612627e-05, + "loss": 1.4187, + "step": 4307 + }, + { + "epoch": 0.7432071077374277, + "grad_norm": 0.671875, + "learning_rate": 1.4002780844159225e-05, + "loss": 1.5067, + "step": 4308 + }, + { + "epoch": 0.7433796256361598, + "grad_norm": 0.67578125, + "learning_rate": 1.4000286383154804e-05, + "loss": 1.4092, + "step": 4309 + }, + { + "epoch": 0.7435521435348917, + "grad_norm": 0.703125, + "learning_rate": 1.3997791625784176e-05, + "loss": 1.5382, + "step": 4310 + }, + { + "epoch": 0.7437246614336237, + "grad_norm": 0.56640625, + "learning_rate": 1.3995296572232166e-05, + "loss": 1.4975, + "step": 4311 + }, + { + "epoch": 0.7438971793323558, + "grad_norm": 0.77734375, + "learning_rate": 1.399280122268362e-05, + "loss": 1.4389, + "step": 4312 + }, + { + "epoch": 0.7440696972310877, + "grad_norm": 0.6015625, + "learning_rate": 1.399030557732341e-05, + "loss": 1.4836, + "step": 4313 + }, + { + "epoch": 0.7442422151298197, + "grad_norm": 0.61328125, + "learning_rate": 1.3987809636336434e-05, + "loss": 1.4865, + "step": 4314 + }, + { + "epoch": 0.7444147330285518, + "grad_norm": 0.578125, + "learning_rate": 1.39853133999076e-05, + "loss": 1.5212, + "step": 4315 + }, + { + "epoch": 0.7445872509272837, + "grad_norm": 0.6328125, + "learning_rate": 1.398281686822185e-05, + "loss": 1.4433, + "step": 4316 + }, + { + "epoch": 0.7447597688260157, + "grad_norm": 0.6640625, + "learning_rate": 1.398032004146414e-05, + "loss": 1.4844, + "step": 4317 + }, + { + "epoch": 0.7449322867247476, + "grad_norm": 0.70703125, + "learning_rate": 1.3977822919819448e-05, + "loss": 1.4672, + "step": 4318 + }, + { + "epoch": 0.7451048046234797, + "grad_norm": 0.625, + "learning_rate": 1.3975325503472786e-05, + "loss": 1.4728, + "step": 4319 + }, + { + "epoch": 0.7452773225222117, + "grad_norm": 0.62890625, + "learning_rate": 1.3972827792609168e-05, + "loss": 1.4121, + "step": 4320 + }, + { + "epoch": 0.7454498404209436, + "grad_norm": 0.65625, + "learning_rate": 1.397032978741364e-05, + "loss": 1.4833, + "step": 4321 + }, + { + "epoch": 0.7456223583196757, + "grad_norm": 0.69921875, + "learning_rate": 1.3967831488071279e-05, + "loss": 1.4789, + "step": 4322 + }, + { + "epoch": 0.7457948762184077, + "grad_norm": 0.66015625, + "learning_rate": 1.396533289476717e-05, + "loss": 1.3527, + "step": 4323 + }, + { + "epoch": 0.7459673941171396, + "grad_norm": 0.69921875, + "learning_rate": 1.3962834007686418e-05, + "loss": 1.4939, + "step": 4324 + }, + { + "epoch": 0.7461399120158716, + "grad_norm": 0.609375, + "learning_rate": 1.3960334827014168e-05, + "loss": 1.4151, + "step": 4325 + }, + { + "epoch": 0.7463124299146037, + "grad_norm": 0.80078125, + "learning_rate": 1.3957835352935566e-05, + "loss": 1.4769, + "step": 4326 + }, + { + "epoch": 0.7464849478133356, + "grad_norm": 0.78515625, + "learning_rate": 1.395533558563579e-05, + "loss": 1.3579, + "step": 4327 + }, + { + "epoch": 0.7466574657120676, + "grad_norm": 0.640625, + "learning_rate": 1.395283552530004e-05, + "loss": 1.4144, + "step": 4328 + }, + { + "epoch": 0.7468299836107997, + "grad_norm": 0.609375, + "learning_rate": 1.3950335172113539e-05, + "loss": 1.4561, + "step": 4329 + }, + { + "epoch": 0.7470025015095316, + "grad_norm": 0.6796875, + "learning_rate": 1.3947834526261521e-05, + "loss": 1.4137, + "step": 4330 + }, + { + "epoch": 0.7471750194082636, + "grad_norm": 0.66015625, + "learning_rate": 1.3945333587929257e-05, + "loss": 1.5995, + "step": 4331 + }, + { + "epoch": 0.7473475373069955, + "grad_norm": 0.75, + "learning_rate": 1.394283235730203e-05, + "loss": 1.4139, + "step": 4332 + }, + { + "epoch": 0.7475200552057276, + "grad_norm": 0.6875, + "learning_rate": 1.3940330834565144e-05, + "loss": 1.4916, + "step": 4333 + }, + { + "epoch": 0.7476925731044596, + "grad_norm": 0.640625, + "learning_rate": 1.393782901990393e-05, + "loss": 1.4787, + "step": 4334 + }, + { + "epoch": 0.7478650910031915, + "grad_norm": 0.60546875, + "learning_rate": 1.3935326913503737e-05, + "loss": 1.341, + "step": 4335 + }, + { + "epoch": 0.7480376089019236, + "grad_norm": 0.65234375, + "learning_rate": 1.3932824515549936e-05, + "loss": 1.5001, + "step": 4336 + }, + { + "epoch": 0.7482101268006556, + "grad_norm": 0.6171875, + "learning_rate": 1.3930321826227924e-05, + "loss": 1.4086, + "step": 4337 + }, + { + "epoch": 0.7483826446993875, + "grad_norm": 0.703125, + "learning_rate": 1.3927818845723114e-05, + "loss": 1.491, + "step": 4338 + }, + { + "epoch": 0.7485551625981196, + "grad_norm": 0.6484375, + "learning_rate": 1.392531557422094e-05, + "loss": 1.4783, + "step": 4339 + }, + { + "epoch": 0.7487276804968516, + "grad_norm": 0.89453125, + "learning_rate": 1.3922812011906862e-05, + "loss": 1.403, + "step": 4340 + }, + { + "epoch": 0.7489001983955835, + "grad_norm": 0.62890625, + "learning_rate": 1.3920308158966358e-05, + "loss": 1.3879, + "step": 4341 + }, + { + "epoch": 0.7490727162943155, + "grad_norm": 0.59375, + "learning_rate": 1.3917804015584932e-05, + "loss": 1.4304, + "step": 4342 + }, + { + "epoch": 0.7492452341930476, + "grad_norm": 0.62109375, + "learning_rate": 1.3915299581948106e-05, + "loss": 1.4423, + "step": 4343 + }, + { + "epoch": 0.7494177520917795, + "grad_norm": 0.875, + "learning_rate": 1.3912794858241423e-05, + "loss": 1.483, + "step": 4344 + }, + { + "epoch": 0.7495902699905115, + "grad_norm": 0.75, + "learning_rate": 1.3910289844650446e-05, + "loss": 1.3615, + "step": 4345 + }, + { + "epoch": 0.7497627878892436, + "grad_norm": 0.59375, + "learning_rate": 1.3907784541360765e-05, + "loss": 1.5126, + "step": 4346 + }, + { + "epoch": 0.7499353057879755, + "grad_norm": 1.0078125, + "learning_rate": 1.390527894855799e-05, + "loss": 1.4707, + "step": 4347 + }, + { + "epoch": 0.7501078236867075, + "grad_norm": 0.7109375, + "learning_rate": 1.390277306642775e-05, + "loss": 1.385, + "step": 4348 + }, + { + "epoch": 0.7502803415854394, + "grad_norm": 0.61328125, + "learning_rate": 1.3900266895155694e-05, + "loss": 1.3342, + "step": 4349 + }, + { + "epoch": 0.7504528594841715, + "grad_norm": 0.7109375, + "learning_rate": 1.3897760434927495e-05, + "loss": 1.4733, + "step": 4350 + }, + { + "epoch": 0.7506253773829035, + "grad_norm": 0.7265625, + "learning_rate": 1.3895253685928851e-05, + "loss": 1.4812, + "step": 4351 + }, + { + "epoch": 0.7507978952816354, + "grad_norm": 0.63671875, + "learning_rate": 1.3892746648345475e-05, + "loss": 1.3713, + "step": 4352 + }, + { + "epoch": 0.7509704131803675, + "grad_norm": 0.69140625, + "learning_rate": 1.3890239322363102e-05, + "loss": 1.4003, + "step": 4353 + }, + { + "epoch": 0.7511429310790995, + "grad_norm": 0.79296875, + "learning_rate": 1.3887731708167493e-05, + "loss": 1.4404, + "step": 4354 + }, + { + "epoch": 0.7513154489778314, + "grad_norm": 0.6640625, + "learning_rate": 1.388522380594443e-05, + "loss": 1.4032, + "step": 4355 + }, + { + "epoch": 0.7514879668765635, + "grad_norm": 0.64453125, + "learning_rate": 1.3882715615879709e-05, + "loss": 1.398, + "step": 4356 + }, + { + "epoch": 0.7516604847752955, + "grad_norm": 0.6015625, + "learning_rate": 1.3880207138159158e-05, + "loss": 1.4063, + "step": 4357 + }, + { + "epoch": 0.7518330026740274, + "grad_norm": 0.66796875, + "learning_rate": 1.3877698372968612e-05, + "loss": 1.4327, + "step": 4358 + }, + { + "epoch": 0.7520055205727594, + "grad_norm": 0.7578125, + "learning_rate": 1.3875189320493947e-05, + "loss": 1.6136, + "step": 4359 + }, + { + "epoch": 0.7521780384714915, + "grad_norm": 0.6484375, + "learning_rate": 1.3872679980921044e-05, + "loss": 1.353, + "step": 4360 + }, + { + "epoch": 0.7523505563702234, + "grad_norm": 0.75390625, + "learning_rate": 1.3870170354435808e-05, + "loss": 1.5392, + "step": 4361 + }, + { + "epoch": 0.7525230742689554, + "grad_norm": 0.60546875, + "learning_rate": 1.3867660441224172e-05, + "loss": 1.4298, + "step": 4362 + }, + { + "epoch": 0.7526955921676874, + "grad_norm": 0.5859375, + "learning_rate": 1.386515024147208e-05, + "loss": 1.4125, + "step": 4363 + }, + { + "epoch": 0.7528681100664194, + "grad_norm": 0.59375, + "learning_rate": 1.3862639755365512e-05, + "loss": 1.5319, + "step": 4364 + }, + { + "epoch": 0.7530406279651514, + "grad_norm": 0.61328125, + "learning_rate": 1.3860128983090455e-05, + "loss": 1.3204, + "step": 4365 + }, + { + "epoch": 0.7532131458638833, + "grad_norm": 0.68359375, + "learning_rate": 1.3857617924832923e-05, + "loss": 1.468, + "step": 4366 + }, + { + "epoch": 0.7533856637626154, + "grad_norm": 0.62109375, + "learning_rate": 1.385510658077895e-05, + "loss": 1.4754, + "step": 4367 + }, + { + "epoch": 0.7535581816613474, + "grad_norm": 0.66015625, + "learning_rate": 1.3852594951114594e-05, + "loss": 1.448, + "step": 4368 + }, + { + "epoch": 0.7537306995600793, + "grad_norm": 0.77734375, + "learning_rate": 1.3850083036025934e-05, + "loss": 1.3702, + "step": 4369 + }, + { + "epoch": 0.7539032174588114, + "grad_norm": 0.6015625, + "learning_rate": 1.3847570835699066e-05, + "loss": 1.557, + "step": 4370 + }, + { + "epoch": 0.7540757353575434, + "grad_norm": 0.703125, + "learning_rate": 1.3845058350320109e-05, + "loss": 1.3292, + "step": 4371 + }, + { + "epoch": 0.7542482532562753, + "grad_norm": 0.71875, + "learning_rate": 1.3842545580075206e-05, + "loss": 1.5168, + "step": 4372 + }, + { + "epoch": 0.7544207711550074, + "grad_norm": 0.625, + "learning_rate": 1.3840032525150516e-05, + "loss": 1.47, + "step": 4373 + }, + { + "epoch": 0.7545932890537393, + "grad_norm": 0.64453125, + "learning_rate": 1.3837519185732222e-05, + "loss": 1.3561, + "step": 4374 + }, + { + "epoch": 0.7547658069524713, + "grad_norm": 0.82421875, + "learning_rate": 1.3835005562006529e-05, + "loss": 1.3902, + "step": 4375 + }, + { + "epoch": 0.7549383248512033, + "grad_norm": 0.6640625, + "learning_rate": 1.3832491654159663e-05, + "loss": 1.4831, + "step": 4376 + }, + { + "epoch": 0.7551108427499353, + "grad_norm": 0.640625, + "learning_rate": 1.3829977462377868e-05, + "loss": 1.4294, + "step": 4377 + }, + { + "epoch": 0.7552833606486673, + "grad_norm": 0.765625, + "learning_rate": 1.3827462986847411e-05, + "loss": 1.4088, + "step": 4378 + }, + { + "epoch": 0.7554558785473993, + "grad_norm": 0.62890625, + "learning_rate": 1.3824948227754583e-05, + "loss": 1.4768, + "step": 4379 + }, + { + "epoch": 0.7556283964461313, + "grad_norm": 0.6875, + "learning_rate": 1.3822433185285691e-05, + "loss": 1.4937, + "step": 4380 + }, + { + "epoch": 0.7558009143448633, + "grad_norm": 0.76171875, + "learning_rate": 1.3819917859627063e-05, + "loss": 1.3964, + "step": 4381 + }, + { + "epoch": 0.7559734322435953, + "grad_norm": 0.7109375, + "learning_rate": 1.381740225096505e-05, + "loss": 1.3588, + "step": 4382 + }, + { + "epoch": 0.7561459501423272, + "grad_norm": 0.73046875, + "learning_rate": 1.3814886359486028e-05, + "loss": 1.5209, + "step": 4383 + }, + { + "epoch": 0.7563184680410593, + "grad_norm": 0.9296875, + "learning_rate": 1.3812370185376389e-05, + "loss": 1.4714, + "step": 4384 + }, + { + "epoch": 0.7564909859397912, + "grad_norm": 0.6953125, + "learning_rate": 1.3809853728822545e-05, + "loss": 1.3945, + "step": 4385 + }, + { + "epoch": 0.7566635038385232, + "grad_norm": 0.62890625, + "learning_rate": 1.3807336990010934e-05, + "loss": 1.4341, + "step": 4386 + }, + { + "epoch": 0.7568360217372553, + "grad_norm": 0.69921875, + "learning_rate": 1.3804819969128006e-05, + "loss": 1.3775, + "step": 4387 + }, + { + "epoch": 0.7570085396359872, + "grad_norm": 0.73828125, + "learning_rate": 1.3802302666360244e-05, + "loss": 1.4069, + "step": 4388 + }, + { + "epoch": 0.7571810575347192, + "grad_norm": 0.71484375, + "learning_rate": 1.379978508189414e-05, + "loss": 1.4814, + "step": 4389 + }, + { + "epoch": 0.7573535754334513, + "grad_norm": 0.62109375, + "learning_rate": 1.3797267215916215e-05, + "loss": 1.364, + "step": 4390 + }, + { + "epoch": 0.7575260933321832, + "grad_norm": 0.90234375, + "learning_rate": 1.3794749068613009e-05, + "loss": 1.4798, + "step": 4391 + }, + { + "epoch": 0.7576986112309152, + "grad_norm": 0.58203125, + "learning_rate": 1.3792230640171085e-05, + "loss": 1.4018, + "step": 4392 + }, + { + "epoch": 0.7578711291296472, + "grad_norm": 0.68359375, + "learning_rate": 1.3789711930777017e-05, + "loss": 1.4674, + "step": 4393 + }, + { + "epoch": 0.7580436470283792, + "grad_norm": 0.6171875, + "learning_rate": 1.3787192940617408e-05, + "loss": 1.394, + "step": 4394 + }, + { + "epoch": 0.7582161649271112, + "grad_norm": 0.6796875, + "learning_rate": 1.3784673669878883e-05, + "loss": 1.4453, + "step": 4395 + }, + { + "epoch": 0.7583886828258432, + "grad_norm": 0.62109375, + "learning_rate": 1.378215411874809e-05, + "loss": 1.5361, + "step": 4396 + }, + { + "epoch": 0.7585612007245752, + "grad_norm": 0.66796875, + "learning_rate": 1.3779634287411683e-05, + "loss": 1.4041, + "step": 4397 + }, + { + "epoch": 0.7587337186233072, + "grad_norm": 0.73046875, + "learning_rate": 1.3777114176056353e-05, + "loss": 1.4944, + "step": 4398 + }, + { + "epoch": 0.7589062365220391, + "grad_norm": 0.64453125, + "learning_rate": 1.3774593784868802e-05, + "loss": 1.4541, + "step": 4399 + }, + { + "epoch": 0.7590787544207711, + "grad_norm": 0.58984375, + "learning_rate": 1.3772073114035762e-05, + "loss": 1.3346, + "step": 4400 + }, + { + "epoch": 0.7590787544207711, + "eval_loss": 1.4194252490997314, + "eval_runtime": 10.7734, + "eval_samples_per_second": 95.049, + "eval_steps_per_second": 23.762, + "step": 4400 + }, + { + "epoch": 0.7592512723195032, + "grad_norm": 0.66015625, + "learning_rate": 1.3769552163743974e-05, + "loss": 1.5201, + "step": 4401 + }, + { + "epoch": 0.7594237902182351, + "grad_norm": 0.7578125, + "learning_rate": 1.376703093418021e-05, + "loss": 1.4532, + "step": 4402 + }, + { + "epoch": 0.7595963081169671, + "grad_norm": 0.6171875, + "learning_rate": 1.3764509425531256e-05, + "loss": 1.4176, + "step": 4403 + }, + { + "epoch": 0.7597688260156992, + "grad_norm": 0.6015625, + "learning_rate": 1.3761987637983924e-05, + "loss": 1.4204, + "step": 4404 + }, + { + "epoch": 0.7599413439144311, + "grad_norm": 0.65234375, + "learning_rate": 1.375946557172504e-05, + "loss": 1.4231, + "step": 4405 + }, + { + "epoch": 0.7601138618131631, + "grad_norm": 0.59375, + "learning_rate": 1.3756943226941458e-05, + "loss": 1.3763, + "step": 4406 + }, + { + "epoch": 0.760286379711895, + "grad_norm": 0.625, + "learning_rate": 1.3754420603820045e-05, + "loss": 1.4556, + "step": 4407 + }, + { + "epoch": 0.7604588976106271, + "grad_norm": 0.75, + "learning_rate": 1.3751897702547698e-05, + "loss": 1.4294, + "step": 4408 + }, + { + "epoch": 0.7606314155093591, + "grad_norm": 0.74609375, + "learning_rate": 1.3749374523311325e-05, + "loss": 1.4873, + "step": 4409 + }, + { + "epoch": 0.760803933408091, + "grad_norm": 0.6328125, + "learning_rate": 1.374685106629786e-05, + "loss": 1.3729, + "step": 4410 + }, + { + "epoch": 0.7609764513068231, + "grad_norm": 0.6328125, + "learning_rate": 1.3744327331694254e-05, + "loss": 1.419, + "step": 4411 + }, + { + "epoch": 0.7611489692055551, + "grad_norm": 0.63671875, + "learning_rate": 1.3741803319687488e-05, + "loss": 1.4207, + "step": 4412 + }, + { + "epoch": 0.761321487104287, + "grad_norm": 0.640625, + "learning_rate": 1.373927903046455e-05, + "loss": 1.4772, + "step": 4413 + }, + { + "epoch": 0.7614940050030191, + "grad_norm": 0.61328125, + "learning_rate": 1.3736754464212456e-05, + "loss": 1.4847, + "step": 4414 + }, + { + "epoch": 0.7616665229017511, + "grad_norm": 0.6640625, + "learning_rate": 1.3734229621118243e-05, + "loss": 1.3985, + "step": 4415 + }, + { + "epoch": 0.761839040800483, + "grad_norm": 0.59765625, + "learning_rate": 1.373170450136897e-05, + "loss": 1.4265, + "step": 4416 + }, + { + "epoch": 0.762011558699215, + "grad_norm": 0.6796875, + "learning_rate": 1.3729179105151708e-05, + "loss": 1.4387, + "step": 4417 + }, + { + "epoch": 0.7621840765979471, + "grad_norm": 0.6328125, + "learning_rate": 1.3726653432653559e-05, + "loss": 1.514, + "step": 4418 + }, + { + "epoch": 0.762356594496679, + "grad_norm": 0.76953125, + "learning_rate": 1.3724127484061635e-05, + "loss": 1.5758, + "step": 4419 + }, + { + "epoch": 0.762529112395411, + "grad_norm": 0.56640625, + "learning_rate": 1.3721601259563083e-05, + "loss": 1.4226, + "step": 4420 + }, + { + "epoch": 0.7627016302941431, + "grad_norm": 0.5703125, + "learning_rate": 1.3719074759345052e-05, + "loss": 1.4775, + "step": 4421 + }, + { + "epoch": 0.762874148192875, + "grad_norm": 0.75, + "learning_rate": 1.3716547983594726e-05, + "loss": 1.5176, + "step": 4422 + }, + { + "epoch": 0.763046666091607, + "grad_norm": 0.71484375, + "learning_rate": 1.3714020932499303e-05, + "loss": 1.3939, + "step": 4423 + }, + { + "epoch": 0.763219183990339, + "grad_norm": 0.65234375, + "learning_rate": 1.3711493606246005e-05, + "loss": 1.4731, + "step": 4424 + }, + { + "epoch": 0.763391701889071, + "grad_norm": 0.61328125, + "learning_rate": 1.370896600502207e-05, + "loss": 1.5322, + "step": 4425 + }, + { + "epoch": 0.763564219787803, + "grad_norm": 0.58984375, + "learning_rate": 1.3706438129014756e-05, + "loss": 1.4391, + "step": 4426 + }, + { + "epoch": 0.7637367376865349, + "grad_norm": 0.73046875, + "learning_rate": 1.3703909978411348e-05, + "loss": 1.3965, + "step": 4427 + }, + { + "epoch": 0.763909255585267, + "grad_norm": 0.60546875, + "learning_rate": 1.3701381553399147e-05, + "loss": 1.4971, + "step": 4428 + }, + { + "epoch": 0.764081773483999, + "grad_norm": 0.58984375, + "learning_rate": 1.369885285416547e-05, + "loss": 1.4536, + "step": 4429 + }, + { + "epoch": 0.7642542913827309, + "grad_norm": 0.6171875, + "learning_rate": 1.3696323880897664e-05, + "loss": 1.4337, + "step": 4430 + }, + { + "epoch": 0.764426809281463, + "grad_norm": 0.65234375, + "learning_rate": 1.3693794633783087e-05, + "loss": 1.3907, + "step": 4431 + }, + { + "epoch": 0.764599327180195, + "grad_norm": 0.71484375, + "learning_rate": 1.3691265113009126e-05, + "loss": 1.5093, + "step": 4432 + }, + { + "epoch": 0.7647718450789269, + "grad_norm": 0.7578125, + "learning_rate": 1.3688735318763183e-05, + "loss": 1.374, + "step": 4433 + }, + { + "epoch": 0.7649443629776589, + "grad_norm": 0.74609375, + "learning_rate": 1.3686205251232676e-05, + "loss": 1.4475, + "step": 4434 + }, + { + "epoch": 0.765116880876391, + "grad_norm": 0.6015625, + "learning_rate": 1.3683674910605053e-05, + "loss": 1.3778, + "step": 4435 + }, + { + "epoch": 0.7652893987751229, + "grad_norm": 0.6953125, + "learning_rate": 1.3681144297067777e-05, + "loss": 1.3342, + "step": 4436 + }, + { + "epoch": 0.7654619166738549, + "grad_norm": 0.61328125, + "learning_rate": 1.367861341080833e-05, + "loss": 1.3892, + "step": 4437 + }, + { + "epoch": 0.765634434572587, + "grad_norm": 0.70703125, + "learning_rate": 1.3676082252014213e-05, + "loss": 1.4011, + "step": 4438 + }, + { + "epoch": 0.7658069524713189, + "grad_norm": 0.62890625, + "learning_rate": 1.3673550820872957e-05, + "loss": 1.465, + "step": 4439 + }, + { + "epoch": 0.7659794703700509, + "grad_norm": 0.55859375, + "learning_rate": 1.3671019117572104e-05, + "loss": 1.3866, + "step": 4440 + }, + { + "epoch": 0.7661519882687828, + "grad_norm": 0.6171875, + "learning_rate": 1.3668487142299217e-05, + "loss": 1.527, + "step": 4441 + }, + { + "epoch": 0.7663245061675149, + "grad_norm": 0.60546875, + "learning_rate": 1.3665954895241877e-05, + "loss": 1.4585, + "step": 4442 + }, + { + "epoch": 0.7664970240662469, + "grad_norm": 0.72265625, + "learning_rate": 1.3663422376587695e-05, + "loss": 1.4673, + "step": 4443 + }, + { + "epoch": 0.7666695419649788, + "grad_norm": 0.6953125, + "learning_rate": 1.3660889586524295e-05, + "loss": 1.4605, + "step": 4444 + }, + { + "epoch": 0.7668420598637109, + "grad_norm": 0.59765625, + "learning_rate": 1.3658356525239316e-05, + "loss": 1.4475, + "step": 4445 + }, + { + "epoch": 0.7670145777624429, + "grad_norm": 0.63671875, + "learning_rate": 1.365582319292043e-05, + "loss": 1.5515, + "step": 4446 + }, + { + "epoch": 0.7671870956611748, + "grad_norm": 0.59765625, + "learning_rate": 1.3653289589755314e-05, + "loss": 1.4148, + "step": 4447 + }, + { + "epoch": 0.7673596135599069, + "grad_norm": 0.61328125, + "learning_rate": 1.3650755715931685e-05, + "loss": 1.4041, + "step": 4448 + }, + { + "epoch": 0.7675321314586389, + "grad_norm": 0.75390625, + "learning_rate": 1.3648221571637259e-05, + "loss": 1.4792, + "step": 4449 + }, + { + "epoch": 0.7677046493573708, + "grad_norm": 0.6640625, + "learning_rate": 1.3645687157059783e-05, + "loss": 1.3566, + "step": 4450 + }, + { + "epoch": 0.7678771672561028, + "grad_norm": 0.6328125, + "learning_rate": 1.3643152472387024e-05, + "loss": 1.3547, + "step": 4451 + }, + { + "epoch": 0.7680496851548348, + "grad_norm": 0.60546875, + "learning_rate": 1.364061751780677e-05, + "loss": 1.4479, + "step": 4452 + }, + { + "epoch": 0.7682222030535668, + "grad_norm": 0.62890625, + "learning_rate": 1.3638082293506818e-05, + "loss": 1.4658, + "step": 4453 + }, + { + "epoch": 0.7683947209522988, + "grad_norm": 0.6328125, + "learning_rate": 1.3635546799674999e-05, + "loss": 1.4645, + "step": 4454 + }, + { + "epoch": 0.7685672388510308, + "grad_norm": 0.625, + "learning_rate": 1.3633011036499158e-05, + "loss": 1.4945, + "step": 4455 + }, + { + "epoch": 0.7687397567497628, + "grad_norm": 0.55078125, + "learning_rate": 1.3630475004167159e-05, + "loss": 1.4176, + "step": 4456 + }, + { + "epoch": 0.7689122746484948, + "grad_norm": 0.6875, + "learning_rate": 1.3627938702866885e-05, + "loss": 1.3811, + "step": 4457 + }, + { + "epoch": 0.7690847925472267, + "grad_norm": 0.57421875, + "learning_rate": 1.3625402132786247e-05, + "loss": 1.5251, + "step": 4458 + }, + { + "epoch": 0.7692573104459588, + "grad_norm": 0.69921875, + "learning_rate": 1.3622865294113164e-05, + "loss": 1.5809, + "step": 4459 + }, + { + "epoch": 0.7694298283446908, + "grad_norm": 0.71875, + "learning_rate": 1.3620328187035585e-05, + "loss": 1.4454, + "step": 4460 + }, + { + "epoch": 0.7696023462434227, + "grad_norm": 0.71484375, + "learning_rate": 1.3617790811741473e-05, + "loss": 1.5201, + "step": 4461 + }, + { + "epoch": 0.7697748641421548, + "grad_norm": 0.64453125, + "learning_rate": 1.3615253168418811e-05, + "loss": 1.5174, + "step": 4462 + }, + { + "epoch": 0.7699473820408868, + "grad_norm": 0.7109375, + "learning_rate": 1.3612715257255604e-05, + "loss": 1.5593, + "step": 4463 + }, + { + "epoch": 0.7701198999396187, + "grad_norm": 0.640625, + "learning_rate": 1.3610177078439882e-05, + "loss": 1.4046, + "step": 4464 + }, + { + "epoch": 0.7702924178383507, + "grad_norm": 0.5859375, + "learning_rate": 1.3607638632159681e-05, + "loss": 1.5045, + "step": 4465 + }, + { + "epoch": 0.7704649357370827, + "grad_norm": 0.62109375, + "learning_rate": 1.3605099918603069e-05, + "loss": 1.3441, + "step": 4466 + }, + { + "epoch": 0.7706374536358147, + "grad_norm": 0.61328125, + "learning_rate": 1.360256093795813e-05, + "loss": 1.3589, + "step": 4467 + }, + { + "epoch": 0.7708099715345467, + "grad_norm": 0.61328125, + "learning_rate": 1.3600021690412968e-05, + "loss": 1.3647, + "step": 4468 + }, + { + "epoch": 0.7709824894332787, + "grad_norm": 0.66796875, + "learning_rate": 1.3597482176155705e-05, + "loss": 1.5231, + "step": 4469 + }, + { + "epoch": 0.7711550073320107, + "grad_norm": 0.59765625, + "learning_rate": 1.3594942395374482e-05, + "loss": 1.3733, + "step": 4470 + }, + { + "epoch": 0.7713275252307427, + "grad_norm": 0.75390625, + "learning_rate": 1.3592402348257465e-05, + "loss": 1.4228, + "step": 4471 + }, + { + "epoch": 0.7715000431294747, + "grad_norm": 0.6640625, + "learning_rate": 1.3589862034992838e-05, + "loss": 1.5187, + "step": 4472 + }, + { + "epoch": 0.7716725610282067, + "grad_norm": 0.625, + "learning_rate": 1.3587321455768798e-05, + "loss": 1.5008, + "step": 4473 + }, + { + "epoch": 0.7718450789269387, + "grad_norm": 0.65625, + "learning_rate": 1.3584780610773572e-05, + "loss": 1.5196, + "step": 4474 + }, + { + "epoch": 0.7720175968256706, + "grad_norm": 0.6171875, + "learning_rate": 1.3582239500195399e-05, + "loss": 1.3848, + "step": 4475 + }, + { + "epoch": 0.7721901147244027, + "grad_norm": 0.58203125, + "learning_rate": 1.357969812422254e-05, + "loss": 1.4315, + "step": 4476 + }, + { + "epoch": 0.7723626326231346, + "grad_norm": 0.59375, + "learning_rate": 1.3577156483043278e-05, + "loss": 1.4297, + "step": 4477 + }, + { + "epoch": 0.7725351505218666, + "grad_norm": 0.79296875, + "learning_rate": 1.3574614576845912e-05, + "loss": 1.5183, + "step": 4478 + }, + { + "epoch": 0.7727076684205987, + "grad_norm": 0.68359375, + "learning_rate": 1.3572072405818762e-05, + "loss": 1.3574, + "step": 4479 + }, + { + "epoch": 0.7728801863193306, + "grad_norm": 0.69921875, + "learning_rate": 1.356952997015017e-05, + "loss": 1.3782, + "step": 4480 + }, + { + "epoch": 0.7730527042180626, + "grad_norm": 0.77734375, + "learning_rate": 1.3566987270028495e-05, + "loss": 1.3915, + "step": 4481 + }, + { + "epoch": 0.7732252221167946, + "grad_norm": 0.63671875, + "learning_rate": 1.3564444305642116e-05, + "loss": 1.4268, + "step": 4482 + }, + { + "epoch": 0.7733977400155266, + "grad_norm": 0.59375, + "learning_rate": 1.3561901077179429e-05, + "loss": 1.4138, + "step": 4483 + }, + { + "epoch": 0.7735702579142586, + "grad_norm": 0.83984375, + "learning_rate": 1.3559357584828857e-05, + "loss": 1.384, + "step": 4484 + }, + { + "epoch": 0.7737427758129906, + "grad_norm": 0.76953125, + "learning_rate": 1.3556813828778833e-05, + "loss": 1.454, + "step": 4485 + }, + { + "epoch": 0.7739152937117226, + "grad_norm": 0.5703125, + "learning_rate": 1.3554269809217817e-05, + "loss": 1.3827, + "step": 4486 + }, + { + "epoch": 0.7740878116104546, + "grad_norm": 0.69921875, + "learning_rate": 1.3551725526334286e-05, + "loss": 1.4953, + "step": 4487 + }, + { + "epoch": 0.7742603295091866, + "grad_norm": 0.703125, + "learning_rate": 1.3549180980316737e-05, + "loss": 1.3566, + "step": 4488 + }, + { + "epoch": 0.7744328474079186, + "grad_norm": 0.859375, + "learning_rate": 1.3546636171353681e-05, + "loss": 1.4721, + "step": 4489 + }, + { + "epoch": 0.7746053653066506, + "grad_norm": 0.68359375, + "learning_rate": 1.354409109963366e-05, + "loss": 1.4763, + "step": 4490 + }, + { + "epoch": 0.7747778832053825, + "grad_norm": 0.83984375, + "learning_rate": 1.3541545765345222e-05, + "loss": 1.4729, + "step": 4491 + }, + { + "epoch": 0.7749504011041145, + "grad_norm": 0.6640625, + "learning_rate": 1.353900016867695e-05, + "loss": 1.4975, + "step": 4492 + }, + { + "epoch": 0.7751229190028466, + "grad_norm": 0.625, + "learning_rate": 1.3536454309817433e-05, + "loss": 1.4912, + "step": 4493 + }, + { + "epoch": 0.7752954369015785, + "grad_norm": 1.015625, + "learning_rate": 1.3533908188955281e-05, + "loss": 1.4514, + "step": 4494 + }, + { + "epoch": 0.7754679548003105, + "grad_norm": 0.65625, + "learning_rate": 1.3531361806279128e-05, + "loss": 1.5305, + "step": 4495 + }, + { + "epoch": 0.7756404726990426, + "grad_norm": 0.6484375, + "learning_rate": 1.352881516197763e-05, + "loss": 1.4676, + "step": 4496 + }, + { + "epoch": 0.7758129905977745, + "grad_norm": 0.70703125, + "learning_rate": 1.3526268256239456e-05, + "loss": 1.4794, + "step": 4497 + }, + { + "epoch": 0.7759855084965065, + "grad_norm": 0.6171875, + "learning_rate": 1.3523721089253296e-05, + "loss": 1.4701, + "step": 4498 + }, + { + "epoch": 0.7761580263952385, + "grad_norm": 0.765625, + "learning_rate": 1.352117366120786e-05, + "loss": 1.483, + "step": 4499 + }, + { + "epoch": 0.7763305442939705, + "grad_norm": 0.89453125, + "learning_rate": 1.351862597229188e-05, + "loss": 1.4353, + "step": 4500 + }, + { + "epoch": 0.7763305442939705, + "eval_loss": 1.418400526046753, + "eval_runtime": 10.861, + "eval_samples_per_second": 94.282, + "eval_steps_per_second": 23.571, + "step": 4500 + }, + { + "epoch": 0.7765030621927025, + "grad_norm": 0.58984375, + "learning_rate": 1.35160780226941e-05, + "loss": 1.5238, + "step": 4501 + }, + { + "epoch": 0.7766755800914344, + "grad_norm": 0.66796875, + "learning_rate": 1.351352981260329e-05, + "loss": 1.4664, + "step": 4502 + }, + { + "epoch": 0.7768480979901665, + "grad_norm": 0.7421875, + "learning_rate": 1.351098134220824e-05, + "loss": 1.379, + "step": 4503 + }, + { + "epoch": 0.7770206158888985, + "grad_norm": 0.7890625, + "learning_rate": 1.3508432611697755e-05, + "loss": 1.3869, + "step": 4504 + }, + { + "epoch": 0.7771931337876304, + "grad_norm": 0.7421875, + "learning_rate": 1.350588362126066e-05, + "loss": 1.5221, + "step": 4505 + }, + { + "epoch": 0.7773656516863625, + "grad_norm": 0.625, + "learning_rate": 1.35033343710858e-05, + "loss": 1.3187, + "step": 4506 + }, + { + "epoch": 0.7775381695850945, + "grad_norm": 0.68359375, + "learning_rate": 1.350078486136204e-05, + "loss": 1.3784, + "step": 4507 + }, + { + "epoch": 0.7777106874838264, + "grad_norm": 1.0078125, + "learning_rate": 1.3498235092278263e-05, + "loss": 1.4881, + "step": 4508 + }, + { + "epoch": 0.7778832053825584, + "grad_norm": 0.62109375, + "learning_rate": 1.3495685064023374e-05, + "loss": 1.3369, + "step": 4509 + }, + { + "epoch": 0.7780557232812905, + "grad_norm": 0.61328125, + "learning_rate": 1.3493134776786292e-05, + "loss": 1.4683, + "step": 4510 + }, + { + "epoch": 0.7782282411800224, + "grad_norm": 0.9453125, + "learning_rate": 1.3490584230755956e-05, + "loss": 1.4639, + "step": 4511 + }, + { + "epoch": 0.7784007590787544, + "grad_norm": 0.59765625, + "learning_rate": 1.3488033426121336e-05, + "loss": 1.3748, + "step": 4512 + }, + { + "epoch": 0.7785732769774865, + "grad_norm": 0.5859375, + "learning_rate": 1.3485482363071402e-05, + "loss": 1.3758, + "step": 4513 + }, + { + "epoch": 0.7787457948762184, + "grad_norm": 0.61328125, + "learning_rate": 1.3482931041795158e-05, + "loss": 1.3721, + "step": 4514 + }, + { + "epoch": 0.7789183127749504, + "grad_norm": 0.671875, + "learning_rate": 1.3480379462481619e-05, + "loss": 1.4502, + "step": 4515 + }, + { + "epoch": 0.7790908306736823, + "grad_norm": 0.765625, + "learning_rate": 1.3477827625319826e-05, + "loss": 1.4347, + "step": 4516 + }, + { + "epoch": 0.7792633485724144, + "grad_norm": 0.65234375, + "learning_rate": 1.3475275530498828e-05, + "loss": 1.4338, + "step": 4517 + }, + { + "epoch": 0.7794358664711464, + "grad_norm": 0.5703125, + "learning_rate": 1.3472723178207708e-05, + "loss": 1.4674, + "step": 4518 + }, + { + "epoch": 0.7796083843698783, + "grad_norm": 0.62890625, + "learning_rate": 1.3470170568635552e-05, + "loss": 1.51, + "step": 4519 + }, + { + "epoch": 0.7797809022686104, + "grad_norm": 0.65234375, + "learning_rate": 1.346761770197148e-05, + "loss": 1.4031, + "step": 4520 + }, + { + "epoch": 0.7799534201673424, + "grad_norm": 0.62890625, + "learning_rate": 1.3465064578404623e-05, + "loss": 1.4428, + "step": 4521 + }, + { + "epoch": 0.7801259380660743, + "grad_norm": 0.65234375, + "learning_rate": 1.346251119812413e-05, + "loss": 1.5229, + "step": 4522 + }, + { + "epoch": 0.7802984559648064, + "grad_norm": 0.78125, + "learning_rate": 1.3459957561319175e-05, + "loss": 1.4638, + "step": 4523 + }, + { + "epoch": 0.7804709738635384, + "grad_norm": 0.625, + "learning_rate": 1.3457403668178941e-05, + "loss": 1.384, + "step": 4524 + }, + { + "epoch": 0.7806434917622703, + "grad_norm": 0.7578125, + "learning_rate": 1.3454849518892644e-05, + "loss": 1.4915, + "step": 4525 + }, + { + "epoch": 0.7808160096610023, + "grad_norm": 0.703125, + "learning_rate": 1.3452295113649505e-05, + "loss": 1.4184, + "step": 4526 + }, + { + "epoch": 0.7809885275597344, + "grad_norm": 0.67578125, + "learning_rate": 1.3449740452638773e-05, + "loss": 1.3411, + "step": 4527 + }, + { + "epoch": 0.7811610454584663, + "grad_norm": 0.61328125, + "learning_rate": 1.3447185536049716e-05, + "loss": 1.4498, + "step": 4528 + }, + { + "epoch": 0.7813335633571983, + "grad_norm": 0.58984375, + "learning_rate": 1.3444630364071614e-05, + "loss": 1.4373, + "step": 4529 + }, + { + "epoch": 0.7815060812559304, + "grad_norm": 0.61328125, + "learning_rate": 1.344207493689377e-05, + "loss": 1.5779, + "step": 4530 + }, + { + "epoch": 0.7816785991546623, + "grad_norm": 0.77734375, + "learning_rate": 1.3439519254705509e-05, + "loss": 1.395, + "step": 4531 + }, + { + "epoch": 0.7818511170533943, + "grad_norm": 0.609375, + "learning_rate": 1.3436963317696172e-05, + "loss": 1.498, + "step": 4532 + }, + { + "epoch": 0.7820236349521262, + "grad_norm": 0.6796875, + "learning_rate": 1.3434407126055115e-05, + "loss": 1.4939, + "step": 4533 + }, + { + "epoch": 0.7821961528508583, + "grad_norm": 0.578125, + "learning_rate": 1.343185067997172e-05, + "loss": 1.411, + "step": 4534 + }, + { + "epoch": 0.7823686707495903, + "grad_norm": 0.6875, + "learning_rate": 1.3429293979635382e-05, + "loss": 1.4517, + "step": 4535 + }, + { + "epoch": 0.7825411886483222, + "grad_norm": 0.640625, + "learning_rate": 1.3426737025235521e-05, + "loss": 1.4377, + "step": 4536 + }, + { + "epoch": 0.7827137065470543, + "grad_norm": 0.7890625, + "learning_rate": 1.3424179816961572e-05, + "loss": 1.4659, + "step": 4537 + }, + { + "epoch": 0.7828862244457863, + "grad_norm": 0.83984375, + "learning_rate": 1.3421622355002981e-05, + "loss": 1.5162, + "step": 4538 + }, + { + "epoch": 0.7830587423445182, + "grad_norm": 0.59765625, + "learning_rate": 1.3419064639549232e-05, + "loss": 1.4428, + "step": 4539 + }, + { + "epoch": 0.7832312602432502, + "grad_norm": 0.65234375, + "learning_rate": 1.3416506670789807e-05, + "loss": 1.454, + "step": 4540 + }, + { + "epoch": 0.7834037781419823, + "grad_norm": 0.625, + "learning_rate": 1.3413948448914223e-05, + "loss": 1.4996, + "step": 4541 + }, + { + "epoch": 0.7835762960407142, + "grad_norm": 0.6484375, + "learning_rate": 1.3411389974112006e-05, + "loss": 1.4176, + "step": 4542 + }, + { + "epoch": 0.7837488139394462, + "grad_norm": 0.6015625, + "learning_rate": 1.3408831246572703e-05, + "loss": 1.4108, + "step": 4543 + }, + { + "epoch": 0.7839213318381782, + "grad_norm": 0.80078125, + "learning_rate": 1.3406272266485882e-05, + "loss": 1.4055, + "step": 4544 + }, + { + "epoch": 0.7840938497369102, + "grad_norm": 0.59375, + "learning_rate": 1.340371303404113e-05, + "loss": 1.4123, + "step": 4545 + }, + { + "epoch": 0.7842663676356422, + "grad_norm": 0.7890625, + "learning_rate": 1.3401153549428048e-05, + "loss": 1.4729, + "step": 4546 + }, + { + "epoch": 0.7844388855343742, + "grad_norm": 0.671875, + "learning_rate": 1.3398593812836259e-05, + "loss": 1.5255, + "step": 4547 + }, + { + "epoch": 0.7846114034331062, + "grad_norm": 0.79296875, + "learning_rate": 1.3396033824455408e-05, + "loss": 1.4295, + "step": 4548 + }, + { + "epoch": 0.7847839213318382, + "grad_norm": 0.64453125, + "learning_rate": 1.339347358447515e-05, + "loss": 1.3872, + "step": 4549 + }, + { + "epoch": 0.7849564392305701, + "grad_norm": 0.69921875, + "learning_rate": 1.3390913093085165e-05, + "loss": 1.3967, + "step": 4550 + }, + { + "epoch": 0.7851289571293022, + "grad_norm": 0.671875, + "learning_rate": 1.338835235047515e-05, + "loss": 1.4759, + "step": 4551 + }, + { + "epoch": 0.7853014750280342, + "grad_norm": 0.83984375, + "learning_rate": 1.3385791356834821e-05, + "loss": 1.3507, + "step": 4552 + }, + { + "epoch": 0.7854739929267661, + "grad_norm": 0.578125, + "learning_rate": 1.3383230112353917e-05, + "loss": 1.3755, + "step": 4553 + }, + { + "epoch": 0.7856465108254982, + "grad_norm": 0.59765625, + "learning_rate": 1.3380668617222183e-05, + "loss": 1.4981, + "step": 4554 + }, + { + "epoch": 0.7858190287242302, + "grad_norm": 0.578125, + "learning_rate": 1.3378106871629395e-05, + "loss": 1.4185, + "step": 4555 + }, + { + "epoch": 0.7859915466229621, + "grad_norm": 0.60546875, + "learning_rate": 1.3375544875765343e-05, + "loss": 1.5073, + "step": 4556 + }, + { + "epoch": 0.7861640645216941, + "grad_norm": 0.6328125, + "learning_rate": 1.3372982629819837e-05, + "loss": 1.4959, + "step": 4557 + }, + { + "epoch": 0.7863365824204261, + "grad_norm": 0.62109375, + "learning_rate": 1.33704201339827e-05, + "loss": 1.4126, + "step": 4558 + }, + { + "epoch": 0.7865091003191581, + "grad_norm": 0.55859375, + "learning_rate": 1.336785738844378e-05, + "loss": 1.456, + "step": 4559 + }, + { + "epoch": 0.7866816182178901, + "grad_norm": 0.75390625, + "learning_rate": 1.3365294393392942e-05, + "loss": 1.545, + "step": 4560 + }, + { + "epoch": 0.7868541361166221, + "grad_norm": 0.6171875, + "learning_rate": 1.336273114902007e-05, + "loss": 1.4324, + "step": 4561 + }, + { + "epoch": 0.7870266540153541, + "grad_norm": 0.59375, + "learning_rate": 1.336016765551506e-05, + "loss": 1.4027, + "step": 4562 + }, + { + "epoch": 0.7871991719140861, + "grad_norm": 0.671875, + "learning_rate": 1.3357603913067836e-05, + "loss": 1.4484, + "step": 4563 + }, + { + "epoch": 0.7873716898128181, + "grad_norm": 0.6015625, + "learning_rate": 1.3355039921868334e-05, + "loss": 1.447, + "step": 4564 + }, + { + "epoch": 0.7875442077115501, + "grad_norm": 0.6484375, + "learning_rate": 1.3352475682106515e-05, + "loss": 1.5331, + "step": 4565 + }, + { + "epoch": 0.787716725610282, + "grad_norm": 0.57421875, + "learning_rate": 1.3349911193972345e-05, + "loss": 1.5616, + "step": 4566 + }, + { + "epoch": 0.787889243509014, + "grad_norm": 0.62890625, + "learning_rate": 1.3347346457655826e-05, + "loss": 1.4837, + "step": 4567 + }, + { + "epoch": 0.7880617614077461, + "grad_norm": 0.8359375, + "learning_rate": 1.3344781473346964e-05, + "loss": 1.4843, + "step": 4568 + }, + { + "epoch": 0.788234279306478, + "grad_norm": 0.57421875, + "learning_rate": 1.3342216241235794e-05, + "loss": 1.3826, + "step": 4569 + }, + { + "epoch": 0.78840679720521, + "grad_norm": 0.57421875, + "learning_rate": 1.3339650761512361e-05, + "loss": 1.4971, + "step": 4570 + }, + { + "epoch": 0.7885793151039421, + "grad_norm": 0.68359375, + "learning_rate": 1.333708503436673e-05, + "loss": 1.4259, + "step": 4571 + }, + { + "epoch": 0.788751833002674, + "grad_norm": 0.68359375, + "learning_rate": 1.333451905998899e-05, + "loss": 1.5063, + "step": 4572 + }, + { + "epoch": 0.788924350901406, + "grad_norm": 0.640625, + "learning_rate": 1.3331952838569246e-05, + "loss": 1.4997, + "step": 4573 + }, + { + "epoch": 0.789096868800138, + "grad_norm": 0.703125, + "learning_rate": 1.3329386370297615e-05, + "loss": 1.4358, + "step": 4574 + }, + { + "epoch": 0.78926938669887, + "grad_norm": 0.6484375, + "learning_rate": 1.3326819655364236e-05, + "loss": 1.4828, + "step": 4575 + }, + { + "epoch": 0.789441904597602, + "grad_norm": 0.63671875, + "learning_rate": 1.3324252693959271e-05, + "loss": 1.4323, + "step": 4576 + }, + { + "epoch": 0.789614422496334, + "grad_norm": 0.6640625, + "learning_rate": 1.3321685486272898e-05, + "loss": 1.5302, + "step": 4577 + }, + { + "epoch": 0.789786940395066, + "grad_norm": 0.76953125, + "learning_rate": 1.3319118032495306e-05, + "loss": 1.5162, + "step": 4578 + }, + { + "epoch": 0.789959458293798, + "grad_norm": 0.7578125, + "learning_rate": 1.3316550332816713e-05, + "loss": 1.5533, + "step": 4579 + }, + { + "epoch": 0.79013197619253, + "grad_norm": 0.64453125, + "learning_rate": 1.3313982387427346e-05, + "loss": 1.3622, + "step": 4580 + }, + { + "epoch": 0.790304494091262, + "grad_norm": 0.5703125, + "learning_rate": 1.3311414196517462e-05, + "loss": 1.3823, + "step": 4581 + }, + { + "epoch": 0.790477011989994, + "grad_norm": 0.68359375, + "learning_rate": 1.3308845760277322e-05, + "loss": 1.4654, + "step": 4582 + }, + { + "epoch": 0.7906495298887259, + "grad_norm": 0.58984375, + "learning_rate": 1.3306277078897211e-05, + "loss": 1.3795, + "step": 4583 + }, + { + "epoch": 0.7908220477874579, + "grad_norm": 0.63671875, + "learning_rate": 1.3303708152567439e-05, + "loss": 1.4473, + "step": 4584 + }, + { + "epoch": 0.79099456568619, + "grad_norm": 0.6484375, + "learning_rate": 1.3301138981478322e-05, + "loss": 1.4669, + "step": 4585 + }, + { + "epoch": 0.7911670835849219, + "grad_norm": 1.8125, + "learning_rate": 1.3298569565820205e-05, + "loss": 1.4592, + "step": 4586 + }, + { + "epoch": 0.7913396014836539, + "grad_norm": 0.6640625, + "learning_rate": 1.3295999905783444e-05, + "loss": 1.463, + "step": 4587 + }, + { + "epoch": 0.791512119382386, + "grad_norm": 0.609375, + "learning_rate": 1.3293430001558411e-05, + "loss": 1.4708, + "step": 4588 + }, + { + "epoch": 0.7916846372811179, + "grad_norm": 0.58203125, + "learning_rate": 1.3290859853335512e-05, + "loss": 1.4816, + "step": 4589 + }, + { + "epoch": 0.7918571551798499, + "grad_norm": 0.5859375, + "learning_rate": 1.3288289461305149e-05, + "loss": 1.4373, + "step": 4590 + }, + { + "epoch": 0.7920296730785819, + "grad_norm": 0.6875, + "learning_rate": 1.3285718825657754e-05, + "loss": 1.4675, + "step": 4591 + }, + { + "epoch": 0.7922021909773139, + "grad_norm": 0.6875, + "learning_rate": 1.3283147946583781e-05, + "loss": 1.3418, + "step": 4592 + }, + { + "epoch": 0.7923747088760459, + "grad_norm": 0.65234375, + "learning_rate": 1.3280576824273694e-05, + "loss": 1.4304, + "step": 4593 + }, + { + "epoch": 0.7925472267747778, + "grad_norm": 0.61328125, + "learning_rate": 1.3278005458917978e-05, + "loss": 1.4713, + "step": 4594 + }, + { + "epoch": 0.7927197446735099, + "grad_norm": 0.68359375, + "learning_rate": 1.3275433850707136e-05, + "loss": 1.3984, + "step": 4595 + }, + { + "epoch": 0.7928922625722419, + "grad_norm": 0.5546875, + "learning_rate": 1.3272861999831688e-05, + "loss": 1.3734, + "step": 4596 + }, + { + "epoch": 0.7930647804709738, + "grad_norm": 0.59375, + "learning_rate": 1.3270289906482174e-05, + "loss": 1.4992, + "step": 4597 + }, + { + "epoch": 0.7932372983697059, + "grad_norm": 0.625, + "learning_rate": 1.326771757084915e-05, + "loss": 1.416, + "step": 4598 + }, + { + "epoch": 0.7934098162684379, + "grad_norm": 0.609375, + "learning_rate": 1.326514499312319e-05, + "loss": 1.4168, + "step": 4599 + }, + { + "epoch": 0.7935823341671698, + "grad_norm": 0.66796875, + "learning_rate": 1.3262572173494888e-05, + "loss": 1.481, + "step": 4600 + }, + { + "epoch": 0.7935823341671698, + "eval_loss": 1.4177677631378174, + "eval_runtime": 10.9052, + "eval_samples_per_second": 93.9, + "eval_steps_per_second": 23.475, + "step": 4600 + }, + { + "epoch": 0.7937548520659018, + "grad_norm": 2.765625, + "learning_rate": 1.3259999112154854e-05, + "loss": 1.446, + "step": 4601 + }, + { + "epoch": 0.7939273699646339, + "grad_norm": 0.60546875, + "learning_rate": 1.3257425809293714e-05, + "loss": 1.5257, + "step": 4602 + }, + { + "epoch": 0.7940998878633658, + "grad_norm": 0.609375, + "learning_rate": 1.3254852265102118e-05, + "loss": 1.4193, + "step": 4603 + }, + { + "epoch": 0.7942724057620978, + "grad_norm": 0.5859375, + "learning_rate": 1.325227847977073e-05, + "loss": 1.4235, + "step": 4604 + }, + { + "epoch": 0.7944449236608299, + "grad_norm": 0.60546875, + "learning_rate": 1.324970445349023e-05, + "loss": 1.3991, + "step": 4605 + }, + { + "epoch": 0.7946174415595618, + "grad_norm": 0.66015625, + "learning_rate": 1.3247130186451321e-05, + "loss": 1.4472, + "step": 4606 + }, + { + "epoch": 0.7947899594582938, + "grad_norm": 0.609375, + "learning_rate": 1.3244555678844717e-05, + "loss": 1.4479, + "step": 4607 + }, + { + "epoch": 0.7949624773570257, + "grad_norm": 2.09375, + "learning_rate": 1.3241980930861153e-05, + "loss": 1.3988, + "step": 4608 + }, + { + "epoch": 0.7951349952557578, + "grad_norm": 0.7890625, + "learning_rate": 1.3239405942691388e-05, + "loss": 1.5663, + "step": 4609 + }, + { + "epoch": 0.7953075131544898, + "grad_norm": 0.66796875, + "learning_rate": 1.323683071452619e-05, + "loss": 1.4208, + "step": 4610 + }, + { + "epoch": 0.7954800310532217, + "grad_norm": 0.62890625, + "learning_rate": 1.3234255246556347e-05, + "loss": 1.3979, + "step": 4611 + }, + { + "epoch": 0.7956525489519538, + "grad_norm": 0.63671875, + "learning_rate": 1.3231679538972668e-05, + "loss": 1.4996, + "step": 4612 + }, + { + "epoch": 0.7958250668506858, + "grad_norm": 0.86328125, + "learning_rate": 1.3229103591965977e-05, + "loss": 1.4751, + "step": 4613 + }, + { + "epoch": 0.7959975847494177, + "grad_norm": 0.58984375, + "learning_rate": 1.3226527405727115e-05, + "loss": 1.4927, + "step": 4614 + }, + { + "epoch": 0.7961701026481497, + "grad_norm": 0.62109375, + "learning_rate": 1.322395098044694e-05, + "loss": 1.5126, + "step": 4615 + }, + { + "epoch": 0.7963426205468818, + "grad_norm": 0.66796875, + "learning_rate": 1.3221374316316334e-05, + "loss": 1.4083, + "step": 4616 + }, + { + "epoch": 0.7965151384456137, + "grad_norm": 0.66796875, + "learning_rate": 1.321879741352619e-05, + "loss": 1.4313, + "step": 4617 + }, + { + "epoch": 0.7966876563443457, + "grad_norm": 0.67578125, + "learning_rate": 1.3216220272267421e-05, + "loss": 1.4374, + "step": 4618 + }, + { + "epoch": 0.7968601742430778, + "grad_norm": 0.55078125, + "learning_rate": 1.321364289273096e-05, + "loss": 1.4172, + "step": 4619 + }, + { + "epoch": 0.7970326921418097, + "grad_norm": 0.609375, + "learning_rate": 1.3211065275107754e-05, + "loss": 1.469, + "step": 4620 + }, + { + "epoch": 0.7972052100405417, + "grad_norm": 0.6796875, + "learning_rate": 1.3208487419588768e-05, + "loss": 1.4686, + "step": 4621 + }, + { + "epoch": 0.7973777279392738, + "grad_norm": 0.75, + "learning_rate": 1.3205909326364986e-05, + "loss": 1.5068, + "step": 4622 + }, + { + "epoch": 0.7975502458380057, + "grad_norm": 0.6796875, + "learning_rate": 1.320333099562741e-05, + "loss": 1.4224, + "step": 4623 + }, + { + "epoch": 0.7977227637367377, + "grad_norm": 0.62890625, + "learning_rate": 1.3200752427567056e-05, + "loss": 1.4457, + "step": 4624 + }, + { + "epoch": 0.7978952816354696, + "grad_norm": 0.609375, + "learning_rate": 1.3198173622374966e-05, + "loss": 1.3616, + "step": 4625 + }, + { + "epoch": 0.7980677995342017, + "grad_norm": 0.57421875, + "learning_rate": 1.319559458024219e-05, + "loss": 1.4188, + "step": 4626 + }, + { + "epoch": 0.7982403174329337, + "grad_norm": 0.59765625, + "learning_rate": 1.31930153013598e-05, + "loss": 1.4014, + "step": 4627 + }, + { + "epoch": 0.7984128353316656, + "grad_norm": 0.85546875, + "learning_rate": 1.3190435785918887e-05, + "loss": 1.4106, + "step": 4628 + }, + { + "epoch": 0.7985853532303977, + "grad_norm": 0.59375, + "learning_rate": 1.3187856034110555e-05, + "loss": 1.4869, + "step": 4629 + }, + { + "epoch": 0.7987578711291297, + "grad_norm": 0.63671875, + "learning_rate": 1.318527604612593e-05, + "loss": 1.5005, + "step": 4630 + }, + { + "epoch": 0.7989303890278616, + "grad_norm": 0.6875, + "learning_rate": 1.3182695822156153e-05, + "loss": 1.5098, + "step": 4631 + }, + { + "epoch": 0.7991029069265936, + "grad_norm": 0.6328125, + "learning_rate": 1.3180115362392383e-05, + "loss": 1.5083, + "step": 4632 + }, + { + "epoch": 0.7992754248253257, + "grad_norm": 0.58984375, + "learning_rate": 1.31775346670258e-05, + "loss": 1.4761, + "step": 4633 + }, + { + "epoch": 0.7994479427240576, + "grad_norm": 0.55859375, + "learning_rate": 1.3174953736247589e-05, + "loss": 1.4419, + "step": 4634 + }, + { + "epoch": 0.7996204606227896, + "grad_norm": 0.56640625, + "learning_rate": 1.317237257024897e-05, + "loss": 1.3964, + "step": 4635 + }, + { + "epoch": 0.7997929785215216, + "grad_norm": 0.69921875, + "learning_rate": 1.3169791169221168e-05, + "loss": 1.3349, + "step": 4636 + }, + { + "epoch": 0.7999654964202536, + "grad_norm": 0.6875, + "learning_rate": 1.3167209533355432e-05, + "loss": 1.3971, + "step": 4637 + }, + { + "epoch": 0.8001380143189856, + "grad_norm": 0.546875, + "learning_rate": 1.3164627662843024e-05, + "loss": 1.4125, + "step": 4638 + }, + { + "epoch": 0.8003105322177176, + "grad_norm": 0.7265625, + "learning_rate": 1.3162045557875223e-05, + "loss": 1.4942, + "step": 4639 + }, + { + "epoch": 0.8004830501164496, + "grad_norm": 0.60546875, + "learning_rate": 1.315946321864333e-05, + "loss": 1.469, + "step": 4640 + }, + { + "epoch": 0.8006555680151816, + "grad_norm": 0.69921875, + "learning_rate": 1.3156880645338663e-05, + "loss": 1.4833, + "step": 4641 + }, + { + "epoch": 0.8008280859139135, + "grad_norm": 0.61328125, + "learning_rate": 1.315429783815255e-05, + "loss": 1.5352, + "step": 4642 + }, + { + "epoch": 0.8010006038126456, + "grad_norm": 0.7109375, + "learning_rate": 1.3151714797276345e-05, + "loss": 1.4352, + "step": 4643 + }, + { + "epoch": 0.8011731217113776, + "grad_norm": 0.74609375, + "learning_rate": 1.3149131522901417e-05, + "loss": 1.367, + "step": 4644 + }, + { + "epoch": 0.8013456396101095, + "grad_norm": 0.64453125, + "learning_rate": 1.3146548015219147e-05, + "loss": 1.3862, + "step": 4645 + }, + { + "epoch": 0.8015181575088416, + "grad_norm": 0.62890625, + "learning_rate": 1.3143964274420941e-05, + "loss": 1.3852, + "step": 4646 + }, + { + "epoch": 0.8016906754075735, + "grad_norm": 0.65234375, + "learning_rate": 1.3141380300698217e-05, + "loss": 1.4443, + "step": 4647 + }, + { + "epoch": 0.8018631933063055, + "grad_norm": 0.62109375, + "learning_rate": 1.313879609424241e-05, + "loss": 1.4288, + "step": 4648 + }, + { + "epoch": 0.8020357112050375, + "grad_norm": 0.640625, + "learning_rate": 1.3136211655244979e-05, + "loss": 1.4431, + "step": 4649 + }, + { + "epoch": 0.8022082291037695, + "grad_norm": 0.69921875, + "learning_rate": 1.3133626983897392e-05, + "loss": 1.4145, + "step": 4650 + }, + { + "epoch": 0.8023807470025015, + "grad_norm": 0.61328125, + "learning_rate": 1.3131042080391136e-05, + "loss": 1.3694, + "step": 4651 + }, + { + "epoch": 0.8025532649012335, + "grad_norm": 0.6015625, + "learning_rate": 1.312845694491772e-05, + "loss": 1.4015, + "step": 4652 + }, + { + "epoch": 0.8027257827999655, + "grad_norm": 0.59765625, + "learning_rate": 1.3125871577668665e-05, + "loss": 1.4519, + "step": 4653 + }, + { + "epoch": 0.8028983006986975, + "grad_norm": 0.671875, + "learning_rate": 1.3123285978835517e-05, + "loss": 1.4086, + "step": 4654 + }, + { + "epoch": 0.8030708185974295, + "grad_norm": 0.7265625, + "learning_rate": 1.3120700148609824e-05, + "loss": 1.4072, + "step": 4655 + }, + { + "epoch": 0.8032433364961615, + "grad_norm": 0.71875, + "learning_rate": 1.3118114087183164e-05, + "loss": 1.3822, + "step": 4656 + }, + { + "epoch": 0.8034158543948935, + "grad_norm": 0.59375, + "learning_rate": 1.3115527794747134e-05, + "loss": 1.3953, + "step": 4657 + }, + { + "epoch": 0.8035883722936255, + "grad_norm": 0.62890625, + "learning_rate": 1.3112941271493336e-05, + "loss": 1.4183, + "step": 4658 + }, + { + "epoch": 0.8037608901923574, + "grad_norm": 0.64453125, + "learning_rate": 1.3110354517613396e-05, + "loss": 1.4297, + "step": 4659 + }, + { + "epoch": 0.8039334080910895, + "grad_norm": 0.6640625, + "learning_rate": 1.310776753329896e-05, + "loss": 1.5198, + "step": 4660 + }, + { + "epoch": 0.8041059259898214, + "grad_norm": 0.6328125, + "learning_rate": 1.3105180318741691e-05, + "loss": 1.3614, + "step": 4661 + }, + { + "epoch": 0.8042784438885534, + "grad_norm": 0.63671875, + "learning_rate": 1.3102592874133257e-05, + "loss": 1.3834, + "step": 4662 + }, + { + "epoch": 0.8044509617872855, + "grad_norm": 0.6171875, + "learning_rate": 1.310000519966536e-05, + "loss": 1.5917, + "step": 4663 + }, + { + "epoch": 0.8046234796860174, + "grad_norm": 0.62890625, + "learning_rate": 1.3097417295529706e-05, + "loss": 1.5054, + "step": 4664 + }, + { + "epoch": 0.8047959975847494, + "grad_norm": 0.60546875, + "learning_rate": 1.3094829161918028e-05, + "loss": 1.5035, + "step": 4665 + }, + { + "epoch": 0.8049685154834814, + "grad_norm": 0.58984375, + "learning_rate": 1.3092240799022065e-05, + "loss": 1.3778, + "step": 4666 + }, + { + "epoch": 0.8051410333822134, + "grad_norm": 0.6953125, + "learning_rate": 1.3089652207033583e-05, + "loss": 1.5439, + "step": 4667 + }, + { + "epoch": 0.8053135512809454, + "grad_norm": 0.62890625, + "learning_rate": 1.3087063386144361e-05, + "loss": 1.491, + "step": 4668 + }, + { + "epoch": 0.8054860691796774, + "grad_norm": 0.62109375, + "learning_rate": 1.3084474336546196e-05, + "loss": 1.4339, + "step": 4669 + }, + { + "epoch": 0.8056585870784094, + "grad_norm": 0.6015625, + "learning_rate": 1.3081885058430899e-05, + "loss": 1.4916, + "step": 4670 + }, + { + "epoch": 0.8058311049771414, + "grad_norm": 0.72265625, + "learning_rate": 1.30792955519903e-05, + "loss": 1.4565, + "step": 4671 + }, + { + "epoch": 0.8060036228758733, + "grad_norm": 0.63671875, + "learning_rate": 1.3076705817416242e-05, + "loss": 1.5596, + "step": 4672 + }, + { + "epoch": 0.8061761407746053, + "grad_norm": 0.67578125, + "learning_rate": 1.3074115854900598e-05, + "loss": 1.4778, + "step": 4673 + }, + { + "epoch": 0.8063486586733374, + "grad_norm": 0.65625, + "learning_rate": 1.3071525664635241e-05, + "loss": 1.4988, + "step": 4674 + }, + { + "epoch": 0.8065211765720693, + "grad_norm": 0.6875, + "learning_rate": 1.306893524681207e-05, + "loss": 1.3927, + "step": 4675 + }, + { + "epoch": 0.8066936944708013, + "grad_norm": 0.6640625, + "learning_rate": 1.3066344601623004e-05, + "loss": 1.4619, + "step": 4676 + }, + { + "epoch": 0.8068662123695334, + "grad_norm": 0.74609375, + "learning_rate": 1.3063753729259972e-05, + "loss": 1.4088, + "step": 4677 + }, + { + "epoch": 0.8070387302682653, + "grad_norm": 0.60546875, + "learning_rate": 1.3061162629914917e-05, + "loss": 1.491, + "step": 4678 + }, + { + "epoch": 0.8072112481669973, + "grad_norm": 0.58984375, + "learning_rate": 1.3058571303779806e-05, + "loss": 1.4972, + "step": 4679 + }, + { + "epoch": 0.8073837660657294, + "grad_norm": 0.66015625, + "learning_rate": 1.3055979751046624e-05, + "loss": 1.4425, + "step": 4680 + }, + { + "epoch": 0.8075562839644613, + "grad_norm": 0.6484375, + "learning_rate": 1.3053387971907368e-05, + "loss": 1.4029, + "step": 4681 + }, + { + "epoch": 0.8077288018631933, + "grad_norm": 0.6015625, + "learning_rate": 1.3050795966554051e-05, + "loss": 1.4645, + "step": 4682 + }, + { + "epoch": 0.8079013197619253, + "grad_norm": 0.55078125, + "learning_rate": 1.3048203735178709e-05, + "loss": 1.4078, + "step": 4683 + }, + { + "epoch": 0.8080738376606573, + "grad_norm": 0.64453125, + "learning_rate": 1.3045611277973385e-05, + "loss": 1.5524, + "step": 4684 + }, + { + "epoch": 0.8082463555593893, + "grad_norm": 0.5859375, + "learning_rate": 1.3043018595130148e-05, + "loss": 1.4131, + "step": 4685 + }, + { + "epoch": 0.8084188734581212, + "grad_norm": 0.6484375, + "learning_rate": 1.3040425686841083e-05, + "loss": 1.4879, + "step": 4686 + }, + { + "epoch": 0.8085913913568533, + "grad_norm": 0.66015625, + "learning_rate": 1.3037832553298282e-05, + "loss": 1.3294, + "step": 4687 + }, + { + "epoch": 0.8087639092555853, + "grad_norm": 0.68359375, + "learning_rate": 1.3035239194693865e-05, + "loss": 1.4486, + "step": 4688 + }, + { + "epoch": 0.8089364271543172, + "grad_norm": 0.62890625, + "learning_rate": 1.3032645611219965e-05, + "loss": 1.4143, + "step": 4689 + }, + { + "epoch": 0.8091089450530492, + "grad_norm": 0.7578125, + "learning_rate": 1.3030051803068729e-05, + "loss": 1.4405, + "step": 4690 + }, + { + "epoch": 0.8092814629517813, + "grad_norm": 0.61328125, + "learning_rate": 1.3027457770432322e-05, + "loss": 1.4022, + "step": 4691 + }, + { + "epoch": 0.8094539808505132, + "grad_norm": 0.6796875, + "learning_rate": 1.3024863513502926e-05, + "loss": 1.4181, + "step": 4692 + }, + { + "epoch": 0.8096264987492452, + "grad_norm": 0.57421875, + "learning_rate": 1.3022269032472745e-05, + "loss": 1.5205, + "step": 4693 + }, + { + "epoch": 0.8097990166479773, + "grad_norm": 0.66796875, + "learning_rate": 1.3019674327533984e-05, + "loss": 1.4996, + "step": 4694 + }, + { + "epoch": 0.8099715345467092, + "grad_norm": 0.62109375, + "learning_rate": 1.3017079398878884e-05, + "loss": 1.3902, + "step": 4695 + }, + { + "epoch": 0.8101440524454412, + "grad_norm": 0.66015625, + "learning_rate": 1.3014484246699693e-05, + "loss": 1.4089, + "step": 4696 + }, + { + "epoch": 0.8103165703441733, + "grad_norm": 0.6484375, + "learning_rate": 1.3011888871188669e-05, + "loss": 1.4944, + "step": 4697 + }, + { + "epoch": 0.8104890882429052, + "grad_norm": 0.64453125, + "learning_rate": 1.3009293272538104e-05, + "loss": 1.438, + "step": 4698 + }, + { + "epoch": 0.8106616061416372, + "grad_norm": 0.578125, + "learning_rate": 1.3006697450940284e-05, + "loss": 1.4211, + "step": 4699 + }, + { + "epoch": 0.8108341240403691, + "grad_norm": 0.93359375, + "learning_rate": 1.3004101406587535e-05, + "loss": 1.3598, + "step": 4700 + }, + { + "epoch": 0.8108341240403691, + "eval_loss": 1.4166769981384277, + "eval_runtime": 10.9127, + "eval_samples_per_second": 93.836, + "eval_steps_per_second": 23.459, + "step": 4700 + }, + { + "epoch": 0.8110066419391012, + "grad_norm": 0.62890625, + "learning_rate": 1.300150513967218e-05, + "loss": 1.4421, + "step": 4701 + }, + { + "epoch": 0.8111791598378332, + "grad_norm": 0.69921875, + "learning_rate": 1.2998908650386573e-05, + "loss": 1.4649, + "step": 4702 + }, + { + "epoch": 0.8113516777365651, + "grad_norm": 0.68359375, + "learning_rate": 1.2996311938923074e-05, + "loss": 1.5004, + "step": 4703 + }, + { + "epoch": 0.8115241956352972, + "grad_norm": 0.671875, + "learning_rate": 1.299371500547406e-05, + "loss": 1.3757, + "step": 4704 + }, + { + "epoch": 0.8116967135340292, + "grad_norm": 0.68359375, + "learning_rate": 1.299111785023194e-05, + "loss": 1.471, + "step": 4705 + }, + { + "epoch": 0.8118692314327611, + "grad_norm": 0.5859375, + "learning_rate": 1.2988520473389117e-05, + "loss": 1.4475, + "step": 4706 + }, + { + "epoch": 0.8120417493314931, + "grad_norm": 0.56640625, + "learning_rate": 1.2985922875138025e-05, + "loss": 1.417, + "step": 4707 + }, + { + "epoch": 0.8122142672302252, + "grad_norm": 0.66015625, + "learning_rate": 1.2983325055671108e-05, + "loss": 1.4515, + "step": 4708 + }, + { + "epoch": 0.8123867851289571, + "grad_norm": 0.63671875, + "learning_rate": 1.2980727015180833e-05, + "loss": 1.4563, + "step": 4709 + }, + { + "epoch": 0.8125593030276891, + "grad_norm": 0.92578125, + "learning_rate": 1.2978128753859674e-05, + "loss": 1.4191, + "step": 4710 + }, + { + "epoch": 0.8127318209264212, + "grad_norm": 0.6171875, + "learning_rate": 1.2975530271900127e-05, + "loss": 1.3702, + "step": 4711 + }, + { + "epoch": 0.8129043388251531, + "grad_norm": 0.69140625, + "learning_rate": 1.2972931569494707e-05, + "loss": 1.4901, + "step": 4712 + }, + { + "epoch": 0.8130768567238851, + "grad_norm": 0.7734375, + "learning_rate": 1.2970332646835942e-05, + "loss": 1.4811, + "step": 4713 + }, + { + "epoch": 0.8132493746226171, + "grad_norm": 0.6328125, + "learning_rate": 1.2967733504116375e-05, + "loss": 1.4553, + "step": 4714 + }, + { + "epoch": 0.8134218925213491, + "grad_norm": 0.578125, + "learning_rate": 1.2965134141528565e-05, + "loss": 1.4019, + "step": 4715 + }, + { + "epoch": 0.8135944104200811, + "grad_norm": 0.61328125, + "learning_rate": 1.2962534559265092e-05, + "loss": 1.435, + "step": 4716 + }, + { + "epoch": 0.813766928318813, + "grad_norm": 0.765625, + "learning_rate": 1.2959934757518548e-05, + "loss": 1.4376, + "step": 4717 + }, + { + "epoch": 0.8139394462175451, + "grad_norm": 0.59375, + "learning_rate": 1.2957334736481544e-05, + "loss": 1.4521, + "step": 4718 + }, + { + "epoch": 0.8141119641162771, + "grad_norm": 0.73046875, + "learning_rate": 1.2954734496346704e-05, + "loss": 1.4023, + "step": 4719 + }, + { + "epoch": 0.814284482015009, + "grad_norm": 0.87109375, + "learning_rate": 1.2952134037306667e-05, + "loss": 1.4733, + "step": 4720 + }, + { + "epoch": 0.8144569999137411, + "grad_norm": 0.59375, + "learning_rate": 1.2949533359554099e-05, + "loss": 1.4591, + "step": 4721 + }, + { + "epoch": 0.8146295178124731, + "grad_norm": 0.6328125, + "learning_rate": 1.2946932463281667e-05, + "loss": 1.4012, + "step": 4722 + }, + { + "epoch": 0.814802035711205, + "grad_norm": 0.83203125, + "learning_rate": 1.2944331348682067e-05, + "loss": 1.476, + "step": 4723 + }, + { + "epoch": 0.814974553609937, + "grad_norm": 0.6015625, + "learning_rate": 1.2941730015948001e-05, + "loss": 1.4007, + "step": 4724 + }, + { + "epoch": 0.815147071508669, + "grad_norm": 0.63671875, + "learning_rate": 1.2939128465272197e-05, + "loss": 1.3761, + "step": 4725 + }, + { + "epoch": 0.815319589407401, + "grad_norm": 0.640625, + "learning_rate": 1.2936526696847392e-05, + "loss": 1.3495, + "step": 4726 + }, + { + "epoch": 0.815492107306133, + "grad_norm": 0.6171875, + "learning_rate": 1.2933924710866342e-05, + "loss": 1.4033, + "step": 4727 + }, + { + "epoch": 0.815664625204865, + "grad_norm": 0.5859375, + "learning_rate": 1.2931322507521813e-05, + "loss": 1.456, + "step": 4728 + }, + { + "epoch": 0.815837143103597, + "grad_norm": 0.56640625, + "learning_rate": 1.29287200870066e-05, + "loss": 1.3995, + "step": 4729 + }, + { + "epoch": 0.816009661002329, + "grad_norm": 0.6640625, + "learning_rate": 1.2926117449513505e-05, + "loss": 1.4896, + "step": 4730 + }, + { + "epoch": 0.816182178901061, + "grad_norm": 0.7265625, + "learning_rate": 1.2923514595235345e-05, + "loss": 1.4255, + "step": 4731 + }, + { + "epoch": 0.816354696799793, + "grad_norm": 0.671875, + "learning_rate": 1.2920911524364954e-05, + "loss": 1.4791, + "step": 4732 + }, + { + "epoch": 0.816527214698525, + "grad_norm": 0.57421875, + "learning_rate": 1.291830823709519e-05, + "loss": 1.4537, + "step": 4733 + }, + { + "epoch": 0.8166997325972569, + "grad_norm": 0.69140625, + "learning_rate": 1.2915704733618917e-05, + "loss": 1.5038, + "step": 4734 + }, + { + "epoch": 0.816872250495989, + "grad_norm": 0.84375, + "learning_rate": 1.2913101014129017e-05, + "loss": 1.5189, + "step": 4735 + }, + { + "epoch": 0.817044768394721, + "grad_norm": 0.62109375, + "learning_rate": 1.291049707881839e-05, + "loss": 1.3664, + "step": 4736 + }, + { + "epoch": 0.8172172862934529, + "grad_norm": 0.6875, + "learning_rate": 1.2907892927879959e-05, + "loss": 1.3988, + "step": 4737 + }, + { + "epoch": 0.817389804192185, + "grad_norm": 0.62890625, + "learning_rate": 1.2905288561506649e-05, + "loss": 1.4323, + "step": 4738 + }, + { + "epoch": 0.817562322090917, + "grad_norm": 0.60546875, + "learning_rate": 1.2902683979891406e-05, + "loss": 1.4768, + "step": 4739 + }, + { + "epoch": 0.8177348399896489, + "grad_norm": 0.640625, + "learning_rate": 1.29000791832272e-05, + "loss": 1.4621, + "step": 4740 + }, + { + "epoch": 0.8179073578883809, + "grad_norm": 0.65234375, + "learning_rate": 1.2897474171707008e-05, + "loss": 1.3043, + "step": 4741 + }, + { + "epoch": 0.8180798757871129, + "grad_norm": 0.68359375, + "learning_rate": 1.2894868945523826e-05, + "loss": 1.4202, + "step": 4742 + }, + { + "epoch": 0.8182523936858449, + "grad_norm": 0.6484375, + "learning_rate": 1.2892263504870662e-05, + "loss": 1.4528, + "step": 4743 + }, + { + "epoch": 0.8184249115845769, + "grad_norm": 0.65625, + "learning_rate": 1.2889657849940547e-05, + "loss": 1.4524, + "step": 4744 + }, + { + "epoch": 0.8185974294833089, + "grad_norm": 0.63671875, + "learning_rate": 1.2887051980926522e-05, + "loss": 1.4295, + "step": 4745 + }, + { + "epoch": 0.8187699473820409, + "grad_norm": 0.68359375, + "learning_rate": 1.2884445898021649e-05, + "loss": 1.4305, + "step": 4746 + }, + { + "epoch": 0.8189424652807729, + "grad_norm": 0.59765625, + "learning_rate": 1.2881839601419e-05, + "loss": 1.5478, + "step": 4747 + }, + { + "epoch": 0.8191149831795048, + "grad_norm": 0.5859375, + "learning_rate": 1.2879233091311667e-05, + "loss": 1.6081, + "step": 4748 + }, + { + "epoch": 0.8192875010782369, + "grad_norm": 0.6171875, + "learning_rate": 1.2876626367892754e-05, + "loss": 1.4978, + "step": 4749 + }, + { + "epoch": 0.8194600189769689, + "grad_norm": 0.62109375, + "learning_rate": 1.2874019431355392e-05, + "loss": 1.4072, + "step": 4750 + }, + { + "epoch": 0.8196325368757008, + "grad_norm": 0.57421875, + "learning_rate": 1.2871412281892705e-05, + "loss": 1.4681, + "step": 4751 + }, + { + "epoch": 0.8198050547744329, + "grad_norm": 0.58984375, + "learning_rate": 1.2868804919697858e-05, + "loss": 1.4266, + "step": 4752 + }, + { + "epoch": 0.8199775726731648, + "grad_norm": 0.58203125, + "learning_rate": 1.286619734496402e-05, + "loss": 1.4385, + "step": 4753 + }, + { + "epoch": 0.8201500905718968, + "grad_norm": 0.6875, + "learning_rate": 1.2863589557884371e-05, + "loss": 1.5308, + "step": 4754 + }, + { + "epoch": 0.8203226084706289, + "grad_norm": 0.62890625, + "learning_rate": 1.2860981558652114e-05, + "loss": 1.3169, + "step": 4755 + }, + { + "epoch": 0.8204951263693608, + "grad_norm": 2.109375, + "learning_rate": 1.2858373347460469e-05, + "loss": 1.4768, + "step": 4756 + }, + { + "epoch": 0.8206676442680928, + "grad_norm": 0.765625, + "learning_rate": 1.2855764924502665e-05, + "loss": 1.4356, + "step": 4757 + }, + { + "epoch": 0.8208401621668248, + "grad_norm": 0.61328125, + "learning_rate": 1.2853156289971955e-05, + "loss": 1.5222, + "step": 4758 + }, + { + "epoch": 0.8210126800655568, + "grad_norm": 0.5859375, + "learning_rate": 1.2850547444061597e-05, + "loss": 1.523, + "step": 4759 + }, + { + "epoch": 0.8211851979642888, + "grad_norm": 0.65625, + "learning_rate": 1.2847938386964871e-05, + "loss": 1.4683, + "step": 4760 + }, + { + "epoch": 0.8213577158630208, + "grad_norm": 0.62109375, + "learning_rate": 1.2845329118875079e-05, + "loss": 1.4199, + "step": 4761 + }, + { + "epoch": 0.8215302337617528, + "grad_norm": 0.65625, + "learning_rate": 1.2842719639985525e-05, + "loss": 1.4601, + "step": 4762 + }, + { + "epoch": 0.8217027516604848, + "grad_norm": 0.56640625, + "learning_rate": 1.2840109950489538e-05, + "loss": 1.3938, + "step": 4763 + }, + { + "epoch": 0.8218752695592167, + "grad_norm": 1.109375, + "learning_rate": 1.2837500050580463e-05, + "loss": 1.3915, + "step": 4764 + }, + { + "epoch": 0.8220477874579487, + "grad_norm": 0.69921875, + "learning_rate": 1.2834889940451652e-05, + "loss": 1.3742, + "step": 4765 + }, + { + "epoch": 0.8222203053566808, + "grad_norm": 0.62109375, + "learning_rate": 1.2832279620296481e-05, + "loss": 1.4963, + "step": 4766 + }, + { + "epoch": 0.8223928232554127, + "grad_norm": 0.60546875, + "learning_rate": 1.2829669090308339e-05, + "loss": 1.4451, + "step": 4767 + }, + { + "epoch": 0.8225653411541447, + "grad_norm": 0.66015625, + "learning_rate": 1.2827058350680632e-05, + "loss": 1.4309, + "step": 4768 + }, + { + "epoch": 0.8227378590528768, + "grad_norm": 0.59375, + "learning_rate": 1.2824447401606776e-05, + "loss": 1.5286, + "step": 4769 + }, + { + "epoch": 0.8229103769516087, + "grad_norm": 0.72265625, + "learning_rate": 1.2821836243280209e-05, + "loss": 1.41, + "step": 4770 + }, + { + "epoch": 0.8230828948503407, + "grad_norm": 0.73828125, + "learning_rate": 1.281922487589438e-05, + "loss": 1.5432, + "step": 4771 + }, + { + "epoch": 0.8232554127490728, + "grad_norm": 0.60546875, + "learning_rate": 1.2816613299642758e-05, + "loss": 1.4329, + "step": 4772 + }, + { + "epoch": 0.8234279306478047, + "grad_norm": 0.62109375, + "learning_rate": 1.2814001514718824e-05, + "loss": 1.4705, + "step": 4773 + }, + { + "epoch": 0.8236004485465367, + "grad_norm": 0.73046875, + "learning_rate": 1.2811389521316077e-05, + "loss": 1.4188, + "step": 4774 + }, + { + "epoch": 0.8237729664452687, + "grad_norm": 0.77734375, + "learning_rate": 1.2808777319628025e-05, + "loss": 1.4128, + "step": 4775 + }, + { + "epoch": 0.8239454843440007, + "grad_norm": 0.609375, + "learning_rate": 1.28061649098482e-05, + "loss": 1.4184, + "step": 4776 + }, + { + "epoch": 0.8241180022427327, + "grad_norm": 0.64453125, + "learning_rate": 1.2803552292170145e-05, + "loss": 1.4397, + "step": 4777 + }, + { + "epoch": 0.8242905201414646, + "grad_norm": 0.7578125, + "learning_rate": 1.280093946678742e-05, + "loss": 1.5588, + "step": 4778 + }, + { + "epoch": 0.8244630380401967, + "grad_norm": 0.6796875, + "learning_rate": 1.2798326433893598e-05, + "loss": 1.5087, + "step": 4779 + }, + { + "epoch": 0.8246355559389287, + "grad_norm": 0.92578125, + "learning_rate": 1.2795713193682266e-05, + "loss": 1.4846, + "step": 4780 + }, + { + "epoch": 0.8248080738376606, + "grad_norm": 0.6328125, + "learning_rate": 1.2793099746347034e-05, + "loss": 1.448, + "step": 4781 + }, + { + "epoch": 0.8249805917363926, + "grad_norm": 0.640625, + "learning_rate": 1.2790486092081522e-05, + "loss": 1.4624, + "step": 4782 + }, + { + "epoch": 0.8251531096351247, + "grad_norm": 0.62109375, + "learning_rate": 1.2787872231079363e-05, + "loss": 1.4425, + "step": 4783 + }, + { + "epoch": 0.8253256275338566, + "grad_norm": 0.59765625, + "learning_rate": 1.2785258163534211e-05, + "loss": 1.4563, + "step": 4784 + }, + { + "epoch": 0.8254981454325886, + "grad_norm": 0.6171875, + "learning_rate": 1.2782643889639727e-05, + "loss": 1.472, + "step": 4785 + }, + { + "epoch": 0.8256706633313207, + "grad_norm": 0.5859375, + "learning_rate": 1.2780029409589603e-05, + "loss": 1.4048, + "step": 4786 + }, + { + "epoch": 0.8258431812300526, + "grad_norm": 0.6875, + "learning_rate": 1.2777414723577527e-05, + "loss": 1.5023, + "step": 4787 + }, + { + "epoch": 0.8260156991287846, + "grad_norm": 0.640625, + "learning_rate": 1.2774799831797214e-05, + "loss": 1.4089, + "step": 4788 + }, + { + "epoch": 0.8261882170275167, + "grad_norm": 0.60546875, + "learning_rate": 1.277218473444239e-05, + "loss": 1.4833, + "step": 4789 + }, + { + "epoch": 0.8263607349262486, + "grad_norm": 0.70703125, + "learning_rate": 1.2769569431706804e-05, + "loss": 1.4398, + "step": 4790 + }, + { + "epoch": 0.8265332528249806, + "grad_norm": 0.60546875, + "learning_rate": 1.2766953923784207e-05, + "loss": 1.4302, + "step": 4791 + }, + { + "epoch": 0.8267057707237125, + "grad_norm": 0.64453125, + "learning_rate": 1.2764338210868372e-05, + "loss": 1.4379, + "step": 4792 + }, + { + "epoch": 0.8268782886224446, + "grad_norm": 0.59765625, + "learning_rate": 1.2761722293153094e-05, + "loss": 1.3314, + "step": 4793 + }, + { + "epoch": 0.8270508065211766, + "grad_norm": 0.8046875, + "learning_rate": 1.2759106170832172e-05, + "loss": 1.4575, + "step": 4794 + }, + { + "epoch": 0.8272233244199085, + "grad_norm": 0.71484375, + "learning_rate": 1.2756489844099424e-05, + "loss": 1.4598, + "step": 4795 + }, + { + "epoch": 0.8273958423186406, + "grad_norm": 0.609375, + "learning_rate": 1.2753873313148683e-05, + "loss": 1.3463, + "step": 4796 + }, + { + "epoch": 0.8275683602173726, + "grad_norm": 0.64453125, + "learning_rate": 1.2751256578173803e-05, + "loss": 1.4494, + "step": 4797 + }, + { + "epoch": 0.8277408781161045, + "grad_norm": 0.6875, + "learning_rate": 1.2748639639368645e-05, + "loss": 1.5213, + "step": 4798 + }, + { + "epoch": 0.8279133960148365, + "grad_norm": 0.67578125, + "learning_rate": 1.2746022496927086e-05, + "loss": 1.3503, + "step": 4799 + }, + { + "epoch": 0.8280859139135686, + "grad_norm": 0.6796875, + "learning_rate": 1.2743405151043025e-05, + "loss": 1.4791, + "step": 4800 + }, + { + "epoch": 0.8280859139135686, + "eval_loss": 1.4158532619476318, + "eval_runtime": 10.8421, + "eval_samples_per_second": 94.447, + "eval_steps_per_second": 23.612, + "step": 4800 + }, + { + "epoch": 0.8282584318123005, + "grad_norm": 0.5859375, + "learning_rate": 1.2740787601910365e-05, + "loss": 1.495, + "step": 4801 + }, + { + "epoch": 0.8284309497110325, + "grad_norm": 0.6484375, + "learning_rate": 1.2738169849723039e-05, + "loss": 1.428, + "step": 4802 + }, + { + "epoch": 0.8286034676097646, + "grad_norm": 0.62890625, + "learning_rate": 1.2735551894674978e-05, + "loss": 1.4787, + "step": 4803 + }, + { + "epoch": 0.8287759855084965, + "grad_norm": 0.609375, + "learning_rate": 1.2732933736960143e-05, + "loss": 1.4055, + "step": 4804 + }, + { + "epoch": 0.8289485034072285, + "grad_norm": 0.7578125, + "learning_rate": 1.2730315376772498e-05, + "loss": 1.3993, + "step": 4805 + }, + { + "epoch": 0.8291210213059605, + "grad_norm": 0.7109375, + "learning_rate": 1.2727696814306034e-05, + "loss": 1.343, + "step": 4806 + }, + { + "epoch": 0.8292935392046925, + "grad_norm": 0.671875, + "learning_rate": 1.2725078049754742e-05, + "loss": 1.4253, + "step": 4807 + }, + { + "epoch": 0.8294660571034245, + "grad_norm": 0.62109375, + "learning_rate": 1.2722459083312645e-05, + "loss": 1.3983, + "step": 4808 + }, + { + "epoch": 0.8296385750021564, + "grad_norm": 0.6171875, + "learning_rate": 1.2719839915173764e-05, + "loss": 1.3923, + "step": 4809 + }, + { + "epoch": 0.8298110929008885, + "grad_norm": 0.69140625, + "learning_rate": 1.2717220545532151e-05, + "loss": 1.3616, + "step": 4810 + }, + { + "epoch": 0.8299836107996205, + "grad_norm": 0.6328125, + "learning_rate": 1.271460097458186e-05, + "loss": 1.3973, + "step": 4811 + }, + { + "epoch": 0.8301561286983524, + "grad_norm": 0.6953125, + "learning_rate": 1.271198120251697e-05, + "loss": 1.3502, + "step": 4812 + }, + { + "epoch": 0.8303286465970845, + "grad_norm": 0.67578125, + "learning_rate": 1.2709361229531565e-05, + "loss": 1.3808, + "step": 4813 + }, + { + "epoch": 0.8305011644958165, + "grad_norm": 0.65625, + "learning_rate": 1.2706741055819753e-05, + "loss": 1.4364, + "step": 4814 + }, + { + "epoch": 0.8306736823945484, + "grad_norm": 0.640625, + "learning_rate": 1.270412068157565e-05, + "loss": 1.3211, + "step": 4815 + }, + { + "epoch": 0.8308462002932804, + "grad_norm": 0.72265625, + "learning_rate": 1.2701500106993389e-05, + "loss": 1.4494, + "step": 4816 + }, + { + "epoch": 0.8310187181920125, + "grad_norm": 0.6875, + "learning_rate": 1.269887933226712e-05, + "loss": 1.4341, + "step": 4817 + }, + { + "epoch": 0.8311912360907444, + "grad_norm": 0.703125, + "learning_rate": 1.269625835759101e-05, + "loss": 1.3866, + "step": 4818 + }, + { + "epoch": 0.8313637539894764, + "grad_norm": 0.7109375, + "learning_rate": 1.2693637183159231e-05, + "loss": 1.4771, + "step": 4819 + }, + { + "epoch": 0.8315362718882084, + "grad_norm": 0.83203125, + "learning_rate": 1.269101580916598e-05, + "loss": 1.4753, + "step": 4820 + }, + { + "epoch": 0.8317087897869404, + "grad_norm": 0.65234375, + "learning_rate": 1.2688394235805466e-05, + "loss": 1.4644, + "step": 4821 + }, + { + "epoch": 0.8318813076856724, + "grad_norm": 0.625, + "learning_rate": 1.2685772463271909e-05, + "loss": 1.4329, + "step": 4822 + }, + { + "epoch": 0.8320538255844043, + "grad_norm": 0.8203125, + "learning_rate": 1.2683150491759544e-05, + "loss": 1.4185, + "step": 4823 + }, + { + "epoch": 0.8322263434831364, + "grad_norm": 0.77734375, + "learning_rate": 1.2680528321462624e-05, + "loss": 1.4757, + "step": 4824 + }, + { + "epoch": 0.8323988613818684, + "grad_norm": 0.58203125, + "learning_rate": 1.2677905952575417e-05, + "loss": 1.4213, + "step": 4825 + }, + { + "epoch": 0.8325713792806003, + "grad_norm": 0.6796875, + "learning_rate": 1.2675283385292212e-05, + "loss": 1.4002, + "step": 4826 + }, + { + "epoch": 0.8327438971793324, + "grad_norm": 0.7734375, + "learning_rate": 1.2672660619807291e-05, + "loss": 1.3573, + "step": 4827 + }, + { + "epoch": 0.8329164150780644, + "grad_norm": 0.7421875, + "learning_rate": 1.2670037656314973e-05, + "loss": 1.3828, + "step": 4828 + }, + { + "epoch": 0.8330889329767963, + "grad_norm": 0.63671875, + "learning_rate": 1.2667414495009583e-05, + "loss": 1.394, + "step": 4829 + }, + { + "epoch": 0.8332614508755284, + "grad_norm": 0.58984375, + "learning_rate": 1.2664791136085462e-05, + "loss": 1.4278, + "step": 4830 + }, + { + "epoch": 0.8334339687742603, + "grad_norm": 0.64453125, + "learning_rate": 1.2662167579736961e-05, + "loss": 1.4386, + "step": 4831 + }, + { + "epoch": 0.8336064866729923, + "grad_norm": 0.66796875, + "learning_rate": 1.2659543826158451e-05, + "loss": 1.3753, + "step": 4832 + }, + { + "epoch": 0.8337790045717243, + "grad_norm": 0.62890625, + "learning_rate": 1.2656919875544316e-05, + "loss": 1.3979, + "step": 4833 + }, + { + "epoch": 0.8339515224704563, + "grad_norm": 0.62890625, + "learning_rate": 1.2654295728088959e-05, + "loss": 1.5161, + "step": 4834 + }, + { + "epoch": 0.8341240403691883, + "grad_norm": 0.64453125, + "learning_rate": 1.2651671383986788e-05, + "loss": 1.4717, + "step": 4835 + }, + { + "epoch": 0.8342965582679203, + "grad_norm": 0.68359375, + "learning_rate": 1.2649046843432232e-05, + "loss": 1.4729, + "step": 4836 + }, + { + "epoch": 0.8344690761666523, + "grad_norm": 0.5546875, + "learning_rate": 1.2646422106619733e-05, + "loss": 1.4337, + "step": 4837 + }, + { + "epoch": 0.8346415940653843, + "grad_norm": 0.62109375, + "learning_rate": 1.2643797173743753e-05, + "loss": 1.3985, + "step": 4838 + }, + { + "epoch": 0.8348141119641163, + "grad_norm": 3.234375, + "learning_rate": 1.2641172044998754e-05, + "loss": 1.3862, + "step": 4839 + }, + { + "epoch": 0.8349866298628482, + "grad_norm": 0.69921875, + "learning_rate": 1.263854672057923e-05, + "loss": 1.2768, + "step": 4840 + }, + { + "epoch": 0.8351591477615803, + "grad_norm": 0.65234375, + "learning_rate": 1.2635921200679677e-05, + "loss": 1.4262, + "step": 4841 + }, + { + "epoch": 0.8353316656603123, + "grad_norm": 0.65234375, + "learning_rate": 1.2633295485494614e-05, + "loss": 1.3765, + "step": 4842 + }, + { + "epoch": 0.8355041835590442, + "grad_norm": 0.73046875, + "learning_rate": 1.2630669575218568e-05, + "loss": 1.3913, + "step": 4843 + }, + { + "epoch": 0.8356767014577763, + "grad_norm": 0.7109375, + "learning_rate": 1.2628043470046078e-05, + "loss": 1.4059, + "step": 4844 + }, + { + "epoch": 0.8358492193565082, + "grad_norm": 0.65234375, + "learning_rate": 1.2625417170171712e-05, + "loss": 1.4799, + "step": 4845 + }, + { + "epoch": 0.8360217372552402, + "grad_norm": 0.69140625, + "learning_rate": 1.2622790675790033e-05, + "loss": 1.3965, + "step": 4846 + }, + { + "epoch": 0.8361942551539723, + "grad_norm": 0.62890625, + "learning_rate": 1.2620163987095637e-05, + "loss": 1.4537, + "step": 4847 + }, + { + "epoch": 0.8363667730527042, + "grad_norm": 0.6015625, + "learning_rate": 1.2617537104283119e-05, + "loss": 1.3701, + "step": 4848 + }, + { + "epoch": 0.8365392909514362, + "grad_norm": 0.59375, + "learning_rate": 1.2614910027547096e-05, + "loss": 1.3818, + "step": 4849 + }, + { + "epoch": 0.8367118088501682, + "grad_norm": 0.6640625, + "learning_rate": 1.2612282757082204e-05, + "loss": 1.5375, + "step": 4850 + }, + { + "epoch": 0.8368843267489002, + "grad_norm": 0.68359375, + "learning_rate": 1.2609655293083079e-05, + "loss": 1.4179, + "step": 4851 + }, + { + "epoch": 0.8370568446476322, + "grad_norm": 0.68359375, + "learning_rate": 1.2607027635744384e-05, + "loss": 1.3935, + "step": 4852 + }, + { + "epoch": 0.8372293625463642, + "grad_norm": 0.7265625, + "learning_rate": 1.2604399785260794e-05, + "loss": 1.4925, + "step": 4853 + }, + { + "epoch": 0.8374018804450962, + "grad_norm": 0.69921875, + "learning_rate": 1.2601771741826996e-05, + "loss": 1.4307, + "step": 4854 + }, + { + "epoch": 0.8375743983438282, + "grad_norm": 0.72265625, + "learning_rate": 1.259914350563769e-05, + "loss": 1.3686, + "step": 4855 + }, + { + "epoch": 0.8377469162425601, + "grad_norm": 0.60546875, + "learning_rate": 1.259651507688759e-05, + "loss": 1.3841, + "step": 4856 + }, + { + "epoch": 0.8379194341412921, + "grad_norm": 0.703125, + "learning_rate": 1.259388645577143e-05, + "loss": 1.3666, + "step": 4857 + }, + { + "epoch": 0.8380919520400242, + "grad_norm": 0.6328125, + "learning_rate": 1.2591257642483959e-05, + "loss": 1.532, + "step": 4858 + }, + { + "epoch": 0.8382644699387561, + "grad_norm": 0.7109375, + "learning_rate": 1.2588628637219927e-05, + "loss": 1.4264, + "step": 4859 + }, + { + "epoch": 0.8384369878374881, + "grad_norm": 0.6171875, + "learning_rate": 1.258599944017411e-05, + "loss": 1.4054, + "step": 4860 + }, + { + "epoch": 0.8386095057362202, + "grad_norm": 0.640625, + "learning_rate": 1.2583370051541298e-05, + "loss": 1.3527, + "step": 4861 + }, + { + "epoch": 0.8387820236349521, + "grad_norm": 0.6875, + "learning_rate": 1.258074047151629e-05, + "loss": 1.4412, + "step": 4862 + }, + { + "epoch": 0.8389545415336841, + "grad_norm": 0.76171875, + "learning_rate": 1.2578110700293906e-05, + "loss": 1.6089, + "step": 4863 + }, + { + "epoch": 0.8391270594324162, + "grad_norm": 0.68359375, + "learning_rate": 1.2575480738068971e-05, + "loss": 1.4927, + "step": 4864 + }, + { + "epoch": 0.8392995773311481, + "grad_norm": 0.65625, + "learning_rate": 1.2572850585036325e-05, + "loss": 1.3671, + "step": 4865 + }, + { + "epoch": 0.8394720952298801, + "grad_norm": 0.7421875, + "learning_rate": 1.2570220241390838e-05, + "loss": 1.3574, + "step": 4866 + }, + { + "epoch": 0.839644613128612, + "grad_norm": 0.671875, + "learning_rate": 1.2567589707327375e-05, + "loss": 1.4596, + "step": 4867 + }, + { + "epoch": 0.8398171310273441, + "grad_norm": 0.671875, + "learning_rate": 1.256495898304082e-05, + "loss": 1.5785, + "step": 4868 + }, + { + "epoch": 0.8399896489260761, + "grad_norm": 0.67578125, + "learning_rate": 1.2562328068726081e-05, + "loss": 1.3818, + "step": 4869 + }, + { + "epoch": 0.840162166824808, + "grad_norm": 0.59375, + "learning_rate": 1.2559696964578068e-05, + "loss": 1.4781, + "step": 4870 + }, + { + "epoch": 0.8403346847235401, + "grad_norm": 0.640625, + "learning_rate": 1.2557065670791708e-05, + "loss": 1.3785, + "step": 4871 + }, + { + "epoch": 0.8405072026222721, + "grad_norm": 0.59765625, + "learning_rate": 1.2554434187561949e-05, + "loss": 1.4297, + "step": 4872 + }, + { + "epoch": 0.840679720521004, + "grad_norm": 0.60546875, + "learning_rate": 1.2551802515083742e-05, + "loss": 1.5078, + "step": 4873 + }, + { + "epoch": 0.840852238419736, + "grad_norm": 0.63671875, + "learning_rate": 1.2549170653552062e-05, + "loss": 1.6393, + "step": 4874 + }, + { + "epoch": 0.8410247563184681, + "grad_norm": 0.6484375, + "learning_rate": 1.254653860316189e-05, + "loss": 1.4881, + "step": 4875 + }, + { + "epoch": 0.8411972742172, + "grad_norm": 0.6015625, + "learning_rate": 1.2543906364108227e-05, + "loss": 1.4819, + "step": 4876 + }, + { + "epoch": 0.841369792115932, + "grad_norm": 0.60546875, + "learning_rate": 1.2541273936586088e-05, + "loss": 1.288, + "step": 4877 + }, + { + "epoch": 0.8415423100146641, + "grad_norm": 0.67578125, + "learning_rate": 1.2538641320790494e-05, + "loss": 1.5058, + "step": 4878 + }, + { + "epoch": 0.841714827913396, + "grad_norm": 0.61328125, + "learning_rate": 1.2536008516916491e-05, + "loss": 1.5025, + "step": 4879 + }, + { + "epoch": 0.841887345812128, + "grad_norm": 0.60546875, + "learning_rate": 1.253337552515913e-05, + "loss": 1.5237, + "step": 4880 + }, + { + "epoch": 0.8420598637108601, + "grad_norm": 0.609375, + "learning_rate": 1.253074234571348e-05, + "loss": 1.3913, + "step": 4881 + }, + { + "epoch": 0.842232381609592, + "grad_norm": 0.60546875, + "learning_rate": 1.2528108978774627e-05, + "loss": 1.4599, + "step": 4882 + }, + { + "epoch": 0.842404899508324, + "grad_norm": 0.6171875, + "learning_rate": 1.2525475424537664e-05, + "loss": 1.468, + "step": 4883 + }, + { + "epoch": 0.8425774174070559, + "grad_norm": 0.6640625, + "learning_rate": 1.25228416831977e-05, + "loss": 1.5137, + "step": 4884 + }, + { + "epoch": 0.842749935305788, + "grad_norm": 0.6484375, + "learning_rate": 1.2520207754949861e-05, + "loss": 1.3381, + "step": 4885 + }, + { + "epoch": 0.84292245320452, + "grad_norm": 0.6171875, + "learning_rate": 1.2517573639989284e-05, + "loss": 1.449, + "step": 4886 + }, + { + "epoch": 0.8430949711032519, + "grad_norm": 0.62890625, + "learning_rate": 1.2514939338511123e-05, + "loss": 1.486, + "step": 4887 + }, + { + "epoch": 0.843267489001984, + "grad_norm": 0.60546875, + "learning_rate": 1.2512304850710542e-05, + "loss": 1.5363, + "step": 4888 + }, + { + "epoch": 0.843440006900716, + "grad_norm": 0.6484375, + "learning_rate": 1.2509670176782718e-05, + "loss": 1.3917, + "step": 4889 + }, + { + "epoch": 0.8436125247994479, + "grad_norm": 0.60546875, + "learning_rate": 1.2507035316922847e-05, + "loss": 1.4611, + "step": 4890 + }, + { + "epoch": 0.8437850426981799, + "grad_norm": 0.5546875, + "learning_rate": 1.2504400271326136e-05, + "loss": 1.3827, + "step": 4891 + }, + { + "epoch": 0.843957560596912, + "grad_norm": 0.70703125, + "learning_rate": 1.2501765040187804e-05, + "loss": 1.4403, + "step": 4892 + }, + { + "epoch": 0.8441300784956439, + "grad_norm": 0.57421875, + "learning_rate": 1.2499129623703086e-05, + "loss": 1.4834, + "step": 4893 + }, + { + "epoch": 0.8443025963943759, + "grad_norm": 0.6328125, + "learning_rate": 1.249649402206723e-05, + "loss": 1.4435, + "step": 4894 + }, + { + "epoch": 0.844475114293108, + "grad_norm": 0.671875, + "learning_rate": 1.2493858235475497e-05, + "loss": 1.4356, + "step": 4895 + }, + { + "epoch": 0.8446476321918399, + "grad_norm": 0.59375, + "learning_rate": 1.2491222264123166e-05, + "loss": 1.4481, + "step": 4896 + }, + { + "epoch": 0.8448201500905719, + "grad_norm": 0.6171875, + "learning_rate": 1.2488586108205521e-05, + "loss": 1.497, + "step": 4897 + }, + { + "epoch": 0.8449926679893038, + "grad_norm": 0.640625, + "learning_rate": 1.2485949767917869e-05, + "loss": 1.3873, + "step": 4898 + }, + { + "epoch": 0.8451651858880359, + "grad_norm": 0.8515625, + "learning_rate": 1.2483313243455526e-05, + "loss": 1.4457, + "step": 4899 + }, + { + "epoch": 0.8453377037867679, + "grad_norm": 0.70703125, + "learning_rate": 1.2480676535013821e-05, + "loss": 1.3509, + "step": 4900 + }, + { + "epoch": 0.8453377037867679, + "eval_loss": 1.4150875806808472, + "eval_runtime": 10.8763, + "eval_samples_per_second": 94.15, + "eval_steps_per_second": 23.537, + "step": 4900 + }, + { + "epoch": 0.8455102216854998, + "grad_norm": 0.71875, + "learning_rate": 1.2478039642788097e-05, + "loss": 1.3508, + "step": 4901 + }, + { + "epoch": 0.8456827395842319, + "grad_norm": 0.58203125, + "learning_rate": 1.2475402566973715e-05, + "loss": 1.431, + "step": 4902 + }, + { + "epoch": 0.8458552574829639, + "grad_norm": 0.5859375, + "learning_rate": 1.2472765307766045e-05, + "loss": 1.5606, + "step": 4903 + }, + { + "epoch": 0.8460277753816958, + "grad_norm": 0.69921875, + "learning_rate": 1.2470127865360467e-05, + "loss": 1.4353, + "step": 4904 + }, + { + "epoch": 0.8462002932804279, + "grad_norm": 0.609375, + "learning_rate": 1.2467490239952382e-05, + "loss": 1.3749, + "step": 4905 + }, + { + "epoch": 0.8463728111791599, + "grad_norm": 0.64453125, + "learning_rate": 1.2464852431737205e-05, + "loss": 1.4545, + "step": 4906 + }, + { + "epoch": 0.8465453290778918, + "grad_norm": 0.57421875, + "learning_rate": 1.2462214440910359e-05, + "loss": 1.4226, + "step": 4907 + }, + { + "epoch": 0.8467178469766238, + "grad_norm": 0.8046875, + "learning_rate": 1.2459576267667281e-05, + "loss": 1.3848, + "step": 4908 + }, + { + "epoch": 0.8468903648753558, + "grad_norm": 0.5625, + "learning_rate": 1.2456937912203426e-05, + "loss": 1.4049, + "step": 4909 + }, + { + "epoch": 0.8470628827740878, + "grad_norm": 0.609375, + "learning_rate": 1.2454299374714258e-05, + "loss": 1.4062, + "step": 4910 + }, + { + "epoch": 0.8472354006728198, + "grad_norm": 0.90625, + "learning_rate": 1.2451660655395258e-05, + "loss": 1.4654, + "step": 4911 + }, + { + "epoch": 0.8474079185715518, + "grad_norm": 0.6796875, + "learning_rate": 1.2449021754441919e-05, + "loss": 1.4948, + "step": 4912 + }, + { + "epoch": 0.8475804364702838, + "grad_norm": 0.64453125, + "learning_rate": 1.2446382672049741e-05, + "loss": 1.4434, + "step": 4913 + }, + { + "epoch": 0.8477529543690158, + "grad_norm": 0.61328125, + "learning_rate": 1.2443743408414256e-05, + "loss": 1.4044, + "step": 4914 + }, + { + "epoch": 0.8479254722677477, + "grad_norm": 0.64453125, + "learning_rate": 1.2441103963730985e-05, + "loss": 1.4474, + "step": 4915 + }, + { + "epoch": 0.8480979901664798, + "grad_norm": 0.62890625, + "learning_rate": 1.2438464338195481e-05, + "loss": 1.4127, + "step": 4916 + }, + { + "epoch": 0.8482705080652118, + "grad_norm": 0.60546875, + "learning_rate": 1.2435824532003304e-05, + "loss": 1.4637, + "step": 4917 + }, + { + "epoch": 0.8484430259639437, + "grad_norm": 0.63671875, + "learning_rate": 1.2433184545350026e-05, + "loss": 1.4083, + "step": 4918 + }, + { + "epoch": 0.8486155438626758, + "grad_norm": 0.5625, + "learning_rate": 1.2430544378431233e-05, + "loss": 1.4099, + "step": 4919 + }, + { + "epoch": 0.8487880617614078, + "grad_norm": 0.58203125, + "learning_rate": 1.2427904031442526e-05, + "loss": 1.3869, + "step": 4920 + }, + { + "epoch": 0.8489605796601397, + "grad_norm": 0.609375, + "learning_rate": 1.2425263504579517e-05, + "loss": 1.439, + "step": 4921 + }, + { + "epoch": 0.8491330975588718, + "grad_norm": 0.58984375, + "learning_rate": 1.2422622798037833e-05, + "loss": 1.3557, + "step": 4922 + }, + { + "epoch": 0.8493056154576037, + "grad_norm": 0.58984375, + "learning_rate": 1.2419981912013116e-05, + "loss": 1.5707, + "step": 4923 + }, + { + "epoch": 0.8494781333563357, + "grad_norm": 0.625, + "learning_rate": 1.241734084670102e-05, + "loss": 1.4085, + "step": 4924 + }, + { + "epoch": 0.8496506512550677, + "grad_norm": 0.578125, + "learning_rate": 1.2414699602297206e-05, + "loss": 1.4046, + "step": 4925 + }, + { + "epoch": 0.8498231691537997, + "grad_norm": 0.6015625, + "learning_rate": 1.2412058178997362e-05, + "loss": 1.34, + "step": 4926 + }, + { + "epoch": 0.8499956870525317, + "grad_norm": 0.640625, + "learning_rate": 1.2409416576997176e-05, + "loss": 1.3569, + "step": 4927 + }, + { + "epoch": 0.8501682049512637, + "grad_norm": 0.5859375, + "learning_rate": 1.2406774796492353e-05, + "loss": 1.4416, + "step": 4928 + }, + { + "epoch": 0.8503407228499957, + "grad_norm": 0.90234375, + "learning_rate": 1.2404132837678613e-05, + "loss": 1.4531, + "step": 4929 + }, + { + "epoch": 0.8505132407487277, + "grad_norm": 0.5703125, + "learning_rate": 1.2401490700751695e-05, + "loss": 1.4516, + "step": 4930 + }, + { + "epoch": 0.8506857586474597, + "grad_norm": 0.703125, + "learning_rate": 1.239884838590734e-05, + "loss": 1.45, + "step": 4931 + }, + { + "epoch": 0.8508582765461916, + "grad_norm": 0.61328125, + "learning_rate": 1.2396205893341306e-05, + "loss": 1.5007, + "step": 4932 + }, + { + "epoch": 0.8510307944449237, + "grad_norm": 0.61328125, + "learning_rate": 1.2393563223249367e-05, + "loss": 1.4125, + "step": 4933 + }, + { + "epoch": 0.8512033123436556, + "grad_norm": 0.578125, + "learning_rate": 1.239092037582731e-05, + "loss": 1.4413, + "step": 4934 + }, + { + "epoch": 0.8513758302423876, + "grad_norm": 0.68359375, + "learning_rate": 1.2388277351270936e-05, + "loss": 1.5108, + "step": 4935 + }, + { + "epoch": 0.8515483481411197, + "grad_norm": 0.77734375, + "learning_rate": 1.2385634149776049e-05, + "loss": 1.5181, + "step": 4936 + }, + { + "epoch": 0.8517208660398516, + "grad_norm": 0.625, + "learning_rate": 1.2382990771538479e-05, + "loss": 1.4072, + "step": 4937 + }, + { + "epoch": 0.8518933839385836, + "grad_norm": 0.56640625, + "learning_rate": 1.238034721675406e-05, + "loss": 1.4797, + "step": 4938 + }, + { + "epoch": 0.8520659018373157, + "grad_norm": 0.87890625, + "learning_rate": 1.237770348561865e-05, + "loss": 1.4337, + "step": 4939 + }, + { + "epoch": 0.8522384197360476, + "grad_norm": 0.6640625, + "learning_rate": 1.237505957832811e-05, + "loss": 1.4196, + "step": 4940 + }, + { + "epoch": 0.8524109376347796, + "grad_norm": 0.81640625, + "learning_rate": 1.2372415495078314e-05, + "loss": 1.515, + "step": 4941 + }, + { + "epoch": 0.8525834555335116, + "grad_norm": 0.6484375, + "learning_rate": 1.2369771236065154e-05, + "loss": 1.361, + "step": 4942 + }, + { + "epoch": 0.8527559734322436, + "grad_norm": 0.875, + "learning_rate": 1.2367126801484537e-05, + "loss": 1.4598, + "step": 4943 + }, + { + "epoch": 0.8529284913309756, + "grad_norm": 0.72265625, + "learning_rate": 1.2364482191532371e-05, + "loss": 1.4902, + "step": 4944 + }, + { + "epoch": 0.8531010092297076, + "grad_norm": 0.5859375, + "learning_rate": 1.2361837406404592e-05, + "loss": 1.4504, + "step": 4945 + }, + { + "epoch": 0.8532735271284396, + "grad_norm": 0.62109375, + "learning_rate": 1.2359192446297143e-05, + "loss": 1.4503, + "step": 4946 + }, + { + "epoch": 0.8534460450271716, + "grad_norm": 0.65625, + "learning_rate": 1.2356547311405977e-05, + "loss": 1.4309, + "step": 4947 + }, + { + "epoch": 0.8536185629259035, + "grad_norm": 0.6484375, + "learning_rate": 1.2353902001927058e-05, + "loss": 1.3899, + "step": 4948 + }, + { + "epoch": 0.8537910808246355, + "grad_norm": 0.70703125, + "learning_rate": 1.2351256518056373e-05, + "loss": 1.371, + "step": 4949 + }, + { + "epoch": 0.8539635987233676, + "grad_norm": 0.59765625, + "learning_rate": 1.2348610859989913e-05, + "loss": 1.4957, + "step": 4950 + }, + { + "epoch": 0.8541361166220995, + "grad_norm": 0.625, + "learning_rate": 1.234596502792369e-05, + "loss": 1.351, + "step": 4951 + }, + { + "epoch": 0.8543086345208315, + "grad_norm": 0.64453125, + "learning_rate": 1.2343319022053715e-05, + "loss": 1.4467, + "step": 4952 + }, + { + "epoch": 0.8544811524195636, + "grad_norm": 0.68359375, + "learning_rate": 1.2340672842576025e-05, + "loss": 1.4147, + "step": 4953 + }, + { + "epoch": 0.8546536703182955, + "grad_norm": 0.6328125, + "learning_rate": 1.2338026489686668e-05, + "loss": 1.4445, + "step": 4954 + }, + { + "epoch": 0.8548261882170275, + "grad_norm": 0.68359375, + "learning_rate": 1.2335379963581699e-05, + "loss": 1.4351, + "step": 4955 + }, + { + "epoch": 0.8549987061157595, + "grad_norm": 0.61328125, + "learning_rate": 1.2332733264457188e-05, + "loss": 1.5089, + "step": 4956 + }, + { + "epoch": 0.8551712240144915, + "grad_norm": 0.62109375, + "learning_rate": 1.2330086392509224e-05, + "loss": 1.5096, + "step": 4957 + }, + { + "epoch": 0.8553437419132235, + "grad_norm": 0.5703125, + "learning_rate": 1.2327439347933901e-05, + "loss": 1.3495, + "step": 4958 + }, + { + "epoch": 0.8555162598119554, + "grad_norm": 0.56640625, + "learning_rate": 1.2324792130927328e-05, + "loss": 1.3313, + "step": 4959 + }, + { + "epoch": 0.8556887777106875, + "grad_norm": 0.59765625, + "learning_rate": 1.2322144741685627e-05, + "loss": 1.4532, + "step": 4960 + }, + { + "epoch": 0.8558612956094195, + "grad_norm": 0.61328125, + "learning_rate": 1.231949718040493e-05, + "loss": 1.5288, + "step": 4961 + }, + { + "epoch": 0.8560338135081514, + "grad_norm": 0.6015625, + "learning_rate": 1.2316849447281395e-05, + "loss": 1.4836, + "step": 4962 + }, + { + "epoch": 0.8562063314068835, + "grad_norm": 0.59375, + "learning_rate": 1.2314201542511176e-05, + "loss": 1.4735, + "step": 4963 + }, + { + "epoch": 0.8563788493056155, + "grad_norm": 0.62109375, + "learning_rate": 1.2311553466290447e-05, + "loss": 1.2235, + "step": 4964 + }, + { + "epoch": 0.8565513672043474, + "grad_norm": 0.703125, + "learning_rate": 1.2308905218815392e-05, + "loss": 1.4499, + "step": 4965 + }, + { + "epoch": 0.8567238851030794, + "grad_norm": 0.63671875, + "learning_rate": 1.2306256800282213e-05, + "loss": 1.4595, + "step": 4966 + }, + { + "epoch": 0.8568964030018115, + "grad_norm": 0.6015625, + "learning_rate": 1.2303608210887121e-05, + "loss": 1.4724, + "step": 4967 + }, + { + "epoch": 0.8570689209005434, + "grad_norm": 1.2109375, + "learning_rate": 1.230095945082634e-05, + "loss": 1.5173, + "step": 4968 + }, + { + "epoch": 0.8572414387992754, + "grad_norm": 0.7265625, + "learning_rate": 1.2298310520296101e-05, + "loss": 1.5255, + "step": 4969 + }, + { + "epoch": 0.8574139566980075, + "grad_norm": 0.59375, + "learning_rate": 1.2295661419492662e-05, + "loss": 1.4696, + "step": 4970 + }, + { + "epoch": 0.8575864745967394, + "grad_norm": 0.61328125, + "learning_rate": 1.2293012148612281e-05, + "loss": 1.484, + "step": 4971 + }, + { + "epoch": 0.8577589924954714, + "grad_norm": 0.70703125, + "learning_rate": 1.229036270785123e-05, + "loss": 1.4593, + "step": 4972 + }, + { + "epoch": 0.8579315103942033, + "grad_norm": 0.6796875, + "learning_rate": 1.2287713097405802e-05, + "loss": 1.4657, + "step": 4973 + }, + { + "epoch": 0.8581040282929354, + "grad_norm": 0.59375, + "learning_rate": 1.2285063317472293e-05, + "loss": 1.3391, + "step": 4974 + }, + { + "epoch": 0.8582765461916674, + "grad_norm": 0.6171875, + "learning_rate": 1.2282413368247015e-05, + "loss": 1.3832, + "step": 4975 + }, + { + "epoch": 0.8584490640903993, + "grad_norm": 0.7578125, + "learning_rate": 1.2279763249926293e-05, + "loss": 1.5026, + "step": 4976 + }, + { + "epoch": 0.8586215819891314, + "grad_norm": 0.875, + "learning_rate": 1.2277112962706463e-05, + "loss": 1.4334, + "step": 4977 + }, + { + "epoch": 0.8587940998878634, + "grad_norm": 0.7421875, + "learning_rate": 1.2274462506783877e-05, + "loss": 1.515, + "step": 4978 + }, + { + "epoch": 0.8589666177865953, + "grad_norm": 0.703125, + "learning_rate": 1.22718118823549e-05, + "loss": 1.4328, + "step": 4979 + }, + { + "epoch": 0.8591391356853274, + "grad_norm": 0.578125, + "learning_rate": 1.2269161089615902e-05, + "loss": 1.442, + "step": 4980 + }, + { + "epoch": 0.8593116535840594, + "grad_norm": 0.58984375, + "learning_rate": 1.2266510128763271e-05, + "loss": 1.4677, + "step": 4981 + }, + { + "epoch": 0.8594841714827913, + "grad_norm": 0.68359375, + "learning_rate": 1.226385899999341e-05, + "loss": 1.3962, + "step": 4982 + }, + { + "epoch": 0.8596566893815233, + "grad_norm": 0.68359375, + "learning_rate": 1.2261207703502731e-05, + "loss": 1.4035, + "step": 4983 + }, + { + "epoch": 0.8598292072802554, + "grad_norm": 0.62890625, + "learning_rate": 1.2258556239487654e-05, + "loss": 1.4359, + "step": 4984 + }, + { + "epoch": 0.8600017251789873, + "grad_norm": 1.0234375, + "learning_rate": 1.2255904608144618e-05, + "loss": 1.3858, + "step": 4985 + }, + { + "epoch": 0.8601742430777193, + "grad_norm": 0.6015625, + "learning_rate": 1.2253252809670074e-05, + "loss": 1.4108, + "step": 4986 + }, + { + "epoch": 0.8603467609764514, + "grad_norm": 0.58203125, + "learning_rate": 1.2250600844260482e-05, + "loss": 1.4605, + "step": 4987 + }, + { + "epoch": 0.8605192788751833, + "grad_norm": 0.58203125, + "learning_rate": 1.2247948712112318e-05, + "loss": 1.419, + "step": 4988 + }, + { + "epoch": 0.8606917967739153, + "grad_norm": 0.6171875, + "learning_rate": 1.2245296413422067e-05, + "loss": 1.3961, + "step": 4989 + }, + { + "epoch": 0.8608643146726472, + "grad_norm": 0.609375, + "learning_rate": 1.2242643948386231e-05, + "loss": 1.494, + "step": 4990 + }, + { + "epoch": 0.8610368325713793, + "grad_norm": 0.609375, + "learning_rate": 1.2239991317201316e-05, + "loss": 1.4661, + "step": 4991 + }, + { + "epoch": 0.8612093504701113, + "grad_norm": 0.609375, + "learning_rate": 1.2237338520063848e-05, + "loss": 1.4005, + "step": 4992 + }, + { + "epoch": 0.8613818683688432, + "grad_norm": 0.59765625, + "learning_rate": 1.2234685557170365e-05, + "loss": 1.491, + "step": 4993 + }, + { + "epoch": 0.8615543862675753, + "grad_norm": 0.6328125, + "learning_rate": 1.2232032428717408e-05, + "loss": 1.3907, + "step": 4994 + }, + { + "epoch": 0.8617269041663073, + "grad_norm": 0.6171875, + "learning_rate": 1.2229379134901546e-05, + "loss": 1.3841, + "step": 4995 + }, + { + "epoch": 0.8618994220650392, + "grad_norm": 0.6015625, + "learning_rate": 1.2226725675919349e-05, + "loss": 1.52, + "step": 4996 + }, + { + "epoch": 0.8620719399637713, + "grad_norm": 0.70703125, + "learning_rate": 1.2224072051967398e-05, + "loss": 1.3878, + "step": 4997 + }, + { + "epoch": 0.8622444578625033, + "grad_norm": 0.57421875, + "learning_rate": 1.2221418263242292e-05, + "loss": 1.4099, + "step": 4998 + }, + { + "epoch": 0.8624169757612352, + "grad_norm": 0.6484375, + "learning_rate": 1.2218764309940647e-05, + "loss": 1.4386, + "step": 4999 + }, + { + "epoch": 0.8625894936599672, + "grad_norm": 0.70703125, + "learning_rate": 1.2216110192259075e-05, + "loss": 1.4912, + "step": 5000 + }, + { + "epoch": 0.8625894936599672, + "eval_loss": 1.414391040802002, + "eval_runtime": 10.8891, + "eval_samples_per_second": 94.039, + "eval_steps_per_second": 23.51, + "step": 5000 + }, + { + "epoch": 0.8627620115586992, + "grad_norm": 0.61328125, + "learning_rate": 1.221345591039421e-05, + "loss": 1.4483, + "step": 5001 + }, + { + "epoch": 0.8629345294574312, + "grad_norm": 0.63671875, + "learning_rate": 1.2210801464542704e-05, + "loss": 1.4031, + "step": 5002 + }, + { + "epoch": 0.8631070473561632, + "grad_norm": 0.6015625, + "learning_rate": 1.2208146854901213e-05, + "loss": 1.5115, + "step": 5003 + }, + { + "epoch": 0.8632795652548952, + "grad_norm": 0.6328125, + "learning_rate": 1.2205492081666404e-05, + "loss": 1.4754, + "step": 5004 + }, + { + "epoch": 0.8634520831536272, + "grad_norm": 0.69921875, + "learning_rate": 1.220283714503496e-05, + "loss": 1.4836, + "step": 5005 + }, + { + "epoch": 0.8636246010523592, + "grad_norm": 0.6015625, + "learning_rate": 1.2200182045203576e-05, + "loss": 1.4632, + "step": 5006 + }, + { + "epoch": 0.8637971189510911, + "grad_norm": 0.625, + "learning_rate": 1.2197526782368962e-05, + "loss": 1.3843, + "step": 5007 + }, + { + "epoch": 0.8639696368498232, + "grad_norm": 0.7109375, + "learning_rate": 1.219487135672783e-05, + "loss": 1.4724, + "step": 5008 + }, + { + "epoch": 0.8641421547485552, + "grad_norm": 0.70703125, + "learning_rate": 1.2192215768476915e-05, + "loss": 1.4989, + "step": 5009 + }, + { + "epoch": 0.8643146726472871, + "grad_norm": 0.59375, + "learning_rate": 1.2189560017812955e-05, + "loss": 1.4918, + "step": 5010 + }, + { + "epoch": 0.8644871905460192, + "grad_norm": 1.0, + "learning_rate": 1.2186904104932716e-05, + "loss": 1.3991, + "step": 5011 + }, + { + "epoch": 0.8646597084447512, + "grad_norm": 0.6640625, + "learning_rate": 1.218424803003295e-05, + "loss": 1.4472, + "step": 5012 + }, + { + "epoch": 0.8648322263434831, + "grad_norm": 0.59375, + "learning_rate": 1.2181591793310444e-05, + "loss": 1.2846, + "step": 5013 + }, + { + "epoch": 0.8650047442422152, + "grad_norm": 0.609375, + "learning_rate": 1.2178935394961985e-05, + "loss": 1.4859, + "step": 5014 + }, + { + "epoch": 0.8651772621409471, + "grad_norm": 0.65625, + "learning_rate": 1.2176278835184381e-05, + "loss": 1.4976, + "step": 5015 + }, + { + "epoch": 0.8653497800396791, + "grad_norm": 1.5703125, + "learning_rate": 1.2173622114174439e-05, + "loss": 1.3716, + "step": 5016 + }, + { + "epoch": 0.8655222979384111, + "grad_norm": 0.68359375, + "learning_rate": 1.2170965232128991e-05, + "loss": 1.4576, + "step": 5017 + }, + { + "epoch": 0.8656948158371431, + "grad_norm": 0.62109375, + "learning_rate": 1.2168308189244872e-05, + "loss": 1.4493, + "step": 5018 + }, + { + "epoch": 0.8658673337358751, + "grad_norm": 0.58984375, + "learning_rate": 1.216565098571894e-05, + "loss": 1.5467, + "step": 5019 + }, + { + "epoch": 0.8660398516346071, + "grad_norm": 0.62109375, + "learning_rate": 1.2162993621748048e-05, + "loss": 1.4384, + "step": 5020 + }, + { + "epoch": 0.8662123695333391, + "grad_norm": 1.046875, + "learning_rate": 1.216033609752907e-05, + "loss": 1.4942, + "step": 5021 + }, + { + "epoch": 0.8663848874320711, + "grad_norm": 0.64453125, + "learning_rate": 1.2157678413258898e-05, + "loss": 1.5091, + "step": 5022 + }, + { + "epoch": 0.866557405330803, + "grad_norm": 0.6015625, + "learning_rate": 1.2155020569134428e-05, + "loss": 1.4179, + "step": 5023 + }, + { + "epoch": 0.866729923229535, + "grad_norm": 0.6328125, + "learning_rate": 1.2152362565352571e-05, + "loss": 1.4006, + "step": 5024 + }, + { + "epoch": 0.8669024411282671, + "grad_norm": 0.6328125, + "learning_rate": 1.2149704402110243e-05, + "loss": 1.431, + "step": 5025 + }, + { + "epoch": 0.867074959026999, + "grad_norm": 0.66796875, + "learning_rate": 1.2147046079604378e-05, + "loss": 1.517, + "step": 5026 + }, + { + "epoch": 0.867247476925731, + "grad_norm": 1.1796875, + "learning_rate": 1.2144387598031933e-05, + "loss": 1.5725, + "step": 5027 + }, + { + "epoch": 0.8674199948244631, + "grad_norm": 0.75390625, + "learning_rate": 1.2141728957589848e-05, + "loss": 1.4621, + "step": 5028 + }, + { + "epoch": 0.867592512723195, + "grad_norm": 0.6015625, + "learning_rate": 1.2139070158475104e-05, + "loss": 1.4756, + "step": 5029 + }, + { + "epoch": 0.867765030621927, + "grad_norm": 0.578125, + "learning_rate": 1.2136411200884676e-05, + "loss": 1.4426, + "step": 5030 + }, + { + "epoch": 0.867937548520659, + "grad_norm": 0.625, + "learning_rate": 1.2133752085015558e-05, + "loss": 1.4562, + "step": 5031 + }, + { + "epoch": 0.868110066419391, + "grad_norm": 0.75390625, + "learning_rate": 1.2131092811064753e-05, + "loss": 1.4278, + "step": 5032 + }, + { + "epoch": 0.868282584318123, + "grad_norm": 0.6875, + "learning_rate": 1.2128433379229276e-05, + "loss": 1.5037, + "step": 5033 + }, + { + "epoch": 0.868455102216855, + "grad_norm": 0.57421875, + "learning_rate": 1.2125773789706155e-05, + "loss": 1.4705, + "step": 5034 + }, + { + "epoch": 0.868627620115587, + "grad_norm": 0.73046875, + "learning_rate": 1.2123114042692432e-05, + "loss": 1.4946, + "step": 5035 + }, + { + "epoch": 0.868800138014319, + "grad_norm": 0.6796875, + "learning_rate": 1.212045413838515e-05, + "loss": 1.4562, + "step": 5036 + }, + { + "epoch": 0.868972655913051, + "grad_norm": 0.65234375, + "learning_rate": 1.2117794076981381e-05, + "loss": 1.4539, + "step": 5037 + }, + { + "epoch": 0.869145173811783, + "grad_norm": 0.62109375, + "learning_rate": 1.2115133858678192e-05, + "loss": 1.4397, + "step": 5038 + }, + { + "epoch": 0.869317691710515, + "grad_norm": 0.68359375, + "learning_rate": 1.2112473483672671e-05, + "loss": 1.3853, + "step": 5039 + }, + { + "epoch": 0.869490209609247, + "grad_norm": 0.66796875, + "learning_rate": 1.2109812952161916e-05, + "loss": 1.5076, + "step": 5040 + }, + { + "epoch": 0.8696627275079789, + "grad_norm": 0.58203125, + "learning_rate": 1.2107152264343033e-05, + "loss": 1.4681, + "step": 5041 + }, + { + "epoch": 0.869835245406711, + "grad_norm": 0.58203125, + "learning_rate": 1.2104491420413143e-05, + "loss": 1.4733, + "step": 5042 + }, + { + "epoch": 0.8700077633054429, + "grad_norm": 0.6015625, + "learning_rate": 1.2101830420569382e-05, + "loss": 1.3826, + "step": 5043 + }, + { + "epoch": 0.8701802812041749, + "grad_norm": 0.62890625, + "learning_rate": 1.209916926500889e-05, + "loss": 1.454, + "step": 5044 + }, + { + "epoch": 0.870352799102907, + "grad_norm": 0.66796875, + "learning_rate": 1.2096507953928823e-05, + "loss": 1.4646, + "step": 5045 + }, + { + "epoch": 0.8705253170016389, + "grad_norm": 0.640625, + "learning_rate": 1.2093846487526344e-05, + "loss": 1.5317, + "step": 5046 + }, + { + "epoch": 0.8706978349003709, + "grad_norm": 0.61328125, + "learning_rate": 1.209118486599864e-05, + "loss": 1.4352, + "step": 5047 + }, + { + "epoch": 0.8708703527991029, + "grad_norm": 0.7890625, + "learning_rate": 1.208852308954289e-05, + "loss": 1.4748, + "step": 5048 + }, + { + "epoch": 0.8710428706978349, + "grad_norm": 0.65625, + "learning_rate": 1.20858611583563e-05, + "loss": 1.5263, + "step": 5049 + }, + { + "epoch": 0.8712153885965669, + "grad_norm": 0.6640625, + "learning_rate": 1.2083199072636084e-05, + "loss": 1.4742, + "step": 5050 + }, + { + "epoch": 0.8713879064952988, + "grad_norm": 0.62890625, + "learning_rate": 1.2080536832579466e-05, + "loss": 1.4146, + "step": 5051 + }, + { + "epoch": 0.8715604243940309, + "grad_norm": 0.71484375, + "learning_rate": 1.2077874438383676e-05, + "loss": 1.3773, + "step": 5052 + }, + { + "epoch": 0.8717329422927629, + "grad_norm": 0.58203125, + "learning_rate": 1.2075211890245965e-05, + "loss": 1.4945, + "step": 5053 + }, + { + "epoch": 0.8719054601914948, + "grad_norm": 0.578125, + "learning_rate": 1.2072549188363594e-05, + "loss": 1.5134, + "step": 5054 + }, + { + "epoch": 0.8720779780902269, + "grad_norm": 0.79296875, + "learning_rate": 1.2069886332933824e-05, + "loss": 1.4979, + "step": 5055 + }, + { + "epoch": 0.8722504959889589, + "grad_norm": 0.63671875, + "learning_rate": 1.2067223324153947e-05, + "loss": 1.4189, + "step": 5056 + }, + { + "epoch": 0.8724230138876908, + "grad_norm": 0.65625, + "learning_rate": 1.2064560162221246e-05, + "loss": 1.4177, + "step": 5057 + }, + { + "epoch": 0.8725955317864228, + "grad_norm": 0.625, + "learning_rate": 1.2061896847333025e-05, + "loss": 1.4413, + "step": 5058 + }, + { + "epoch": 0.8727680496851549, + "grad_norm": 0.609375, + "learning_rate": 1.205923337968661e-05, + "loss": 1.4782, + "step": 5059 + }, + { + "epoch": 0.8729405675838868, + "grad_norm": 0.64453125, + "learning_rate": 1.2056569759479315e-05, + "loss": 1.4539, + "step": 5060 + }, + { + "epoch": 0.8731130854826188, + "grad_norm": 0.62890625, + "learning_rate": 1.205390598690848e-05, + "loss": 1.4785, + "step": 5061 + }, + { + "epoch": 0.8732856033813509, + "grad_norm": 0.57421875, + "learning_rate": 1.205124206217146e-05, + "loss": 1.4304, + "step": 5062 + }, + { + "epoch": 0.8734581212800828, + "grad_norm": 0.7109375, + "learning_rate": 1.2048577985465613e-05, + "loss": 1.4521, + "step": 5063 + }, + { + "epoch": 0.8736306391788148, + "grad_norm": 0.58203125, + "learning_rate": 1.2045913756988305e-05, + "loss": 1.4276, + "step": 5064 + }, + { + "epoch": 0.8738031570775467, + "grad_norm": 0.6328125, + "learning_rate": 1.2043249376936923e-05, + "loss": 1.4619, + "step": 5065 + }, + { + "epoch": 0.8739756749762788, + "grad_norm": 0.61328125, + "learning_rate": 1.204058484550886e-05, + "loss": 1.475, + "step": 5066 + }, + { + "epoch": 0.8741481928750108, + "grad_norm": 0.66015625, + "learning_rate": 1.2037920162901522e-05, + "loss": 1.4353, + "step": 5067 + }, + { + "epoch": 0.8743207107737427, + "grad_norm": 0.640625, + "learning_rate": 1.2035255329312325e-05, + "loss": 1.5358, + "step": 5068 + }, + { + "epoch": 0.8744932286724748, + "grad_norm": 0.61328125, + "learning_rate": 1.2032590344938697e-05, + "loss": 1.4719, + "step": 5069 + }, + { + "epoch": 0.8746657465712068, + "grad_norm": 0.734375, + "learning_rate": 1.2029925209978075e-05, + "loss": 1.4687, + "step": 5070 + }, + { + "epoch": 0.8748382644699387, + "grad_norm": 0.62109375, + "learning_rate": 1.202725992462791e-05, + "loss": 1.4727, + "step": 5071 + }, + { + "epoch": 0.8750107823686708, + "grad_norm": 0.68359375, + "learning_rate": 1.2024594489085665e-05, + "loss": 1.4396, + "step": 5072 + }, + { + "epoch": 0.8751833002674028, + "grad_norm": 0.7578125, + "learning_rate": 1.2021928903548807e-05, + "loss": 1.3646, + "step": 5073 + }, + { + "epoch": 0.8753558181661347, + "grad_norm": 0.65625, + "learning_rate": 1.2019263168214822e-05, + "loss": 1.5268, + "step": 5074 + }, + { + "epoch": 0.8755283360648667, + "grad_norm": 0.5625, + "learning_rate": 1.2016597283281209e-05, + "loss": 1.4303, + "step": 5075 + }, + { + "epoch": 0.8757008539635988, + "grad_norm": 0.63671875, + "learning_rate": 1.2013931248945463e-05, + "loss": 1.5223, + "step": 5076 + }, + { + "epoch": 0.8758733718623307, + "grad_norm": 0.671875, + "learning_rate": 1.2011265065405108e-05, + "loss": 1.5449, + "step": 5077 + }, + { + "epoch": 0.8760458897610627, + "grad_norm": 0.86328125, + "learning_rate": 1.2008598732857673e-05, + "loss": 1.4209, + "step": 5078 + }, + { + "epoch": 0.8762184076597948, + "grad_norm": 0.58984375, + "learning_rate": 1.2005932251500693e-05, + "loss": 1.4707, + "step": 5079 + }, + { + "epoch": 0.8763909255585267, + "grad_norm": 0.640625, + "learning_rate": 1.2003265621531716e-05, + "loss": 1.4294, + "step": 5080 + }, + { + "epoch": 0.8765634434572587, + "grad_norm": 0.60546875, + "learning_rate": 1.2000598843148306e-05, + "loss": 1.404, + "step": 5081 + }, + { + "epoch": 0.8767359613559906, + "grad_norm": 0.6328125, + "learning_rate": 1.1997931916548034e-05, + "loss": 1.4738, + "step": 5082 + }, + { + "epoch": 0.8769084792547227, + "grad_norm": 0.61328125, + "learning_rate": 1.199526484192848e-05, + "loss": 1.4521, + "step": 5083 + }, + { + "epoch": 0.8770809971534547, + "grad_norm": 0.5703125, + "learning_rate": 1.1992597619487242e-05, + "loss": 1.4946, + "step": 5084 + }, + { + "epoch": 0.8772535150521866, + "grad_norm": 0.59765625, + "learning_rate": 1.198993024942192e-05, + "loss": 1.3803, + "step": 5085 + }, + { + "epoch": 0.8774260329509187, + "grad_norm": 0.6015625, + "learning_rate": 1.1987262731930132e-05, + "loss": 1.4693, + "step": 5086 + }, + { + "epoch": 0.8775985508496507, + "grad_norm": 0.6015625, + "learning_rate": 1.1984595067209503e-05, + "loss": 1.3118, + "step": 5087 + }, + { + "epoch": 0.8777710687483826, + "grad_norm": 0.66015625, + "learning_rate": 1.1981927255457675e-05, + "loss": 1.4328, + "step": 5088 + }, + { + "epoch": 0.8779435866471147, + "grad_norm": 0.609375, + "learning_rate": 1.1979259296872287e-05, + "loss": 1.3952, + "step": 5089 + }, + { + "epoch": 0.8781161045458467, + "grad_norm": 0.69140625, + "learning_rate": 1.1976591191651003e-05, + "loss": 1.3956, + "step": 5090 + }, + { + "epoch": 0.8782886224445786, + "grad_norm": 0.7578125, + "learning_rate": 1.19739229399915e-05, + "loss": 1.3836, + "step": 5091 + }, + { + "epoch": 0.8784611403433106, + "grad_norm": 0.62109375, + "learning_rate": 1.1971254542091446e-05, + "loss": 1.411, + "step": 5092 + }, + { + "epoch": 0.8786336582420426, + "grad_norm": 0.5390625, + "learning_rate": 1.1968585998148541e-05, + "loss": 1.3845, + "step": 5093 + }, + { + "epoch": 0.8788061761407746, + "grad_norm": 1.0859375, + "learning_rate": 1.1965917308360484e-05, + "loss": 1.3995, + "step": 5094 + }, + { + "epoch": 0.8789786940395066, + "grad_norm": 0.625, + "learning_rate": 1.196324847292499e-05, + "loss": 1.426, + "step": 5095 + }, + { + "epoch": 0.8791512119382386, + "grad_norm": 0.66015625, + "learning_rate": 1.1960579492039783e-05, + "loss": 1.5251, + "step": 5096 + }, + { + "epoch": 0.8793237298369706, + "grad_norm": 0.62109375, + "learning_rate": 1.1957910365902594e-05, + "loss": 1.4423, + "step": 5097 + }, + { + "epoch": 0.8794962477357026, + "grad_norm": 0.6328125, + "learning_rate": 1.1955241094711174e-05, + "loss": 1.4563, + "step": 5098 + }, + { + "epoch": 0.8796687656344345, + "grad_norm": 0.59375, + "learning_rate": 1.1952571678663274e-05, + "loss": 1.3736, + "step": 5099 + }, + { + "epoch": 0.8798412835331666, + "grad_norm": 0.625, + "learning_rate": 1.194990211795667e-05, + "loss": 1.5295, + "step": 5100 + }, + { + "epoch": 0.8798412835331666, + "eval_loss": 1.4137705564498901, + "eval_runtime": 10.9003, + "eval_samples_per_second": 93.943, + "eval_steps_per_second": 23.486, + "step": 5100 + }, + { + "epoch": 0.8800138014318986, + "grad_norm": 0.59375, + "learning_rate": 1.1947232412789127e-05, + "loss": 1.4976, + "step": 5101 + }, + { + "epoch": 0.8801863193306305, + "grad_norm": 0.67578125, + "learning_rate": 1.1944562563358442e-05, + "loss": 1.3986, + "step": 5102 + }, + { + "epoch": 0.8803588372293626, + "grad_norm": 0.62109375, + "learning_rate": 1.1941892569862413e-05, + "loss": 1.4513, + "step": 5103 + }, + { + "epoch": 0.8805313551280946, + "grad_norm": 0.8984375, + "learning_rate": 1.1939222432498849e-05, + "loss": 1.4324, + "step": 5104 + }, + { + "epoch": 0.8807038730268265, + "grad_norm": 0.82421875, + "learning_rate": 1.193655215146557e-05, + "loss": 1.4404, + "step": 5105 + }, + { + "epoch": 0.8808763909255585, + "grad_norm": 0.78515625, + "learning_rate": 1.1933881726960403e-05, + "loss": 1.4431, + "step": 5106 + }, + { + "epoch": 0.8810489088242905, + "grad_norm": 0.73828125, + "learning_rate": 1.1931211159181201e-05, + "loss": 1.4725, + "step": 5107 + }, + { + "epoch": 0.8812214267230225, + "grad_norm": 0.71484375, + "learning_rate": 1.1928540448325807e-05, + "loss": 1.4504, + "step": 5108 + }, + { + "epoch": 0.8813939446217545, + "grad_norm": 0.63671875, + "learning_rate": 1.1925869594592086e-05, + "loss": 1.3811, + "step": 5109 + }, + { + "epoch": 0.8815664625204865, + "grad_norm": 0.61328125, + "learning_rate": 1.1923198598177912e-05, + "loss": 1.4481, + "step": 5110 + }, + { + "epoch": 0.8817389804192185, + "grad_norm": 0.94921875, + "learning_rate": 1.192052745928117e-05, + "loss": 1.4472, + "step": 5111 + }, + { + "epoch": 0.8819114983179505, + "grad_norm": 0.62109375, + "learning_rate": 1.1917856178099756e-05, + "loss": 1.4421, + "step": 5112 + }, + { + "epoch": 0.8820840162166825, + "grad_norm": 0.65234375, + "learning_rate": 1.191518475483157e-05, + "loss": 1.4525, + "step": 5113 + }, + { + "epoch": 0.8822565341154145, + "grad_norm": 0.73046875, + "learning_rate": 1.191251318967453e-05, + "loss": 1.4304, + "step": 5114 + }, + { + "epoch": 0.8824290520141465, + "grad_norm": 0.62109375, + "learning_rate": 1.1909841482826564e-05, + "loss": 1.4208, + "step": 5115 + }, + { + "epoch": 0.8826015699128784, + "grad_norm": 0.640625, + "learning_rate": 1.190716963448561e-05, + "loss": 1.5275, + "step": 5116 + }, + { + "epoch": 0.8827740878116105, + "grad_norm": 0.6171875, + "learning_rate": 1.1904497644849611e-05, + "loss": 1.4372, + "step": 5117 + }, + { + "epoch": 0.8829466057103424, + "grad_norm": 0.66015625, + "learning_rate": 1.1901825514116526e-05, + "loss": 1.4102, + "step": 5118 + }, + { + "epoch": 0.8831191236090744, + "grad_norm": 0.62109375, + "learning_rate": 1.1899153242484322e-05, + "loss": 1.4247, + "step": 5119 + }, + { + "epoch": 0.8832916415078065, + "grad_norm": 0.71484375, + "learning_rate": 1.1896480830150985e-05, + "loss": 1.4263, + "step": 5120 + }, + { + "epoch": 0.8834641594065384, + "grad_norm": 0.63671875, + "learning_rate": 1.1893808277314494e-05, + "loss": 1.3491, + "step": 5121 + }, + { + "epoch": 0.8836366773052704, + "grad_norm": 0.61328125, + "learning_rate": 1.189113558417285e-05, + "loss": 1.4428, + "step": 5122 + }, + { + "epoch": 0.8838091952040024, + "grad_norm": 0.65625, + "learning_rate": 1.1888462750924072e-05, + "loss": 1.4553, + "step": 5123 + }, + { + "epoch": 0.8839817131027344, + "grad_norm": 0.6484375, + "learning_rate": 1.188578977776617e-05, + "loss": 1.4678, + "step": 5124 + }, + { + "epoch": 0.8841542310014664, + "grad_norm": 0.60546875, + "learning_rate": 1.1883116664897179e-05, + "loss": 1.3806, + "step": 5125 + }, + { + "epoch": 0.8843267489001984, + "grad_norm": 0.66796875, + "learning_rate": 1.1880443412515138e-05, + "loss": 1.4011, + "step": 5126 + }, + { + "epoch": 0.8844992667989304, + "grad_norm": 0.62109375, + "learning_rate": 1.1877770020818103e-05, + "loss": 1.4743, + "step": 5127 + }, + { + "epoch": 0.8846717846976624, + "grad_norm": 0.56640625, + "learning_rate": 1.187509649000413e-05, + "loss": 1.4523, + "step": 5128 + }, + { + "epoch": 0.8848443025963943, + "grad_norm": 0.8515625, + "learning_rate": 1.1872422820271294e-05, + "loss": 1.4416, + "step": 5129 + }, + { + "epoch": 0.8850168204951264, + "grad_norm": 0.63671875, + "learning_rate": 1.1869749011817675e-05, + "loss": 1.4134, + "step": 5130 + }, + { + "epoch": 0.8851893383938584, + "grad_norm": 0.66796875, + "learning_rate": 1.1867075064841365e-05, + "loss": 1.3652, + "step": 5131 + }, + { + "epoch": 0.8853618562925903, + "grad_norm": 3.609375, + "learning_rate": 1.1864400979540472e-05, + "loss": 1.4375, + "step": 5132 + }, + { + "epoch": 0.8855343741913223, + "grad_norm": 0.6171875, + "learning_rate": 1.1861726756113101e-05, + "loss": 1.4964, + "step": 5133 + }, + { + "epoch": 0.8857068920900544, + "grad_norm": 0.60546875, + "learning_rate": 1.185905239475738e-05, + "loss": 1.4896, + "step": 5134 + }, + { + "epoch": 0.8858794099887863, + "grad_norm": 0.59375, + "learning_rate": 1.1856377895671442e-05, + "loss": 1.4705, + "step": 5135 + }, + { + "epoch": 0.8860519278875183, + "grad_norm": 0.578125, + "learning_rate": 1.1853703259053436e-05, + "loss": 1.4255, + "step": 5136 + }, + { + "epoch": 0.8862244457862504, + "grad_norm": 0.59375, + "learning_rate": 1.1851028485101504e-05, + "loss": 1.4481, + "step": 5137 + }, + { + "epoch": 0.8863969636849823, + "grad_norm": 0.67578125, + "learning_rate": 1.1848353574013813e-05, + "loss": 1.4347, + "step": 5138 + }, + { + "epoch": 0.8865694815837143, + "grad_norm": 0.796875, + "learning_rate": 1.1845678525988547e-05, + "loss": 1.4899, + "step": 5139 + }, + { + "epoch": 0.8867419994824463, + "grad_norm": 0.5859375, + "learning_rate": 1.184300334122388e-05, + "loss": 1.5167, + "step": 5140 + }, + { + "epoch": 0.8869145173811783, + "grad_norm": 0.59765625, + "learning_rate": 1.1840328019918011e-05, + "loss": 1.3896, + "step": 5141 + }, + { + "epoch": 0.8870870352799103, + "grad_norm": 0.609375, + "learning_rate": 1.1837652562269141e-05, + "loss": 1.4038, + "step": 5142 + }, + { + "epoch": 0.8872595531786422, + "grad_norm": 0.75, + "learning_rate": 1.1834976968475488e-05, + "loss": 1.4514, + "step": 5143 + }, + { + "epoch": 0.8874320710773743, + "grad_norm": 0.59375, + "learning_rate": 1.183230123873528e-05, + "loss": 1.4866, + "step": 5144 + }, + { + "epoch": 0.8876045889761063, + "grad_norm": 0.65625, + "learning_rate": 1.1829625373246745e-05, + "loss": 1.4061, + "step": 5145 + }, + { + "epoch": 0.8877771068748382, + "grad_norm": 0.59765625, + "learning_rate": 1.1826949372208128e-05, + "loss": 1.4768, + "step": 5146 + }, + { + "epoch": 0.8879496247735703, + "grad_norm": 0.671875, + "learning_rate": 1.1824273235817687e-05, + "loss": 1.3845, + "step": 5147 + }, + { + "epoch": 0.8881221426723023, + "grad_norm": 0.66796875, + "learning_rate": 1.1821596964273689e-05, + "loss": 1.6053, + "step": 5148 + }, + { + "epoch": 0.8882946605710342, + "grad_norm": 0.58203125, + "learning_rate": 1.1818920557774402e-05, + "loss": 1.44, + "step": 5149 + }, + { + "epoch": 0.8884671784697662, + "grad_norm": 0.6015625, + "learning_rate": 1.1816244016518117e-05, + "loss": 1.4575, + "step": 5150 + }, + { + "epoch": 0.8886396963684983, + "grad_norm": 0.828125, + "learning_rate": 1.1813567340703128e-05, + "loss": 1.4115, + "step": 5151 + }, + { + "epoch": 0.8888122142672302, + "grad_norm": 0.69140625, + "learning_rate": 1.1810890530527738e-05, + "loss": 1.4147, + "step": 5152 + }, + { + "epoch": 0.8889847321659622, + "grad_norm": 0.6484375, + "learning_rate": 1.180821358619026e-05, + "loss": 1.3576, + "step": 5153 + }, + { + "epoch": 0.8891572500646943, + "grad_norm": 0.60546875, + "learning_rate": 1.1805536507889021e-05, + "loss": 1.4656, + "step": 5154 + }, + { + "epoch": 0.8893297679634262, + "grad_norm": 0.66796875, + "learning_rate": 1.1802859295822358e-05, + "loss": 1.4959, + "step": 5155 + }, + { + "epoch": 0.8895022858621582, + "grad_norm": 0.69921875, + "learning_rate": 1.1800181950188617e-05, + "loss": 1.4124, + "step": 5156 + }, + { + "epoch": 0.8896748037608901, + "grad_norm": 0.61328125, + "learning_rate": 1.1797504471186146e-05, + "loss": 1.4673, + "step": 5157 + }, + { + "epoch": 0.8898473216596222, + "grad_norm": 0.6171875, + "learning_rate": 1.179482685901331e-05, + "loss": 1.4117, + "step": 5158 + }, + { + "epoch": 0.8900198395583542, + "grad_norm": 0.59765625, + "learning_rate": 1.1792149113868488e-05, + "loss": 1.5153, + "step": 5159 + }, + { + "epoch": 0.8901923574570861, + "grad_norm": 0.62109375, + "learning_rate": 1.1789471235950062e-05, + "loss": 1.4547, + "step": 5160 + }, + { + "epoch": 0.8903648753558182, + "grad_norm": 0.57421875, + "learning_rate": 1.1786793225456428e-05, + "loss": 1.4885, + "step": 5161 + }, + { + "epoch": 0.8905373932545502, + "grad_norm": 0.6015625, + "learning_rate": 1.1784115082585982e-05, + "loss": 1.4485, + "step": 5162 + }, + { + "epoch": 0.8907099111532821, + "grad_norm": 0.57421875, + "learning_rate": 1.1781436807537148e-05, + "loss": 1.4793, + "step": 5163 + }, + { + "epoch": 0.8908824290520142, + "grad_norm": 0.7265625, + "learning_rate": 1.1778758400508343e-05, + "loss": 1.485, + "step": 5164 + }, + { + "epoch": 0.8910549469507462, + "grad_norm": 0.57421875, + "learning_rate": 1.1776079861698e-05, + "loss": 1.4215, + "step": 5165 + }, + { + "epoch": 0.8912274648494781, + "grad_norm": 0.65625, + "learning_rate": 1.1773401191304564e-05, + "loss": 1.4582, + "step": 5166 + }, + { + "epoch": 0.8913999827482101, + "grad_norm": 0.61328125, + "learning_rate": 1.1770722389526487e-05, + "loss": 1.4509, + "step": 5167 + }, + { + "epoch": 0.8915725006469422, + "grad_norm": 0.59375, + "learning_rate": 1.1768043456562233e-05, + "loss": 1.5415, + "step": 5168 + }, + { + "epoch": 0.8917450185456741, + "grad_norm": 0.67578125, + "learning_rate": 1.176536439261027e-05, + "loss": 1.4549, + "step": 5169 + }, + { + "epoch": 0.8919175364444061, + "grad_norm": 0.640625, + "learning_rate": 1.176268519786908e-05, + "loss": 1.4992, + "step": 5170 + }, + { + "epoch": 0.8920900543431382, + "grad_norm": 0.62109375, + "learning_rate": 1.1760005872537161e-05, + "loss": 1.4561, + "step": 5171 + }, + { + "epoch": 0.8922625722418701, + "grad_norm": 0.6328125, + "learning_rate": 1.175732641681301e-05, + "loss": 1.4543, + "step": 5172 + }, + { + "epoch": 0.8924350901406021, + "grad_norm": 0.6484375, + "learning_rate": 1.1754646830895138e-05, + "loss": 1.4101, + "step": 5173 + }, + { + "epoch": 0.892607608039334, + "grad_norm": 0.67578125, + "learning_rate": 1.1751967114982063e-05, + "loss": 1.4677, + "step": 5174 + }, + { + "epoch": 0.8927801259380661, + "grad_norm": 0.609375, + "learning_rate": 1.1749287269272318e-05, + "loss": 1.4727, + "step": 5175 + }, + { + "epoch": 0.8929526438367981, + "grad_norm": 0.57421875, + "learning_rate": 1.1746607293964446e-05, + "loss": 1.4093, + "step": 5176 + }, + { + "epoch": 0.89312516173553, + "grad_norm": 0.66796875, + "learning_rate": 1.1743927189256988e-05, + "loss": 1.569, + "step": 5177 + }, + { + "epoch": 0.8932976796342621, + "grad_norm": 0.6171875, + "learning_rate": 1.1741246955348506e-05, + "loss": 1.5247, + "step": 5178 + }, + { + "epoch": 0.8934701975329941, + "grad_norm": 0.66015625, + "learning_rate": 1.1738566592437573e-05, + "loss": 1.3986, + "step": 5179 + }, + { + "epoch": 0.893642715431726, + "grad_norm": 0.6328125, + "learning_rate": 1.1735886100722764e-05, + "loss": 1.3666, + "step": 5180 + }, + { + "epoch": 0.893815233330458, + "grad_norm": 0.64453125, + "learning_rate": 1.1733205480402663e-05, + "loss": 1.5267, + "step": 5181 + }, + { + "epoch": 0.89398775122919, + "grad_norm": 0.6796875, + "learning_rate": 1.1730524731675872e-05, + "loss": 1.3956, + "step": 5182 + }, + { + "epoch": 0.894160269127922, + "grad_norm": 0.60546875, + "learning_rate": 1.1727843854740997e-05, + "loss": 1.4123, + "step": 5183 + }, + { + "epoch": 0.894332787026654, + "grad_norm": 0.578125, + "learning_rate": 1.1725162849796653e-05, + "loss": 1.4602, + "step": 5184 + }, + { + "epoch": 0.894505304925386, + "grad_norm": 0.64453125, + "learning_rate": 1.1722481717041467e-05, + "loss": 1.3849, + "step": 5185 + }, + { + "epoch": 0.894677822824118, + "grad_norm": 0.65625, + "learning_rate": 1.171980045667407e-05, + "loss": 1.454, + "step": 5186 + }, + { + "epoch": 0.89485034072285, + "grad_norm": 0.65625, + "learning_rate": 1.1717119068893108e-05, + "loss": 1.4808, + "step": 5187 + }, + { + "epoch": 0.895022858621582, + "grad_norm": 0.6328125, + "learning_rate": 1.171443755389724e-05, + "loss": 1.5093, + "step": 5188 + }, + { + "epoch": 0.895195376520314, + "grad_norm": 0.5859375, + "learning_rate": 1.1711755911885126e-05, + "loss": 1.3697, + "step": 5189 + }, + { + "epoch": 0.895367894419046, + "grad_norm": 0.68359375, + "learning_rate": 1.1709074143055435e-05, + "loss": 1.3519, + "step": 5190 + }, + { + "epoch": 0.8955404123177779, + "grad_norm": 0.609375, + "learning_rate": 1.1706392247606855e-05, + "loss": 1.4844, + "step": 5191 + }, + { + "epoch": 0.89571293021651, + "grad_norm": 0.6796875, + "learning_rate": 1.1703710225738077e-05, + "loss": 1.4182, + "step": 5192 + }, + { + "epoch": 0.895885448115242, + "grad_norm": 0.61328125, + "learning_rate": 1.1701028077647798e-05, + "loss": 1.4381, + "step": 5193 + }, + { + "epoch": 0.8960579660139739, + "grad_norm": 0.60546875, + "learning_rate": 1.169834580353473e-05, + "loss": 1.3898, + "step": 5194 + }, + { + "epoch": 0.896230483912706, + "grad_norm": 0.6796875, + "learning_rate": 1.1695663403597592e-05, + "loss": 1.3194, + "step": 5195 + }, + { + "epoch": 0.896403001811438, + "grad_norm": 0.578125, + "learning_rate": 1.1692980878035119e-05, + "loss": 1.4205, + "step": 5196 + }, + { + "epoch": 0.8965755197101699, + "grad_norm": 0.671875, + "learning_rate": 1.1690298227046041e-05, + "loss": 1.48, + "step": 5197 + }, + { + "epoch": 0.8967480376089019, + "grad_norm": 1.0703125, + "learning_rate": 1.1687615450829108e-05, + "loss": 1.4633, + "step": 5198 + }, + { + "epoch": 0.8969205555076339, + "grad_norm": 0.6875, + "learning_rate": 1.1684932549583079e-05, + "loss": 1.3481, + "step": 5199 + }, + { + "epoch": 0.8970930734063659, + "grad_norm": 0.6640625, + "learning_rate": 1.1682249523506721e-05, + "loss": 1.4053, + "step": 5200 + }, + { + "epoch": 0.8970930734063659, + "eval_loss": 1.41326105594635, + "eval_runtime": 10.8591, + "eval_samples_per_second": 94.299, + "eval_steps_per_second": 23.575, + "step": 5200 + }, + { + "epoch": 0.8972655913050979, + "grad_norm": 0.62890625, + "learning_rate": 1.1679566372798803e-05, + "loss": 1.4101, + "step": 5201 + }, + { + "epoch": 0.8974381092038299, + "grad_norm": 0.59765625, + "learning_rate": 1.1676883097658117e-05, + "loss": 1.43, + "step": 5202 + }, + { + "epoch": 0.8976106271025619, + "grad_norm": 0.6328125, + "learning_rate": 1.1674199698283448e-05, + "loss": 1.3728, + "step": 5203 + }, + { + "epoch": 0.8977831450012939, + "grad_norm": 0.88671875, + "learning_rate": 1.167151617487361e-05, + "loss": 1.4642, + "step": 5204 + }, + { + "epoch": 0.8979556629000259, + "grad_norm": 0.5859375, + "learning_rate": 1.1668832527627407e-05, + "loss": 1.4412, + "step": 5205 + }, + { + "epoch": 0.8981281807987579, + "grad_norm": 0.73046875, + "learning_rate": 1.1666148756743665e-05, + "loss": 1.5616, + "step": 5206 + }, + { + "epoch": 0.8983006986974899, + "grad_norm": 0.59765625, + "learning_rate": 1.166346486242121e-05, + "loss": 1.444, + "step": 5207 + }, + { + "epoch": 0.8984732165962218, + "grad_norm": 0.70703125, + "learning_rate": 1.1660780844858887e-05, + "loss": 1.4412, + "step": 5208 + }, + { + "epoch": 0.8986457344949539, + "grad_norm": 0.63671875, + "learning_rate": 1.1658096704255542e-05, + "loss": 1.3195, + "step": 5209 + }, + { + "epoch": 0.8988182523936858, + "grad_norm": 0.59765625, + "learning_rate": 1.165541244081003e-05, + "loss": 1.371, + "step": 5210 + }, + { + "epoch": 0.8989907702924178, + "grad_norm": 0.59375, + "learning_rate": 1.1652728054721223e-05, + "loss": 1.4934, + "step": 5211 + }, + { + "epoch": 0.8991632881911499, + "grad_norm": 0.6171875, + "learning_rate": 1.1650043546187994e-05, + "loss": 1.412, + "step": 5212 + }, + { + "epoch": 0.8993358060898818, + "grad_norm": 0.70703125, + "learning_rate": 1.1647358915409231e-05, + "loss": 1.4778, + "step": 5213 + }, + { + "epoch": 0.8995083239886138, + "grad_norm": 0.66015625, + "learning_rate": 1.1644674162583825e-05, + "loss": 1.5873, + "step": 5214 + }, + { + "epoch": 0.8996808418873458, + "grad_norm": 0.62890625, + "learning_rate": 1.164198928791068e-05, + "loss": 1.5613, + "step": 5215 + }, + { + "epoch": 0.8998533597860778, + "grad_norm": 0.61328125, + "learning_rate": 1.1639304291588708e-05, + "loss": 1.4511, + "step": 5216 + }, + { + "epoch": 0.9000258776848098, + "grad_norm": 0.6796875, + "learning_rate": 1.1636619173816834e-05, + "loss": 1.406, + "step": 5217 + }, + { + "epoch": 0.9001983955835418, + "grad_norm": 0.69140625, + "learning_rate": 1.1633933934793984e-05, + "loss": 1.5064, + "step": 5218 + }, + { + "epoch": 0.9003709134822738, + "grad_norm": 0.65234375, + "learning_rate": 1.1631248574719098e-05, + "loss": 1.3637, + "step": 5219 + }, + { + "epoch": 0.9005434313810058, + "grad_norm": 0.7734375, + "learning_rate": 1.1628563093791128e-05, + "loss": 1.501, + "step": 5220 + }, + { + "epoch": 0.9007159492797377, + "grad_norm": 0.65234375, + "learning_rate": 1.1625877492209025e-05, + "loss": 1.523, + "step": 5221 + }, + { + "epoch": 0.9008884671784698, + "grad_norm": 0.75, + "learning_rate": 1.1623191770171761e-05, + "loss": 1.4947, + "step": 5222 + }, + { + "epoch": 0.9010609850772018, + "grad_norm": 0.734375, + "learning_rate": 1.1620505927878305e-05, + "loss": 1.5754, + "step": 5223 + }, + { + "epoch": 0.9012335029759337, + "grad_norm": 0.63671875, + "learning_rate": 1.161781996552765e-05, + "loss": 1.3632, + "step": 5224 + }, + { + "epoch": 0.9014060208746657, + "grad_norm": 0.58203125, + "learning_rate": 1.1615133883318778e-05, + "loss": 1.4967, + "step": 5225 + }, + { + "epoch": 0.9015785387733978, + "grad_norm": 1.0078125, + "learning_rate": 1.1612447681450697e-05, + "loss": 1.3705, + "step": 5226 + }, + { + "epoch": 0.9017510566721297, + "grad_norm": 0.6328125, + "learning_rate": 1.160976136012242e-05, + "loss": 1.4213, + "step": 5227 + }, + { + "epoch": 0.9019235745708617, + "grad_norm": 1.015625, + "learning_rate": 1.1607074919532964e-05, + "loss": 1.487, + "step": 5228 + }, + { + "epoch": 0.9020960924695938, + "grad_norm": 0.58203125, + "learning_rate": 1.160438835988135e-05, + "loss": 1.4603, + "step": 5229 + }, + { + "epoch": 0.9022686103683257, + "grad_norm": 0.83984375, + "learning_rate": 1.1601701681366625e-05, + "loss": 1.4169, + "step": 5230 + }, + { + "epoch": 0.9024411282670577, + "grad_norm": 0.69140625, + "learning_rate": 1.1599014884187834e-05, + "loss": 1.4951, + "step": 5231 + }, + { + "epoch": 0.9026136461657897, + "grad_norm": 0.61328125, + "learning_rate": 1.1596327968544025e-05, + "loss": 1.4138, + "step": 5232 + }, + { + "epoch": 0.9027861640645217, + "grad_norm": 0.73828125, + "learning_rate": 1.1593640934634272e-05, + "loss": 1.4008, + "step": 5233 + }, + { + "epoch": 0.9029586819632537, + "grad_norm": 0.61328125, + "learning_rate": 1.1590953782657635e-05, + "loss": 1.5109, + "step": 5234 + }, + { + "epoch": 0.9031311998619856, + "grad_norm": 0.64453125, + "learning_rate": 1.15882665128132e-05, + "loss": 1.4836, + "step": 5235 + }, + { + "epoch": 0.9033037177607177, + "grad_norm": 0.76953125, + "learning_rate": 1.1585579125300063e-05, + "loss": 1.4583, + "step": 5236 + }, + { + "epoch": 0.9034762356594497, + "grad_norm": 0.75, + "learning_rate": 1.1582891620317316e-05, + "loss": 1.3559, + "step": 5237 + }, + { + "epoch": 0.9036487535581816, + "grad_norm": 0.84375, + "learning_rate": 1.1580203998064066e-05, + "loss": 1.51, + "step": 5238 + }, + { + "epoch": 0.9038212714569136, + "grad_norm": 0.609375, + "learning_rate": 1.157751625873943e-05, + "loss": 1.4308, + "step": 5239 + }, + { + "epoch": 0.9039937893556457, + "grad_norm": 0.64453125, + "learning_rate": 1.1574828402542535e-05, + "loss": 1.4897, + "step": 5240 + }, + { + "epoch": 0.9041663072543776, + "grad_norm": 0.8203125, + "learning_rate": 1.157214042967251e-05, + "loss": 1.4364, + "step": 5241 + }, + { + "epoch": 0.9043388251531096, + "grad_norm": 0.71484375, + "learning_rate": 1.1569452340328497e-05, + "loss": 1.4573, + "step": 5242 + }, + { + "epoch": 0.9045113430518417, + "grad_norm": 0.76171875, + "learning_rate": 1.1566764134709652e-05, + "loss": 1.4314, + "step": 5243 + }, + { + "epoch": 0.9046838609505736, + "grad_norm": 0.59765625, + "learning_rate": 1.156407581301513e-05, + "loss": 1.532, + "step": 5244 + }, + { + "epoch": 0.9048563788493056, + "grad_norm": 0.58984375, + "learning_rate": 1.1561387375444098e-05, + "loss": 1.4244, + "step": 5245 + }, + { + "epoch": 0.9050288967480377, + "grad_norm": 0.69140625, + "learning_rate": 1.155869882219573e-05, + "loss": 1.4332, + "step": 5246 + }, + { + "epoch": 0.9052014146467696, + "grad_norm": 0.671875, + "learning_rate": 1.1556010153469219e-05, + "loss": 1.3592, + "step": 5247 + }, + { + "epoch": 0.9053739325455016, + "grad_norm": 0.62890625, + "learning_rate": 1.155332136946375e-05, + "loss": 1.4865, + "step": 5248 + }, + { + "epoch": 0.9055464504442335, + "grad_norm": 0.59765625, + "learning_rate": 1.155063247037853e-05, + "loss": 1.5179, + "step": 5249 + }, + { + "epoch": 0.9057189683429656, + "grad_norm": 0.58203125, + "learning_rate": 1.154794345641277e-05, + "loss": 1.4468, + "step": 5250 + }, + { + "epoch": 0.9058914862416976, + "grad_norm": 0.58203125, + "learning_rate": 1.154525432776568e-05, + "loss": 1.4827, + "step": 5251 + }, + { + "epoch": 0.9060640041404295, + "grad_norm": 1.15625, + "learning_rate": 1.15425650846365e-05, + "loss": 1.4675, + "step": 5252 + }, + { + "epoch": 0.9062365220391616, + "grad_norm": 0.61328125, + "learning_rate": 1.153987572722446e-05, + "loss": 1.46, + "step": 5253 + }, + { + "epoch": 0.9064090399378936, + "grad_norm": 0.58984375, + "learning_rate": 1.1537186255728803e-05, + "loss": 1.3107, + "step": 5254 + }, + { + "epoch": 0.9065815578366255, + "grad_norm": 0.6171875, + "learning_rate": 1.1534496670348783e-05, + "loss": 1.5258, + "step": 5255 + }, + { + "epoch": 0.9067540757353575, + "grad_norm": 0.61328125, + "learning_rate": 1.1531806971283663e-05, + "loss": 1.4652, + "step": 5256 + }, + { + "epoch": 0.9069265936340896, + "grad_norm": 0.578125, + "learning_rate": 1.152911715873271e-05, + "loss": 1.5179, + "step": 5257 + }, + { + "epoch": 0.9070991115328215, + "grad_norm": 0.578125, + "learning_rate": 1.1526427232895205e-05, + "loss": 1.493, + "step": 5258 + }, + { + "epoch": 0.9072716294315535, + "grad_norm": 0.59765625, + "learning_rate": 1.1523737193970432e-05, + "loss": 1.4812, + "step": 5259 + }, + { + "epoch": 0.9074441473302856, + "grad_norm": 0.5859375, + "learning_rate": 1.1521047042157684e-05, + "loss": 1.4551, + "step": 5260 + }, + { + "epoch": 0.9076166652290175, + "grad_norm": 0.62109375, + "learning_rate": 1.1518356777656274e-05, + "loss": 1.3966, + "step": 5261 + }, + { + "epoch": 0.9077891831277495, + "grad_norm": 0.58203125, + "learning_rate": 1.1515666400665504e-05, + "loss": 1.3979, + "step": 5262 + }, + { + "epoch": 0.9079617010264815, + "grad_norm": 0.56640625, + "learning_rate": 1.1512975911384695e-05, + "loss": 1.3519, + "step": 5263 + }, + { + "epoch": 0.9081342189252135, + "grad_norm": 0.59375, + "learning_rate": 1.1510285310013176e-05, + "loss": 1.4521, + "step": 5264 + }, + { + "epoch": 0.9083067368239455, + "grad_norm": 0.55078125, + "learning_rate": 1.1507594596750288e-05, + "loss": 1.445, + "step": 5265 + }, + { + "epoch": 0.9084792547226774, + "grad_norm": 0.61328125, + "learning_rate": 1.150490377179537e-05, + "loss": 1.3316, + "step": 5266 + }, + { + "epoch": 0.9086517726214095, + "grad_norm": 0.61328125, + "learning_rate": 1.1502212835347776e-05, + "loss": 1.4873, + "step": 5267 + }, + { + "epoch": 0.9088242905201415, + "grad_norm": 0.6171875, + "learning_rate": 1.1499521787606874e-05, + "loss": 1.446, + "step": 5268 + }, + { + "epoch": 0.9089968084188734, + "grad_norm": 0.65234375, + "learning_rate": 1.1496830628772025e-05, + "loss": 1.4837, + "step": 5269 + }, + { + "epoch": 0.9091693263176055, + "grad_norm": 0.65234375, + "learning_rate": 1.1494139359042612e-05, + "loss": 1.5096, + "step": 5270 + }, + { + "epoch": 0.9093418442163375, + "grad_norm": 0.5859375, + "learning_rate": 1.1491447978618015e-05, + "loss": 1.4487, + "step": 5271 + }, + { + "epoch": 0.9095143621150694, + "grad_norm": 0.609375, + "learning_rate": 1.148875648769764e-05, + "loss": 1.4519, + "step": 5272 + }, + { + "epoch": 0.9096868800138014, + "grad_norm": 0.59765625, + "learning_rate": 1.1486064886480876e-05, + "loss": 1.4319, + "step": 5273 + }, + { + "epoch": 0.9098593979125335, + "grad_norm": 0.5859375, + "learning_rate": 1.1483373175167142e-05, + "loss": 1.3451, + "step": 5274 + }, + { + "epoch": 0.9100319158112654, + "grad_norm": 0.625, + "learning_rate": 1.1480681353955856e-05, + "loss": 1.4576, + "step": 5275 + }, + { + "epoch": 0.9102044337099974, + "grad_norm": 0.67578125, + "learning_rate": 1.1477989423046442e-05, + "loss": 1.4853, + "step": 5276 + }, + { + "epoch": 0.9103769516087294, + "grad_norm": 0.59375, + "learning_rate": 1.147529738263834e-05, + "loss": 1.4957, + "step": 5277 + }, + { + "epoch": 0.9105494695074614, + "grad_norm": 0.6484375, + "learning_rate": 1.1472605232930985e-05, + "loss": 1.4662, + "step": 5278 + }, + { + "epoch": 0.9107219874061934, + "grad_norm": 0.80078125, + "learning_rate": 1.1469912974123835e-05, + "loss": 1.3621, + "step": 5279 + }, + { + "epoch": 0.9108945053049254, + "grad_norm": 0.609375, + "learning_rate": 1.1467220606416348e-05, + "loss": 1.4839, + "step": 5280 + }, + { + "epoch": 0.9110670232036574, + "grad_norm": 0.73046875, + "learning_rate": 1.1464528130007992e-05, + "loss": 1.3379, + "step": 5281 + }, + { + "epoch": 0.9112395411023894, + "grad_norm": 0.80859375, + "learning_rate": 1.146183554509824e-05, + "loss": 1.4744, + "step": 5282 + }, + { + "epoch": 0.9114120590011213, + "grad_norm": 0.6640625, + "learning_rate": 1.1459142851886573e-05, + "loss": 1.4277, + "step": 5283 + }, + { + "epoch": 0.9115845768998534, + "grad_norm": 0.625, + "learning_rate": 1.1456450050572491e-05, + "loss": 1.5622, + "step": 5284 + }, + { + "epoch": 0.9117570947985854, + "grad_norm": 0.63671875, + "learning_rate": 1.1453757141355488e-05, + "loss": 1.4224, + "step": 5285 + }, + { + "epoch": 0.9119296126973173, + "grad_norm": 0.69921875, + "learning_rate": 1.1451064124435072e-05, + "loss": 1.4353, + "step": 5286 + }, + { + "epoch": 0.9121021305960494, + "grad_norm": 0.6015625, + "learning_rate": 1.1448371000010758e-05, + "loss": 1.3761, + "step": 5287 + }, + { + "epoch": 0.9122746484947813, + "grad_norm": 0.796875, + "learning_rate": 1.1445677768282073e-05, + "loss": 1.4288, + "step": 5288 + }, + { + "epoch": 0.9124471663935133, + "grad_norm": 0.65625, + "learning_rate": 1.1442984429448545e-05, + "loss": 1.4021, + "step": 5289 + }, + { + "epoch": 0.9126196842922453, + "grad_norm": 0.64453125, + "learning_rate": 1.1440290983709715e-05, + "loss": 1.4077, + "step": 5290 + }, + { + "epoch": 0.9127922021909773, + "grad_norm": 0.5703125, + "learning_rate": 1.1437597431265126e-05, + "loss": 1.403, + "step": 5291 + }, + { + "epoch": 0.9129647200897093, + "grad_norm": 0.60546875, + "learning_rate": 1.143490377231434e-05, + "loss": 1.37, + "step": 5292 + }, + { + "epoch": 0.9131372379884413, + "grad_norm": 0.609375, + "learning_rate": 1.1432210007056919e-05, + "loss": 1.4122, + "step": 5293 + }, + { + "epoch": 0.9133097558871733, + "grad_norm": 0.66015625, + "learning_rate": 1.1429516135692427e-05, + "loss": 1.4618, + "step": 5294 + }, + { + "epoch": 0.9134822737859053, + "grad_norm": 0.63671875, + "learning_rate": 1.1426822158420449e-05, + "loss": 1.3283, + "step": 5295 + }, + { + "epoch": 0.9136547916846373, + "grad_norm": 0.72265625, + "learning_rate": 1.1424128075440572e-05, + "loss": 1.438, + "step": 5296 + }, + { + "epoch": 0.9138273095833693, + "grad_norm": 0.6875, + "learning_rate": 1.142143388695239e-05, + "loss": 1.4426, + "step": 5297 + }, + { + "epoch": 0.9139998274821013, + "grad_norm": 0.6328125, + "learning_rate": 1.14187395931555e-05, + "loss": 1.4818, + "step": 5298 + }, + { + "epoch": 0.9141723453808333, + "grad_norm": 0.6484375, + "learning_rate": 1.1416045194249517e-05, + "loss": 1.4444, + "step": 5299 + }, + { + "epoch": 0.9143448632795652, + "grad_norm": 0.63671875, + "learning_rate": 1.141335069043406e-05, + "loss": 1.4915, + "step": 5300 + }, + { + "epoch": 0.9143448632795652, + "eval_loss": 1.4127678871154785, + "eval_runtime": 10.8242, + "eval_samples_per_second": 94.603, + "eval_steps_per_second": 23.651, + "step": 5300 + }, + { + "epoch": 0.9145173811782973, + "grad_norm": 0.67578125, + "learning_rate": 1.1410656081908754e-05, + "loss": 1.3751, + "step": 5301 + }, + { + "epoch": 0.9146898990770292, + "grad_norm": 1.046875, + "learning_rate": 1.1407961368873226e-05, + "loss": 1.4008, + "step": 5302 + }, + { + "epoch": 0.9148624169757612, + "grad_norm": 0.6796875, + "learning_rate": 1.1405266551527126e-05, + "loss": 1.4782, + "step": 5303 + }, + { + "epoch": 0.9150349348744933, + "grad_norm": 0.7109375, + "learning_rate": 1.1402571630070098e-05, + "loss": 1.4454, + "step": 5304 + }, + { + "epoch": 0.9152074527732252, + "grad_norm": 0.625, + "learning_rate": 1.1399876604701805e-05, + "loss": 1.3988, + "step": 5305 + }, + { + "epoch": 0.9153799706719572, + "grad_norm": 0.765625, + "learning_rate": 1.1397181475621901e-05, + "loss": 1.3682, + "step": 5306 + }, + { + "epoch": 0.9155524885706892, + "grad_norm": 0.625, + "learning_rate": 1.1394486243030066e-05, + "loss": 1.438, + "step": 5307 + }, + { + "epoch": 0.9157250064694212, + "grad_norm": 0.61328125, + "learning_rate": 1.1391790907125975e-05, + "loss": 1.4038, + "step": 5308 + }, + { + "epoch": 0.9158975243681532, + "grad_norm": 0.72265625, + "learning_rate": 1.138909546810932e-05, + "loss": 1.4956, + "step": 5309 + }, + { + "epoch": 0.9160700422668852, + "grad_norm": 0.68359375, + "learning_rate": 1.138639992617979e-05, + "loss": 1.5037, + "step": 5310 + }, + { + "epoch": 0.9162425601656172, + "grad_norm": 0.625, + "learning_rate": 1.138370428153709e-05, + "loss": 1.4534, + "step": 5311 + }, + { + "epoch": 0.9164150780643492, + "grad_norm": 0.58203125, + "learning_rate": 1.1381008534380933e-05, + "loss": 1.4813, + "step": 5312 + }, + { + "epoch": 0.9165875959630811, + "grad_norm": 0.6875, + "learning_rate": 1.1378312684911036e-05, + "loss": 1.4618, + "step": 5313 + }, + { + "epoch": 0.9167601138618131, + "grad_norm": 0.66015625, + "learning_rate": 1.1375616733327125e-05, + "loss": 1.3733, + "step": 5314 + }, + { + "epoch": 0.9169326317605452, + "grad_norm": 0.9609375, + "learning_rate": 1.1372920679828922e-05, + "loss": 1.4316, + "step": 5315 + }, + { + "epoch": 0.9171051496592771, + "grad_norm": 0.65625, + "learning_rate": 1.1370224524616187e-05, + "loss": 1.419, + "step": 5316 + }, + { + "epoch": 0.9172776675580091, + "grad_norm": 0.59375, + "learning_rate": 1.1367528267888653e-05, + "loss": 1.4067, + "step": 5317 + }, + { + "epoch": 0.9174501854567412, + "grad_norm": 0.8046875, + "learning_rate": 1.136483190984608e-05, + "loss": 1.4496, + "step": 5318 + }, + { + "epoch": 0.9176227033554731, + "grad_norm": 0.87890625, + "learning_rate": 1.1362135450688232e-05, + "loss": 1.4289, + "step": 5319 + }, + { + "epoch": 0.9177952212542051, + "grad_norm": 0.65234375, + "learning_rate": 1.1359438890614878e-05, + "loss": 1.4319, + "step": 5320 + }, + { + "epoch": 0.9179677391529372, + "grad_norm": 0.62890625, + "learning_rate": 1.13567422298258e-05, + "loss": 1.4672, + "step": 5321 + }, + { + "epoch": 0.9181402570516691, + "grad_norm": 0.6875, + "learning_rate": 1.1354045468520777e-05, + "loss": 1.4084, + "step": 5322 + }, + { + "epoch": 0.9183127749504011, + "grad_norm": 0.671875, + "learning_rate": 1.1351348606899605e-05, + "loss": 1.3973, + "step": 5323 + }, + { + "epoch": 0.918485292849133, + "grad_norm": 0.66015625, + "learning_rate": 1.1348651645162088e-05, + "loss": 1.4132, + "step": 5324 + }, + { + "epoch": 0.9186578107478651, + "grad_norm": 0.57421875, + "learning_rate": 1.1345954583508028e-05, + "loss": 1.3457, + "step": 5325 + }, + { + "epoch": 0.9188303286465971, + "grad_norm": 0.6328125, + "learning_rate": 1.1343257422137244e-05, + "loss": 1.3226, + "step": 5326 + }, + { + "epoch": 0.919002846545329, + "grad_norm": 0.578125, + "learning_rate": 1.1340560161249554e-05, + "loss": 1.3878, + "step": 5327 + }, + { + "epoch": 0.9191753644440611, + "grad_norm": 0.57421875, + "learning_rate": 1.1337862801044792e-05, + "loss": 1.4944, + "step": 5328 + }, + { + "epoch": 0.9193478823427931, + "grad_norm": 1.609375, + "learning_rate": 1.1335165341722798e-05, + "loss": 1.4164, + "step": 5329 + }, + { + "epoch": 0.919520400241525, + "grad_norm": 0.6640625, + "learning_rate": 1.133246778348341e-05, + "loss": 1.5615, + "step": 5330 + }, + { + "epoch": 0.919692918140257, + "grad_norm": 0.74609375, + "learning_rate": 1.132977012652648e-05, + "loss": 1.3541, + "step": 5331 + }, + { + "epoch": 0.9198654360389891, + "grad_norm": 0.6875, + "learning_rate": 1.1327072371051873e-05, + "loss": 1.3902, + "step": 5332 + }, + { + "epoch": 0.920037953937721, + "grad_norm": 0.609375, + "learning_rate": 1.1324374517259455e-05, + "loss": 1.4545, + "step": 5333 + }, + { + "epoch": 0.920210471836453, + "grad_norm": 0.60546875, + "learning_rate": 1.1321676565349096e-05, + "loss": 1.4162, + "step": 5334 + }, + { + "epoch": 0.9203829897351851, + "grad_norm": 0.58203125, + "learning_rate": 1.1318978515520678e-05, + "loss": 1.3777, + "step": 5335 + }, + { + "epoch": 0.920555507633917, + "grad_norm": 0.58984375, + "learning_rate": 1.1316280367974091e-05, + "loss": 1.4113, + "step": 5336 + }, + { + "epoch": 0.920728025532649, + "grad_norm": 0.5546875, + "learning_rate": 1.131358212290923e-05, + "loss": 1.5264, + "step": 5337 + }, + { + "epoch": 0.9209005434313811, + "grad_norm": 0.59375, + "learning_rate": 1.1310883780525996e-05, + "loss": 1.4073, + "step": 5338 + }, + { + "epoch": 0.921073061330113, + "grad_norm": 0.72265625, + "learning_rate": 1.1308185341024303e-05, + "loss": 1.3889, + "step": 5339 + }, + { + "epoch": 0.921245579228845, + "grad_norm": 0.63671875, + "learning_rate": 1.1305486804604065e-05, + "loss": 1.4394, + "step": 5340 + }, + { + "epoch": 0.9214180971275769, + "grad_norm": 0.6640625, + "learning_rate": 1.1302788171465208e-05, + "loss": 1.2919, + "step": 5341 + }, + { + "epoch": 0.921590615026309, + "grad_norm": 0.6015625, + "learning_rate": 1.1300089441807664e-05, + "loss": 1.4688, + "step": 5342 + }, + { + "epoch": 0.921763132925041, + "grad_norm": 0.62109375, + "learning_rate": 1.129739061583137e-05, + "loss": 1.5077, + "step": 5343 + }, + { + "epoch": 0.9219356508237729, + "grad_norm": 0.67578125, + "learning_rate": 1.129469169373627e-05, + "loss": 1.5521, + "step": 5344 + }, + { + "epoch": 0.922108168722505, + "grad_norm": 0.62109375, + "learning_rate": 1.1291992675722325e-05, + "loss": 1.55, + "step": 5345 + }, + { + "epoch": 0.922280686621237, + "grad_norm": 0.58203125, + "learning_rate": 1.1289293561989486e-05, + "loss": 1.3981, + "step": 5346 + }, + { + "epoch": 0.9224532045199689, + "grad_norm": 0.7734375, + "learning_rate": 1.1286594352737723e-05, + "loss": 1.4134, + "step": 5347 + }, + { + "epoch": 0.9226257224187009, + "grad_norm": 0.61328125, + "learning_rate": 1.1283895048167013e-05, + "loss": 1.3881, + "step": 5348 + }, + { + "epoch": 0.922798240317433, + "grad_norm": 0.578125, + "learning_rate": 1.1281195648477336e-05, + "loss": 1.4075, + "step": 5349 + }, + { + "epoch": 0.9229707582161649, + "grad_norm": 0.625, + "learning_rate": 1.1278496153868681e-05, + "loss": 1.3477, + "step": 5350 + }, + { + "epoch": 0.9231432761148969, + "grad_norm": 0.6796875, + "learning_rate": 1.127579656454104e-05, + "loss": 1.4386, + "step": 5351 + }, + { + "epoch": 0.923315794013629, + "grad_norm": 0.58984375, + "learning_rate": 1.1273096880694419e-05, + "loss": 1.4019, + "step": 5352 + }, + { + "epoch": 0.9234883119123609, + "grad_norm": 0.6484375, + "learning_rate": 1.127039710252883e-05, + "loss": 1.4122, + "step": 5353 + }, + { + "epoch": 0.9236608298110929, + "grad_norm": 0.60546875, + "learning_rate": 1.1267697230244281e-05, + "loss": 1.4157, + "step": 5354 + }, + { + "epoch": 0.923833347709825, + "grad_norm": 0.64453125, + "learning_rate": 1.1264997264040802e-05, + "loss": 1.392, + "step": 5355 + }, + { + "epoch": 0.9240058656085569, + "grad_norm": 0.60546875, + "learning_rate": 1.1262297204118422e-05, + "loss": 1.4343, + "step": 5356 + }, + { + "epoch": 0.9241783835072889, + "grad_norm": 0.70703125, + "learning_rate": 1.1259597050677178e-05, + "loss": 1.4186, + "step": 5357 + }, + { + "epoch": 0.9243509014060208, + "grad_norm": 0.76953125, + "learning_rate": 1.1256896803917115e-05, + "loss": 1.3255, + "step": 5358 + }, + { + "epoch": 0.9245234193047529, + "grad_norm": 0.84765625, + "learning_rate": 1.1254196464038281e-05, + "loss": 1.4194, + "step": 5359 + }, + { + "epoch": 0.9246959372034849, + "grad_norm": 0.61328125, + "learning_rate": 1.1251496031240736e-05, + "loss": 1.5005, + "step": 5360 + }, + { + "epoch": 0.9248684551022168, + "grad_norm": 0.65625, + "learning_rate": 1.1248795505724548e-05, + "loss": 1.3214, + "step": 5361 + }, + { + "epoch": 0.9250409730009489, + "grad_norm": 0.60546875, + "learning_rate": 1.1246094887689784e-05, + "loss": 1.4798, + "step": 5362 + }, + { + "epoch": 0.9252134908996809, + "grad_norm": 0.69921875, + "learning_rate": 1.1243394177336524e-05, + "loss": 1.4729, + "step": 5363 + }, + { + "epoch": 0.9253860087984128, + "grad_norm": 0.76171875, + "learning_rate": 1.1240693374864854e-05, + "loss": 1.5883, + "step": 5364 + }, + { + "epoch": 0.9255585266971448, + "grad_norm": 0.62109375, + "learning_rate": 1.123799248047487e-05, + "loss": 1.4788, + "step": 5365 + }, + { + "epoch": 0.9257310445958769, + "grad_norm": 0.625, + "learning_rate": 1.1235291494366668e-05, + "loss": 1.4362, + "step": 5366 + }, + { + "epoch": 0.9259035624946088, + "grad_norm": 0.66015625, + "learning_rate": 1.123259041674035e-05, + "loss": 1.4446, + "step": 5367 + }, + { + "epoch": 0.9260760803933408, + "grad_norm": 0.66015625, + "learning_rate": 1.1229889247796033e-05, + "loss": 1.416, + "step": 5368 + }, + { + "epoch": 0.9262485982920728, + "grad_norm": 0.64453125, + "learning_rate": 1.1227187987733838e-05, + "loss": 1.4744, + "step": 5369 + }, + { + "epoch": 0.9264211161908048, + "grad_norm": 0.64453125, + "learning_rate": 1.1224486636753888e-05, + "loss": 1.3834, + "step": 5370 + }, + { + "epoch": 0.9265936340895368, + "grad_norm": 0.578125, + "learning_rate": 1.1221785195056316e-05, + "loss": 1.4061, + "step": 5371 + }, + { + "epoch": 0.9267661519882688, + "grad_norm": 0.6015625, + "learning_rate": 1.1219083662841263e-05, + "loss": 1.473, + "step": 5372 + }, + { + "epoch": 0.9269386698870008, + "grad_norm": 0.6640625, + "learning_rate": 1.1216382040308877e-05, + "loss": 1.3396, + "step": 5373 + }, + { + "epoch": 0.9271111877857328, + "grad_norm": 0.703125, + "learning_rate": 1.121368032765931e-05, + "loss": 1.4057, + "step": 5374 + }, + { + "epoch": 0.9272837056844647, + "grad_norm": 0.6484375, + "learning_rate": 1.1210978525092717e-05, + "loss": 1.4684, + "step": 5375 + }, + { + "epoch": 0.9274562235831968, + "grad_norm": 0.6015625, + "learning_rate": 1.120827663280927e-05, + "loss": 1.4043, + "step": 5376 + }, + { + "epoch": 0.9276287414819288, + "grad_norm": 0.60546875, + "learning_rate": 1.120557465100914e-05, + "loss": 1.4343, + "step": 5377 + }, + { + "epoch": 0.9278012593806607, + "grad_norm": 0.58984375, + "learning_rate": 1.1202872579892507e-05, + "loss": 1.4044, + "step": 5378 + }, + { + "epoch": 0.9279737772793928, + "grad_norm": 0.6484375, + "learning_rate": 1.1200170419659558e-05, + "loss": 1.3756, + "step": 5379 + }, + { + "epoch": 0.9281462951781247, + "grad_norm": 0.62890625, + "learning_rate": 1.1197468170510483e-05, + "loss": 1.4508, + "step": 5380 + }, + { + "epoch": 0.9283188130768567, + "grad_norm": 0.62109375, + "learning_rate": 1.1194765832645489e-05, + "loss": 1.5315, + "step": 5381 + }, + { + "epoch": 0.9284913309755887, + "grad_norm": 0.5625, + "learning_rate": 1.1192063406264772e-05, + "loss": 1.4169, + "step": 5382 + }, + { + "epoch": 0.9286638488743207, + "grad_norm": 0.63671875, + "learning_rate": 1.1189360891568551e-05, + "loss": 1.483, + "step": 5383 + }, + { + "epoch": 0.9288363667730527, + "grad_norm": 0.72265625, + "learning_rate": 1.1186658288757044e-05, + "loss": 1.4337, + "step": 5384 + }, + { + "epoch": 0.9290088846717847, + "grad_norm": 0.64453125, + "learning_rate": 1.1183955598030479e-05, + "loss": 1.5479, + "step": 5385 + }, + { + "epoch": 0.9291814025705167, + "grad_norm": 0.62890625, + "learning_rate": 1.1181252819589081e-05, + "loss": 1.4495, + "step": 5386 + }, + { + "epoch": 0.9293539204692487, + "grad_norm": 0.63671875, + "learning_rate": 1.1178549953633096e-05, + "loss": 1.4111, + "step": 5387 + }, + { + "epoch": 0.9295264383679807, + "grad_norm": 0.62890625, + "learning_rate": 1.1175847000362765e-05, + "loss": 1.453, + "step": 5388 + }, + { + "epoch": 0.9296989562667126, + "grad_norm": 0.62109375, + "learning_rate": 1.1173143959978344e-05, + "loss": 1.5223, + "step": 5389 + }, + { + "epoch": 0.9298714741654447, + "grad_norm": 0.65234375, + "learning_rate": 1.1170440832680086e-05, + "loss": 1.4029, + "step": 5390 + }, + { + "epoch": 0.9300439920641767, + "grad_norm": 0.625, + "learning_rate": 1.1167737618668258e-05, + "loss": 1.3839, + "step": 5391 + }, + { + "epoch": 0.9302165099629086, + "grad_norm": 0.60546875, + "learning_rate": 1.1165034318143129e-05, + "loss": 1.4541, + "step": 5392 + }, + { + "epoch": 0.9303890278616407, + "grad_norm": 0.68359375, + "learning_rate": 1.116233093130498e-05, + "loss": 1.3738, + "step": 5393 + }, + { + "epoch": 0.9305615457603726, + "grad_norm": 0.62109375, + "learning_rate": 1.1159627458354096e-05, + "loss": 1.4244, + "step": 5394 + }, + { + "epoch": 0.9307340636591046, + "grad_norm": 0.62890625, + "learning_rate": 1.1156923899490761e-05, + "loss": 1.3742, + "step": 5395 + }, + { + "epoch": 0.9309065815578367, + "grad_norm": 0.66015625, + "learning_rate": 1.115422025491527e-05, + "loss": 1.4678, + "step": 5396 + }, + { + "epoch": 0.9310790994565686, + "grad_norm": 0.58984375, + "learning_rate": 1.1151516524827938e-05, + "loss": 1.3177, + "step": 5397 + }, + { + "epoch": 0.9312516173553006, + "grad_norm": 0.8125, + "learning_rate": 1.1148812709429067e-05, + "loss": 1.3989, + "step": 5398 + }, + { + "epoch": 0.9314241352540326, + "grad_norm": 0.70703125, + "learning_rate": 1.114610880891897e-05, + "loss": 1.3871, + "step": 5399 + }, + { + "epoch": 0.9315966531527646, + "grad_norm": 0.65234375, + "learning_rate": 1.1143404823497969e-05, + "loss": 1.4404, + "step": 5400 + }, + { + "epoch": 0.9315966531527646, + "eval_loss": 1.4122982025146484, + "eval_runtime": 10.8508, + "eval_samples_per_second": 94.371, + "eval_steps_per_second": 23.593, + "step": 5400 + }, + { + "epoch": 0.9317691710514966, + "grad_norm": 0.62109375, + "learning_rate": 1.1140700753366397e-05, + "loss": 1.3977, + "step": 5401 + }, + { + "epoch": 0.9319416889502286, + "grad_norm": 0.578125, + "learning_rate": 1.1137996598724588e-05, + "loss": 1.4804, + "step": 5402 + }, + { + "epoch": 0.9321142068489606, + "grad_norm": 0.62109375, + "learning_rate": 1.1135292359772875e-05, + "loss": 1.4298, + "step": 5403 + }, + { + "epoch": 0.9322867247476926, + "grad_norm": 0.609375, + "learning_rate": 1.1132588036711614e-05, + "loss": 1.4478, + "step": 5404 + }, + { + "epoch": 0.9324592426464245, + "grad_norm": 0.609375, + "learning_rate": 1.1129883629741156e-05, + "loss": 1.4497, + "step": 5405 + }, + { + "epoch": 0.9326317605451565, + "grad_norm": 0.60546875, + "learning_rate": 1.1127179139061854e-05, + "loss": 1.4567, + "step": 5406 + }, + { + "epoch": 0.9328042784438886, + "grad_norm": 0.60546875, + "learning_rate": 1.112447456487408e-05, + "loss": 1.3938, + "step": 5407 + }, + { + "epoch": 0.9329767963426205, + "grad_norm": 0.625, + "learning_rate": 1.1121769907378206e-05, + "loss": 1.5709, + "step": 5408 + }, + { + "epoch": 0.9331493142413525, + "grad_norm": 0.7890625, + "learning_rate": 1.1119065166774608e-05, + "loss": 1.4594, + "step": 5409 + }, + { + "epoch": 0.9333218321400846, + "grad_norm": 0.609375, + "learning_rate": 1.1116360343263671e-05, + "loss": 1.3834, + "step": 5410 + }, + { + "epoch": 0.9334943500388165, + "grad_norm": 0.65234375, + "learning_rate": 1.1113655437045782e-05, + "loss": 1.5306, + "step": 5411 + }, + { + "epoch": 0.9336668679375485, + "grad_norm": 0.6484375, + "learning_rate": 1.1110950448321338e-05, + "loss": 1.3906, + "step": 5412 + }, + { + "epoch": 0.9338393858362806, + "grad_norm": 0.58984375, + "learning_rate": 1.1108245377290748e-05, + "loss": 1.4253, + "step": 5413 + }, + { + "epoch": 0.9340119037350125, + "grad_norm": 0.59375, + "learning_rate": 1.1105540224154413e-05, + "loss": 1.5626, + "step": 5414 + }, + { + "epoch": 0.9341844216337445, + "grad_norm": 0.625, + "learning_rate": 1.1102834989112752e-05, + "loss": 1.3905, + "step": 5415 + }, + { + "epoch": 0.9343569395324764, + "grad_norm": 0.58984375, + "learning_rate": 1.1100129672366182e-05, + "loss": 1.404, + "step": 5416 + }, + { + "epoch": 0.9345294574312085, + "grad_norm": 0.59765625, + "learning_rate": 1.1097424274115136e-05, + "loss": 1.4319, + "step": 5417 + }, + { + "epoch": 0.9347019753299405, + "grad_norm": 0.61328125, + "learning_rate": 1.1094718794560037e-05, + "loss": 1.5673, + "step": 5418 + }, + { + "epoch": 0.9348744932286724, + "grad_norm": 0.56640625, + "learning_rate": 1.1092013233901334e-05, + "loss": 1.3819, + "step": 5419 + }, + { + "epoch": 0.9350470111274045, + "grad_norm": 0.58203125, + "learning_rate": 1.1089307592339467e-05, + "loss": 1.4296, + "step": 5420 + }, + { + "epoch": 0.9352195290261365, + "grad_norm": 0.59765625, + "learning_rate": 1.1086601870074887e-05, + "loss": 1.4434, + "step": 5421 + }, + { + "epoch": 0.9353920469248684, + "grad_norm": 0.59765625, + "learning_rate": 1.1083896067308052e-05, + "loss": 1.5113, + "step": 5422 + }, + { + "epoch": 0.9355645648236004, + "grad_norm": 0.56640625, + "learning_rate": 1.1081190184239418e-05, + "loss": 1.4677, + "step": 5423 + }, + { + "epoch": 0.9357370827223325, + "grad_norm": 0.59375, + "learning_rate": 1.1078484221069465e-05, + "loss": 1.4168, + "step": 5424 + }, + { + "epoch": 0.9359096006210644, + "grad_norm": 0.60546875, + "learning_rate": 1.107577817799866e-05, + "loss": 1.3563, + "step": 5425 + }, + { + "epoch": 0.9360821185197964, + "grad_norm": 0.65625, + "learning_rate": 1.107307205522749e-05, + "loss": 1.3891, + "step": 5426 + }, + { + "epoch": 0.9362546364185285, + "grad_norm": 0.59375, + "learning_rate": 1.1070365852956432e-05, + "loss": 1.4826, + "step": 5427 + }, + { + "epoch": 0.9364271543172604, + "grad_norm": 0.609375, + "learning_rate": 1.1067659571385985e-05, + "loss": 1.3225, + "step": 5428 + }, + { + "epoch": 0.9365996722159924, + "grad_norm": 0.59375, + "learning_rate": 1.1064953210716647e-05, + "loss": 1.4668, + "step": 5429 + }, + { + "epoch": 0.9367721901147245, + "grad_norm": 0.61328125, + "learning_rate": 1.1062246771148922e-05, + "loss": 1.4022, + "step": 5430 + }, + { + "epoch": 0.9369447080134564, + "grad_norm": 0.61328125, + "learning_rate": 1.1059540252883317e-05, + "loss": 1.4116, + "step": 5431 + }, + { + "epoch": 0.9371172259121884, + "grad_norm": 0.609375, + "learning_rate": 1.105683365612035e-05, + "loss": 1.5058, + "step": 5432 + }, + { + "epoch": 0.9372897438109203, + "grad_norm": 0.59765625, + "learning_rate": 1.1054126981060542e-05, + "loss": 1.4929, + "step": 5433 + }, + { + "epoch": 0.9374622617096524, + "grad_norm": 0.62109375, + "learning_rate": 1.1051420227904423e-05, + "loss": 1.4555, + "step": 5434 + }, + { + "epoch": 0.9376347796083844, + "grad_norm": 0.60546875, + "learning_rate": 1.1048713396852522e-05, + "loss": 1.4708, + "step": 5435 + }, + { + "epoch": 0.9378072975071163, + "grad_norm": 0.6171875, + "learning_rate": 1.1046006488105379e-05, + "loss": 1.456, + "step": 5436 + }, + { + "epoch": 0.9379798154058484, + "grad_norm": 0.6171875, + "learning_rate": 1.104329950186354e-05, + "loss": 1.3934, + "step": 5437 + }, + { + "epoch": 0.9381523333045804, + "grad_norm": 0.59765625, + "learning_rate": 1.1040592438327558e-05, + "loss": 1.4241, + "step": 5438 + }, + { + "epoch": 0.9383248512033123, + "grad_norm": 0.69140625, + "learning_rate": 1.1037885297697985e-05, + "loss": 1.4923, + "step": 5439 + }, + { + "epoch": 0.9384973691020443, + "grad_norm": 0.65234375, + "learning_rate": 1.1035178080175382e-05, + "loss": 1.4042, + "step": 5440 + }, + { + "epoch": 0.9386698870007764, + "grad_norm": 0.62890625, + "learning_rate": 1.1032470785960321e-05, + "loss": 1.488, + "step": 5441 + }, + { + "epoch": 0.9388424048995083, + "grad_norm": 0.68359375, + "learning_rate": 1.1029763415253374e-05, + "loss": 1.4338, + "step": 5442 + }, + { + "epoch": 0.9390149227982403, + "grad_norm": 1.75, + "learning_rate": 1.1027055968255116e-05, + "loss": 1.3755, + "step": 5443 + }, + { + "epoch": 0.9391874406969724, + "grad_norm": 0.703125, + "learning_rate": 1.1024348445166133e-05, + "loss": 1.472, + "step": 5444 + }, + { + "epoch": 0.9393599585957043, + "grad_norm": 0.625, + "learning_rate": 1.1021640846187021e-05, + "loss": 1.3511, + "step": 5445 + }, + { + "epoch": 0.9395324764944363, + "grad_norm": 0.66796875, + "learning_rate": 1.1018933171518369e-05, + "loss": 1.4285, + "step": 5446 + }, + { + "epoch": 0.9397049943931682, + "grad_norm": 0.69140625, + "learning_rate": 1.1016225421360783e-05, + "loss": 1.4036, + "step": 5447 + }, + { + "epoch": 0.9398775122919003, + "grad_norm": 0.59765625, + "learning_rate": 1.1013517595914867e-05, + "loss": 1.4262, + "step": 5448 + }, + { + "epoch": 0.9400500301906323, + "grad_norm": 0.671875, + "learning_rate": 1.1010809695381235e-05, + "loss": 1.3865, + "step": 5449 + }, + { + "epoch": 0.9402225480893642, + "grad_norm": 0.796875, + "learning_rate": 1.1008101719960504e-05, + "loss": 1.4087, + "step": 5450 + }, + { + "epoch": 0.9403950659880963, + "grad_norm": 0.6015625, + "learning_rate": 1.10053936698533e-05, + "loss": 1.3771, + "step": 5451 + }, + { + "epoch": 0.9405675838868283, + "grad_norm": 0.703125, + "learning_rate": 1.100268554526025e-05, + "loss": 1.4645, + "step": 5452 + }, + { + "epoch": 0.9407401017855602, + "grad_norm": 0.796875, + "learning_rate": 1.099997734638199e-05, + "loss": 1.4652, + "step": 5453 + }, + { + "epoch": 0.9409126196842923, + "grad_norm": 0.60546875, + "learning_rate": 1.0997269073419162e-05, + "loss": 1.5207, + "step": 5454 + }, + { + "epoch": 0.9410851375830243, + "grad_norm": 0.78125, + "learning_rate": 1.0994560726572408e-05, + "loss": 1.4699, + "step": 5455 + }, + { + "epoch": 0.9412576554817562, + "grad_norm": 0.5546875, + "learning_rate": 1.0991852306042381e-05, + "loss": 1.3744, + "step": 5456 + }, + { + "epoch": 0.9414301733804882, + "grad_norm": 0.56640625, + "learning_rate": 1.0989143812029736e-05, + "loss": 1.4185, + "step": 5457 + }, + { + "epoch": 0.9416026912792202, + "grad_norm": 0.68359375, + "learning_rate": 1.0986435244735141e-05, + "loss": 1.3928, + "step": 5458 + }, + { + "epoch": 0.9417752091779522, + "grad_norm": 0.625, + "learning_rate": 1.0983726604359257e-05, + "loss": 1.422, + "step": 5459 + }, + { + "epoch": 0.9419477270766842, + "grad_norm": 0.73828125, + "learning_rate": 1.0981017891102757e-05, + "loss": 1.4217, + "step": 5460 + }, + { + "epoch": 0.9421202449754162, + "grad_norm": 0.625, + "learning_rate": 1.0978309105166328e-05, + "loss": 1.455, + "step": 5461 + }, + { + "epoch": 0.9422927628741482, + "grad_norm": 0.77734375, + "learning_rate": 1.0975600246750644e-05, + "loss": 1.4021, + "step": 5462 + }, + { + "epoch": 0.9424652807728802, + "grad_norm": 0.60546875, + "learning_rate": 1.0972891316056397e-05, + "loss": 1.4949, + "step": 5463 + }, + { + "epoch": 0.9426377986716121, + "grad_norm": 0.57421875, + "learning_rate": 1.0970182313284283e-05, + "loss": 1.4539, + "step": 5464 + }, + { + "epoch": 0.9428103165703442, + "grad_norm": 0.640625, + "learning_rate": 1.0967473238635005e-05, + "loss": 1.4881, + "step": 5465 + }, + { + "epoch": 0.9429828344690762, + "grad_norm": 0.60546875, + "learning_rate": 1.0964764092309261e-05, + "loss": 1.423, + "step": 5466 + }, + { + "epoch": 0.9431553523678081, + "grad_norm": 0.64453125, + "learning_rate": 1.0962054874507766e-05, + "loss": 1.5349, + "step": 5467 + }, + { + "epoch": 0.9433278702665402, + "grad_norm": 0.58984375, + "learning_rate": 1.0959345585431233e-05, + "loss": 1.3661, + "step": 5468 + }, + { + "epoch": 0.9435003881652722, + "grad_norm": 0.64453125, + "learning_rate": 1.0956636225280386e-05, + "loss": 1.5621, + "step": 5469 + }, + { + "epoch": 0.9436729060640041, + "grad_norm": 0.625, + "learning_rate": 1.0953926794255952e-05, + "loss": 1.4969, + "step": 5470 + }, + { + "epoch": 0.9438454239627362, + "grad_norm": 0.63671875, + "learning_rate": 1.0951217292558659e-05, + "loss": 1.5013, + "step": 5471 + }, + { + "epoch": 0.9440179418614681, + "grad_norm": 0.609375, + "learning_rate": 1.0948507720389242e-05, + "loss": 1.4703, + "step": 5472 + }, + { + "epoch": 0.9441904597602001, + "grad_norm": 0.6328125, + "learning_rate": 1.094579807794845e-05, + "loss": 1.4418, + "step": 5473 + }, + { + "epoch": 0.9443629776589321, + "grad_norm": 0.60546875, + "learning_rate": 1.094308836543703e-05, + "loss": 1.3773, + "step": 5474 + }, + { + "epoch": 0.9445354955576641, + "grad_norm": 0.6484375, + "learning_rate": 1.0940378583055727e-05, + "loss": 1.4028, + "step": 5475 + }, + { + "epoch": 0.9447080134563961, + "grad_norm": 0.60546875, + "learning_rate": 1.09376687310053e-05, + "loss": 1.3619, + "step": 5476 + }, + { + "epoch": 0.9448805313551281, + "grad_norm": 0.60546875, + "learning_rate": 1.093495880948652e-05, + "loss": 1.4371, + "step": 5477 + }, + { + "epoch": 0.9450530492538601, + "grad_norm": 0.671875, + "learning_rate": 1.0932248818700146e-05, + "loss": 1.5281, + "step": 5478 + }, + { + "epoch": 0.9452255671525921, + "grad_norm": 0.60546875, + "learning_rate": 1.0929538758846956e-05, + "loss": 1.4308, + "step": 5479 + }, + { + "epoch": 0.945398085051324, + "grad_norm": 0.59375, + "learning_rate": 1.0926828630127728e-05, + "loss": 1.4224, + "step": 5480 + }, + { + "epoch": 0.945570602950056, + "grad_norm": 0.61328125, + "learning_rate": 1.0924118432743243e-05, + "loss": 1.4619, + "step": 5481 + }, + { + "epoch": 0.9457431208487881, + "grad_norm": 0.68359375, + "learning_rate": 1.0921408166894292e-05, + "loss": 1.5235, + "step": 5482 + }, + { + "epoch": 0.94591563874752, + "grad_norm": 0.60546875, + "learning_rate": 1.0918697832781663e-05, + "loss": 1.3516, + "step": 5483 + }, + { + "epoch": 0.946088156646252, + "grad_norm": 0.5859375, + "learning_rate": 1.0915987430606161e-05, + "loss": 1.4879, + "step": 5484 + }, + { + "epoch": 0.9462606745449841, + "grad_norm": 1.4140625, + "learning_rate": 1.0913276960568583e-05, + "loss": 1.551, + "step": 5485 + }, + { + "epoch": 0.946433192443716, + "grad_norm": 0.609375, + "learning_rate": 1.0910566422869748e-05, + "loss": 1.4284, + "step": 5486 + }, + { + "epoch": 0.946605710342448, + "grad_norm": 0.63671875, + "learning_rate": 1.0907855817710457e-05, + "loss": 1.3764, + "step": 5487 + }, + { + "epoch": 0.9467782282411801, + "grad_norm": 0.59765625, + "learning_rate": 1.0905145145291537e-05, + "loss": 1.4533, + "step": 5488 + }, + { + "epoch": 0.946950746139912, + "grad_norm": 0.640625, + "learning_rate": 1.0902434405813809e-05, + "loss": 1.3727, + "step": 5489 + }, + { + "epoch": 0.947123264038644, + "grad_norm": 0.63671875, + "learning_rate": 1.0899723599478103e-05, + "loss": 1.4027, + "step": 5490 + }, + { + "epoch": 0.947295781937376, + "grad_norm": 0.66796875, + "learning_rate": 1.089701272648525e-05, + "loss": 1.4288, + "step": 5491 + }, + { + "epoch": 0.947468299836108, + "grad_norm": 0.640625, + "learning_rate": 1.0894301787036085e-05, + "loss": 1.3706, + "step": 5492 + }, + { + "epoch": 0.94764081773484, + "grad_norm": 0.66015625, + "learning_rate": 1.0891590781331463e-05, + "loss": 1.4171, + "step": 5493 + }, + { + "epoch": 0.947813335633572, + "grad_norm": 0.68359375, + "learning_rate": 1.088887970957222e-05, + "loss": 1.3683, + "step": 5494 + }, + { + "epoch": 0.947985853532304, + "grad_norm": 0.5859375, + "learning_rate": 1.0886168571959214e-05, + "loss": 1.3923, + "step": 5495 + }, + { + "epoch": 0.948158371431036, + "grad_norm": 0.625, + "learning_rate": 1.0883457368693307e-05, + "loss": 1.4652, + "step": 5496 + }, + { + "epoch": 0.948330889329768, + "grad_norm": 0.7109375, + "learning_rate": 1.0880746099975355e-05, + "loss": 1.5098, + "step": 5497 + }, + { + "epoch": 0.9485034072284999, + "grad_norm": 0.671875, + "learning_rate": 1.087803476600623e-05, + "loss": 1.4231, + "step": 5498 + }, + { + "epoch": 0.948675925127232, + "grad_norm": 0.65625, + "learning_rate": 1.0875323366986803e-05, + "loss": 1.4784, + "step": 5499 + }, + { + "epoch": 0.9488484430259639, + "grad_norm": 0.58984375, + "learning_rate": 1.0872611903117951e-05, + "loss": 1.4145, + "step": 5500 + }, + { + "epoch": 0.9488484430259639, + "eval_loss": 1.4118043184280396, + "eval_runtime": 11.0208, + "eval_samples_per_second": 92.915, + "eval_steps_per_second": 23.229, + "step": 5500 + }, + { + "epoch": 0.9490209609246959, + "grad_norm": 0.66796875, + "learning_rate": 1.0869900374600557e-05, + "loss": 1.4202, + "step": 5501 + }, + { + "epoch": 0.949193478823428, + "grad_norm": 0.7578125, + "learning_rate": 1.086718878163551e-05, + "loss": 1.4019, + "step": 5502 + }, + { + "epoch": 0.9493659967221599, + "grad_norm": 0.71484375, + "learning_rate": 1.0864477124423698e-05, + "loss": 1.4151, + "step": 5503 + }, + { + "epoch": 0.9495385146208919, + "grad_norm": 0.640625, + "learning_rate": 1.0861765403166018e-05, + "loss": 1.471, + "step": 5504 + }, + { + "epoch": 0.949711032519624, + "grad_norm": 0.73046875, + "learning_rate": 1.0859053618063372e-05, + "loss": 1.4649, + "step": 5505 + }, + { + "epoch": 0.9498835504183559, + "grad_norm": 1.0, + "learning_rate": 1.0856341769316672e-05, + "loss": 1.426, + "step": 5506 + }, + { + "epoch": 0.9500560683170879, + "grad_norm": 0.6953125, + "learning_rate": 1.0853629857126817e-05, + "loss": 1.3961, + "step": 5507 + }, + { + "epoch": 0.9502285862158198, + "grad_norm": 0.60546875, + "learning_rate": 1.085091788169473e-05, + "loss": 1.4524, + "step": 5508 + }, + { + "epoch": 0.9504011041145519, + "grad_norm": 0.61328125, + "learning_rate": 1.084820584322133e-05, + "loss": 1.4308, + "step": 5509 + }, + { + "epoch": 0.9505736220132839, + "grad_norm": 0.66796875, + "learning_rate": 1.084549374190754e-05, + "loss": 1.4239, + "step": 5510 + }, + { + "epoch": 0.9507461399120158, + "grad_norm": 0.61328125, + "learning_rate": 1.0842781577954294e-05, + "loss": 1.3743, + "step": 5511 + }, + { + "epoch": 0.9509186578107479, + "grad_norm": 0.640625, + "learning_rate": 1.0840069351562519e-05, + "loss": 1.3684, + "step": 5512 + }, + { + "epoch": 0.9510911757094799, + "grad_norm": 1.078125, + "learning_rate": 1.0837357062933158e-05, + "loss": 1.3119, + "step": 5513 + }, + { + "epoch": 0.9512636936082118, + "grad_norm": 0.58203125, + "learning_rate": 1.0834644712267158e-05, + "loss": 1.462, + "step": 5514 + }, + { + "epoch": 0.9514362115069438, + "grad_norm": 0.63671875, + "learning_rate": 1.0831932299765458e-05, + "loss": 1.4212, + "step": 5515 + }, + { + "epoch": 0.9516087294056759, + "grad_norm": 0.57421875, + "learning_rate": 1.0829219825629016e-05, + "loss": 1.4816, + "step": 5516 + }, + { + "epoch": 0.9517812473044078, + "grad_norm": 0.79296875, + "learning_rate": 1.0826507290058787e-05, + "loss": 1.4519, + "step": 5517 + }, + { + "epoch": 0.9519537652031398, + "grad_norm": 0.63671875, + "learning_rate": 1.0823794693255738e-05, + "loss": 1.4065, + "step": 5518 + }, + { + "epoch": 0.9521262831018719, + "grad_norm": 0.61328125, + "learning_rate": 1.0821082035420829e-05, + "loss": 1.4785, + "step": 5519 + }, + { + "epoch": 0.9522988010006038, + "grad_norm": 0.58984375, + "learning_rate": 1.0818369316755031e-05, + "loss": 1.41, + "step": 5520 + }, + { + "epoch": 0.9524713188993358, + "grad_norm": 0.5859375, + "learning_rate": 1.081565653745932e-05, + "loss": 1.3799, + "step": 5521 + }, + { + "epoch": 0.9526438367980677, + "grad_norm": 0.5703125, + "learning_rate": 1.081294369773468e-05, + "loss": 1.3599, + "step": 5522 + }, + { + "epoch": 0.9528163546967998, + "grad_norm": 0.71484375, + "learning_rate": 1.0810230797782088e-05, + "loss": 1.4379, + "step": 5523 + }, + { + "epoch": 0.9529888725955318, + "grad_norm": 0.625, + "learning_rate": 1.0807517837802535e-05, + "loss": 1.447, + "step": 5524 + }, + { + "epoch": 0.9531613904942637, + "grad_norm": 0.63671875, + "learning_rate": 1.0804804817997015e-05, + "loss": 1.5195, + "step": 5525 + }, + { + "epoch": 0.9533339083929958, + "grad_norm": 0.640625, + "learning_rate": 1.080209173856653e-05, + "loss": 1.4686, + "step": 5526 + }, + { + "epoch": 0.9535064262917278, + "grad_norm": 0.59375, + "learning_rate": 1.0799378599712073e-05, + "loss": 1.4608, + "step": 5527 + }, + { + "epoch": 0.9536789441904597, + "grad_norm": 0.63671875, + "learning_rate": 1.0796665401634657e-05, + "loss": 1.3722, + "step": 5528 + }, + { + "epoch": 0.9538514620891918, + "grad_norm": 0.66015625, + "learning_rate": 1.0793952144535289e-05, + "loss": 1.4454, + "step": 5529 + }, + { + "epoch": 0.9540239799879238, + "grad_norm": 0.62109375, + "learning_rate": 1.0791238828614987e-05, + "loss": 1.3737, + "step": 5530 + }, + { + "epoch": 0.9541964978866557, + "grad_norm": 0.59765625, + "learning_rate": 1.0788525454074765e-05, + "loss": 1.3756, + "step": 5531 + }, + { + "epoch": 0.9543690157853877, + "grad_norm": 0.73828125, + "learning_rate": 1.0785812021115654e-05, + "loss": 1.4899, + "step": 5532 + }, + { + "epoch": 0.9545415336841198, + "grad_norm": 0.546875, + "learning_rate": 1.0783098529938675e-05, + "loss": 1.3998, + "step": 5533 + }, + { + "epoch": 0.9547140515828517, + "grad_norm": 0.609375, + "learning_rate": 1.078038498074487e-05, + "loss": 1.437, + "step": 5534 + }, + { + "epoch": 0.9548865694815837, + "grad_norm": 0.671875, + "learning_rate": 1.0777671373735266e-05, + "loss": 1.4801, + "step": 5535 + }, + { + "epoch": 0.9550590873803158, + "grad_norm": 0.75390625, + "learning_rate": 1.0774957709110905e-05, + "loss": 1.4471, + "step": 5536 + }, + { + "epoch": 0.9552316052790477, + "grad_norm": 0.6328125, + "learning_rate": 1.0772243987072838e-05, + "loss": 1.4805, + "step": 5537 + }, + { + "epoch": 0.9554041231777797, + "grad_norm": 0.625, + "learning_rate": 1.076953020782211e-05, + "loss": 1.4201, + "step": 5538 + }, + { + "epoch": 0.9555766410765116, + "grad_norm": 0.60546875, + "learning_rate": 1.0766816371559776e-05, + "loss": 1.4411, + "step": 5539 + }, + { + "epoch": 0.9557491589752437, + "grad_norm": 0.60546875, + "learning_rate": 1.0764102478486894e-05, + "loss": 1.4425, + "step": 5540 + }, + { + "epoch": 0.9559216768739757, + "grad_norm": 0.68359375, + "learning_rate": 1.0761388528804524e-05, + "loss": 1.488, + "step": 5541 + }, + { + "epoch": 0.9560941947727076, + "grad_norm": 0.625, + "learning_rate": 1.0758674522713737e-05, + "loss": 1.4224, + "step": 5542 + }, + { + "epoch": 0.9562667126714397, + "grad_norm": 0.5390625, + "learning_rate": 1.07559604604156e-05, + "loss": 1.4734, + "step": 5543 + }, + { + "epoch": 0.9564392305701717, + "grad_norm": 0.625, + "learning_rate": 1.075324634211119e-05, + "loss": 1.3959, + "step": 5544 + }, + { + "epoch": 0.9566117484689036, + "grad_norm": 0.61328125, + "learning_rate": 1.0750532168001581e-05, + "loss": 1.5127, + "step": 5545 + }, + { + "epoch": 0.9567842663676357, + "grad_norm": 0.59765625, + "learning_rate": 1.0747817938287866e-05, + "loss": 1.5154, + "step": 5546 + }, + { + "epoch": 0.9569567842663677, + "grad_norm": 0.734375, + "learning_rate": 1.074510365317112e-05, + "loss": 1.3529, + "step": 5547 + }, + { + "epoch": 0.9571293021650996, + "grad_norm": 0.671875, + "learning_rate": 1.0742389312852441e-05, + "loss": 1.3786, + "step": 5548 + }, + { + "epoch": 0.9573018200638316, + "grad_norm": 0.734375, + "learning_rate": 1.0739674917532923e-05, + "loss": 1.3981, + "step": 5549 + }, + { + "epoch": 0.9574743379625636, + "grad_norm": 0.67578125, + "learning_rate": 1.0736960467413668e-05, + "loss": 1.4618, + "step": 5550 + }, + { + "epoch": 0.9576468558612956, + "grad_norm": 0.60546875, + "learning_rate": 1.0734245962695775e-05, + "loss": 1.3747, + "step": 5551 + }, + { + "epoch": 0.9578193737600276, + "grad_norm": 0.62890625, + "learning_rate": 1.073153140358035e-05, + "loss": 1.3701, + "step": 5552 + }, + { + "epoch": 0.9579918916587596, + "grad_norm": 0.57421875, + "learning_rate": 1.0728816790268513e-05, + "loss": 1.4192, + "step": 5553 + }, + { + "epoch": 0.9581644095574916, + "grad_norm": 0.6171875, + "learning_rate": 1.0726102122961373e-05, + "loss": 1.4915, + "step": 5554 + }, + { + "epoch": 0.9583369274562236, + "grad_norm": 0.67578125, + "learning_rate": 1.0723387401860048e-05, + "loss": 1.4411, + "step": 5555 + }, + { + "epoch": 0.9585094453549555, + "grad_norm": 0.83203125, + "learning_rate": 1.0720672627165665e-05, + "loss": 1.4482, + "step": 5556 + }, + { + "epoch": 0.9586819632536876, + "grad_norm": 0.62890625, + "learning_rate": 1.071795779907935e-05, + "loss": 1.4209, + "step": 5557 + }, + { + "epoch": 0.9588544811524196, + "grad_norm": 0.61328125, + "learning_rate": 1.071524291780224e-05, + "loss": 1.4751, + "step": 5558 + }, + { + "epoch": 0.9590269990511515, + "grad_norm": 0.6875, + "learning_rate": 1.0712527983535463e-05, + "loss": 1.4628, + "step": 5559 + }, + { + "epoch": 0.9591995169498836, + "grad_norm": 0.546875, + "learning_rate": 1.070981299648016e-05, + "loss": 1.3229, + "step": 5560 + }, + { + "epoch": 0.9593720348486156, + "grad_norm": 0.640625, + "learning_rate": 1.0707097956837475e-05, + "loss": 1.3905, + "step": 5561 + }, + { + "epoch": 0.9595445527473475, + "grad_norm": 0.57421875, + "learning_rate": 1.0704382864808558e-05, + "loss": 1.4366, + "step": 5562 + }, + { + "epoch": 0.9597170706460796, + "grad_norm": 0.68359375, + "learning_rate": 1.0701667720594555e-05, + "loss": 1.4201, + "step": 5563 + }, + { + "epoch": 0.9598895885448115, + "grad_norm": 0.6328125, + "learning_rate": 1.0698952524396621e-05, + "loss": 1.4332, + "step": 5564 + }, + { + "epoch": 0.9600621064435435, + "grad_norm": 0.640625, + "learning_rate": 1.069623727641592e-05, + "loss": 1.5507, + "step": 5565 + }, + { + "epoch": 0.9602346243422755, + "grad_norm": 0.67578125, + "learning_rate": 1.0693521976853612e-05, + "loss": 1.4058, + "step": 5566 + }, + { + "epoch": 0.9604071422410075, + "grad_norm": 0.59765625, + "learning_rate": 1.0690806625910862e-05, + "loss": 1.5297, + "step": 5567 + }, + { + "epoch": 0.9605796601397395, + "grad_norm": 0.62890625, + "learning_rate": 1.068809122378884e-05, + "loss": 1.5256, + "step": 5568 + }, + { + "epoch": 0.9607521780384715, + "grad_norm": 0.578125, + "learning_rate": 1.0685375770688717e-05, + "loss": 1.4706, + "step": 5569 + }, + { + "epoch": 0.9609246959372035, + "grad_norm": 0.63671875, + "learning_rate": 1.068266026681168e-05, + "loss": 1.4291, + "step": 5570 + }, + { + "epoch": 0.9610972138359355, + "grad_norm": 0.6953125, + "learning_rate": 1.0679944712358903e-05, + "loss": 1.3866, + "step": 5571 + }, + { + "epoch": 0.9612697317346675, + "grad_norm": 0.6484375, + "learning_rate": 1.067722910753157e-05, + "loss": 1.4339, + "step": 5572 + }, + { + "epoch": 0.9614422496333994, + "grad_norm": 0.58984375, + "learning_rate": 1.0674513452530877e-05, + "loss": 1.3447, + "step": 5573 + }, + { + "epoch": 0.9616147675321315, + "grad_norm": 0.60546875, + "learning_rate": 1.0671797747558013e-05, + "loss": 1.4081, + "step": 5574 + }, + { + "epoch": 0.9617872854308634, + "grad_norm": 0.625, + "learning_rate": 1.0669081992814174e-05, + "loss": 1.4666, + "step": 5575 + }, + { + "epoch": 0.9619598033295954, + "grad_norm": 0.58203125, + "learning_rate": 1.0666366188500559e-05, + "loss": 1.3793, + "step": 5576 + }, + { + "epoch": 0.9621323212283275, + "grad_norm": 0.73828125, + "learning_rate": 1.0663650334818374e-05, + "loss": 1.4518, + "step": 5577 + }, + { + "epoch": 0.9623048391270594, + "grad_norm": 0.69921875, + "learning_rate": 1.0660934431968829e-05, + "loss": 1.4876, + "step": 5578 + }, + { + "epoch": 0.9624773570257914, + "grad_norm": 0.64453125, + "learning_rate": 1.0658218480153127e-05, + "loss": 1.3871, + "step": 5579 + }, + { + "epoch": 0.9626498749245235, + "grad_norm": 0.671875, + "learning_rate": 1.0655502479572486e-05, + "loss": 1.4577, + "step": 5580 + }, + { + "epoch": 0.9628223928232554, + "grad_norm": 0.81640625, + "learning_rate": 1.0652786430428128e-05, + "loss": 1.3674, + "step": 5581 + }, + { + "epoch": 0.9629949107219874, + "grad_norm": 0.75, + "learning_rate": 1.0650070332921275e-05, + "loss": 1.4384, + "step": 5582 + }, + { + "epoch": 0.9631674286207194, + "grad_norm": 0.59375, + "learning_rate": 1.0647354187253145e-05, + "loss": 1.3278, + "step": 5583 + }, + { + "epoch": 0.9633399465194514, + "grad_norm": 0.76171875, + "learning_rate": 1.0644637993624973e-05, + "loss": 1.4714, + "step": 5584 + }, + { + "epoch": 0.9635124644181834, + "grad_norm": 0.7734375, + "learning_rate": 1.0641921752237992e-05, + "loss": 1.4265, + "step": 5585 + }, + { + "epoch": 0.9636849823169154, + "grad_norm": 0.72265625, + "learning_rate": 1.0639205463293436e-05, + "loss": 1.4047, + "step": 5586 + }, + { + "epoch": 0.9638575002156474, + "grad_norm": 0.76953125, + "learning_rate": 1.0636489126992548e-05, + "loss": 1.4471, + "step": 5587 + }, + { + "epoch": 0.9640300181143794, + "grad_norm": 0.671875, + "learning_rate": 1.0633772743536563e-05, + "loss": 1.438, + "step": 5588 + }, + { + "epoch": 0.9642025360131113, + "grad_norm": 0.625, + "learning_rate": 1.0631056313126736e-05, + "loss": 1.4633, + "step": 5589 + }, + { + "epoch": 0.9643750539118433, + "grad_norm": 0.7578125, + "learning_rate": 1.0628339835964317e-05, + "loss": 1.3694, + "step": 5590 + }, + { + "epoch": 0.9645475718105754, + "grad_norm": 0.734375, + "learning_rate": 1.0625623312250554e-05, + "loss": 1.4391, + "step": 5591 + }, + { + "epoch": 0.9647200897093073, + "grad_norm": 0.6484375, + "learning_rate": 1.0622906742186707e-05, + "loss": 1.409, + "step": 5592 + }, + { + "epoch": 0.9648926076080393, + "grad_norm": 0.78515625, + "learning_rate": 1.0620190125974036e-05, + "loss": 1.5127, + "step": 5593 + }, + { + "epoch": 0.9650651255067714, + "grad_norm": 0.7265625, + "learning_rate": 1.0617473463813808e-05, + "loss": 1.4054, + "step": 5594 + }, + { + "epoch": 0.9652376434055033, + "grad_norm": 3.453125, + "learning_rate": 1.0614756755907284e-05, + "loss": 1.462, + "step": 5595 + }, + { + "epoch": 0.9654101613042353, + "grad_norm": 0.7734375, + "learning_rate": 1.0612040002455742e-05, + "loss": 1.403, + "step": 5596 + }, + { + "epoch": 0.9655826792029673, + "grad_norm": 0.78125, + "learning_rate": 1.0609323203660451e-05, + "loss": 1.4555, + "step": 5597 + }, + { + "epoch": 0.9657551971016993, + "grad_norm": 0.5703125, + "learning_rate": 1.0606606359722691e-05, + "loss": 1.3381, + "step": 5598 + }, + { + "epoch": 0.9659277150004313, + "grad_norm": 0.6015625, + "learning_rate": 1.060388947084374e-05, + "loss": 1.2738, + "step": 5599 + }, + { + "epoch": 0.9661002328991632, + "grad_norm": 0.73828125, + "learning_rate": 1.0601172537224881e-05, + "loss": 1.3784, + "step": 5600 + }, + { + "epoch": 0.9661002328991632, + "eval_loss": 1.411436915397644, + "eval_runtime": 10.8561, + "eval_samples_per_second": 94.325, + "eval_steps_per_second": 23.581, + "step": 5600 + }, + { + "epoch": 0.9662727507978953, + "grad_norm": 1.171875, + "learning_rate": 1.0598455559067409e-05, + "loss": 1.3455, + "step": 5601 + }, + { + "epoch": 0.9664452686966273, + "grad_norm": 0.65625, + "learning_rate": 1.0595738536572607e-05, + "loss": 1.5216, + "step": 5602 + }, + { + "epoch": 0.9666177865953592, + "grad_norm": 0.69921875, + "learning_rate": 1.0593021469941773e-05, + "loss": 1.4521, + "step": 5603 + }, + { + "epoch": 0.9667903044940913, + "grad_norm": 0.65625, + "learning_rate": 1.0590304359376202e-05, + "loss": 1.3953, + "step": 5604 + }, + { + "epoch": 0.9669628223928233, + "grad_norm": 0.74609375, + "learning_rate": 1.0587587205077196e-05, + "loss": 1.4433, + "step": 5605 + }, + { + "epoch": 0.9671353402915552, + "grad_norm": 0.7578125, + "learning_rate": 1.0584870007246059e-05, + "loss": 1.4629, + "step": 5606 + }, + { + "epoch": 0.9673078581902872, + "grad_norm": 0.63671875, + "learning_rate": 1.05821527660841e-05, + "loss": 1.4307, + "step": 5607 + }, + { + "epoch": 0.9674803760890193, + "grad_norm": 0.59375, + "learning_rate": 1.0579435481792621e-05, + "loss": 1.3582, + "step": 5608 + }, + { + "epoch": 0.9676528939877512, + "grad_norm": 0.69140625, + "learning_rate": 1.0576718154572944e-05, + "loss": 1.372, + "step": 5609 + }, + { + "epoch": 0.9678254118864832, + "grad_norm": 0.6796875, + "learning_rate": 1.0574000784626386e-05, + "loss": 1.4447, + "step": 5610 + }, + { + "epoch": 0.9679979297852153, + "grad_norm": 0.6640625, + "learning_rate": 1.057128337215426e-05, + "loss": 1.4734, + "step": 5611 + }, + { + "epoch": 0.9681704476839472, + "grad_norm": 0.6328125, + "learning_rate": 1.0568565917357892e-05, + "loss": 1.4525, + "step": 5612 + }, + { + "epoch": 0.9683429655826792, + "grad_norm": 0.60546875, + "learning_rate": 1.0565848420438608e-05, + "loss": 1.4177, + "step": 5613 + }, + { + "epoch": 0.9685154834814111, + "grad_norm": 0.5703125, + "learning_rate": 1.0563130881597739e-05, + "loss": 1.3963, + "step": 5614 + }, + { + "epoch": 0.9686880013801432, + "grad_norm": 0.64453125, + "learning_rate": 1.0560413301036614e-05, + "loss": 1.3608, + "step": 5615 + }, + { + "epoch": 0.9688605192788752, + "grad_norm": 0.65234375, + "learning_rate": 1.0557695678956569e-05, + "loss": 1.4067, + "step": 5616 + }, + { + "epoch": 0.9690330371776071, + "grad_norm": 0.91015625, + "learning_rate": 1.0554978015558946e-05, + "loss": 1.4527, + "step": 5617 + }, + { + "epoch": 0.9692055550763392, + "grad_norm": 0.59765625, + "learning_rate": 1.0552260311045082e-05, + "loss": 1.4399, + "step": 5618 + }, + { + "epoch": 0.9693780729750712, + "grad_norm": 0.6015625, + "learning_rate": 1.0549542565616326e-05, + "loss": 1.5231, + "step": 5619 + }, + { + "epoch": 0.9695505908738031, + "grad_norm": 0.5703125, + "learning_rate": 1.0546824779474022e-05, + "loss": 1.4008, + "step": 5620 + }, + { + "epoch": 0.9697231087725352, + "grad_norm": 0.65625, + "learning_rate": 1.054410695281952e-05, + "loss": 1.4296, + "step": 5621 + }, + { + "epoch": 0.9698956266712672, + "grad_norm": 0.65625, + "learning_rate": 1.0541389085854177e-05, + "loss": 1.4242, + "step": 5622 + }, + { + "epoch": 0.9700681445699991, + "grad_norm": 0.62109375, + "learning_rate": 1.0538671178779346e-05, + "loss": 1.3831, + "step": 5623 + }, + { + "epoch": 0.9702406624687311, + "grad_norm": 0.6484375, + "learning_rate": 1.053595323179639e-05, + "loss": 1.3795, + "step": 5624 + }, + { + "epoch": 0.9704131803674632, + "grad_norm": 0.58203125, + "learning_rate": 1.0533235245106668e-05, + "loss": 1.3784, + "step": 5625 + }, + { + "epoch": 0.9705856982661951, + "grad_norm": 0.56640625, + "learning_rate": 1.053051721891155e-05, + "loss": 1.4076, + "step": 5626 + }, + { + "epoch": 0.9707582161649271, + "grad_norm": 0.59765625, + "learning_rate": 1.0527799153412402e-05, + "loss": 1.3815, + "step": 5627 + }, + { + "epoch": 0.9709307340636592, + "grad_norm": 0.6015625, + "learning_rate": 1.052508104881059e-05, + "loss": 1.4889, + "step": 5628 + }, + { + "epoch": 0.9711032519623911, + "grad_norm": 0.671875, + "learning_rate": 1.0522362905307497e-05, + "loss": 1.4448, + "step": 5629 + }, + { + "epoch": 0.9712757698611231, + "grad_norm": 0.64453125, + "learning_rate": 1.0519644723104494e-05, + "loss": 1.3009, + "step": 5630 + }, + { + "epoch": 0.971448287759855, + "grad_norm": 0.640625, + "learning_rate": 1.0516926502402966e-05, + "loss": 1.5176, + "step": 5631 + }, + { + "epoch": 0.9716208056585871, + "grad_norm": 0.61328125, + "learning_rate": 1.0514208243404291e-05, + "loss": 1.4149, + "step": 5632 + }, + { + "epoch": 0.9717933235573191, + "grad_norm": 0.578125, + "learning_rate": 1.0511489946309856e-05, + "loss": 1.4443, + "step": 5633 + }, + { + "epoch": 0.971965841456051, + "grad_norm": 0.5859375, + "learning_rate": 1.050877161132105e-05, + "loss": 1.4321, + "step": 5634 + }, + { + "epoch": 0.9721383593547831, + "grad_norm": 0.609375, + "learning_rate": 1.0506053238639267e-05, + "loss": 1.4988, + "step": 5635 + }, + { + "epoch": 0.9723108772535151, + "grad_norm": 0.58984375, + "learning_rate": 1.0503334828465895e-05, + "loss": 1.4718, + "step": 5636 + }, + { + "epoch": 0.972483395152247, + "grad_norm": 0.58984375, + "learning_rate": 1.0500616381002331e-05, + "loss": 1.3965, + "step": 5637 + }, + { + "epoch": 0.9726559130509791, + "grad_norm": 0.58984375, + "learning_rate": 1.0497897896449983e-05, + "loss": 1.5349, + "step": 5638 + }, + { + "epoch": 0.972828430949711, + "grad_norm": 0.62890625, + "learning_rate": 1.0495179375010244e-05, + "loss": 1.3807, + "step": 5639 + }, + { + "epoch": 0.973000948848443, + "grad_norm": 0.66015625, + "learning_rate": 1.0492460816884524e-05, + "loss": 1.5651, + "step": 5640 + }, + { + "epoch": 0.973173466747175, + "grad_norm": 0.87890625, + "learning_rate": 1.0489742222274231e-05, + "loss": 1.3553, + "step": 5641 + }, + { + "epoch": 0.973345984645907, + "grad_norm": 0.546875, + "learning_rate": 1.0487023591380775e-05, + "loss": 1.4698, + "step": 5642 + }, + { + "epoch": 0.973518502544639, + "grad_norm": 0.59765625, + "learning_rate": 1.0484304924405566e-05, + "loss": 1.4298, + "step": 5643 + }, + { + "epoch": 0.973691020443371, + "grad_norm": 6.875, + "learning_rate": 1.0481586221550022e-05, + "loss": 1.5664, + "step": 5644 + }, + { + "epoch": 0.973863538342103, + "grad_norm": 0.58984375, + "learning_rate": 1.0478867483015563e-05, + "loss": 1.4564, + "step": 5645 + }, + { + "epoch": 0.974036056240835, + "grad_norm": 1.9765625, + "learning_rate": 1.0476148709003607e-05, + "loss": 1.3828, + "step": 5646 + }, + { + "epoch": 0.974208574139567, + "grad_norm": 0.58984375, + "learning_rate": 1.047342989971558e-05, + "loss": 1.4288, + "step": 5647 + }, + { + "epoch": 0.9743810920382989, + "grad_norm": 0.60546875, + "learning_rate": 1.0470711055352912e-05, + "loss": 1.5092, + "step": 5648 + }, + { + "epoch": 0.974553609937031, + "grad_norm": 0.56640625, + "learning_rate": 1.0467992176117024e-05, + "loss": 1.4224, + "step": 5649 + }, + { + "epoch": 0.974726127835763, + "grad_norm": 0.6796875, + "learning_rate": 1.046527326220935e-05, + "loss": 1.5092, + "step": 5650 + }, + { + "epoch": 0.9748986457344949, + "grad_norm": 0.6328125, + "learning_rate": 1.0462554313831327e-05, + "loss": 1.459, + "step": 5651 + }, + { + "epoch": 0.975071163633227, + "grad_norm": 0.58203125, + "learning_rate": 1.0459835331184392e-05, + "loss": 1.413, + "step": 5652 + }, + { + "epoch": 0.975243681531959, + "grad_norm": 0.63671875, + "learning_rate": 1.045711631446998e-05, + "loss": 1.3732, + "step": 5653 + }, + { + "epoch": 0.9754161994306909, + "grad_norm": 0.57421875, + "learning_rate": 1.0454397263889538e-05, + "loss": 1.4446, + "step": 5654 + }, + { + "epoch": 0.975588717329423, + "grad_norm": 0.6796875, + "learning_rate": 1.0451678179644504e-05, + "loss": 1.4371, + "step": 5655 + }, + { + "epoch": 0.9757612352281549, + "grad_norm": 0.75390625, + "learning_rate": 1.044895906193633e-05, + "loss": 1.5077, + "step": 5656 + }, + { + "epoch": 0.9759337531268869, + "grad_norm": 0.59765625, + "learning_rate": 1.0446239910966462e-05, + "loss": 1.4009, + "step": 5657 + }, + { + "epoch": 0.9761062710256189, + "grad_norm": 0.66796875, + "learning_rate": 1.0443520726936356e-05, + "loss": 1.4608, + "step": 5658 + }, + { + "epoch": 0.9762787889243509, + "grad_norm": 0.640625, + "learning_rate": 1.0440801510047462e-05, + "loss": 1.4192, + "step": 5659 + }, + { + "epoch": 0.9764513068230829, + "grad_norm": 0.6171875, + "learning_rate": 1.0438082260501239e-05, + "loss": 1.4076, + "step": 5660 + }, + { + "epoch": 0.9766238247218149, + "grad_norm": 0.67578125, + "learning_rate": 1.0435362978499143e-05, + "loss": 1.3499, + "step": 5661 + }, + { + "epoch": 0.9767963426205469, + "grad_norm": 0.63671875, + "learning_rate": 1.043264366424264e-05, + "loss": 1.4749, + "step": 5662 + }, + { + "epoch": 0.9769688605192789, + "grad_norm": 0.5859375, + "learning_rate": 1.0429924317933189e-05, + "loss": 1.3881, + "step": 5663 + }, + { + "epoch": 0.9771413784180109, + "grad_norm": 0.6015625, + "learning_rate": 1.0427204939772257e-05, + "loss": 1.4782, + "step": 5664 + }, + { + "epoch": 0.9773138963167428, + "grad_norm": 0.60546875, + "learning_rate": 1.0424485529961314e-05, + "loss": 1.4459, + "step": 5665 + }, + { + "epoch": 0.9774864142154749, + "grad_norm": 0.625, + "learning_rate": 1.0421766088701832e-05, + "loss": 1.3883, + "step": 5666 + }, + { + "epoch": 0.9776589321142068, + "grad_norm": 0.65625, + "learning_rate": 1.0419046616195285e-05, + "loss": 1.5965, + "step": 5667 + }, + { + "epoch": 0.9778314500129388, + "grad_norm": 0.609375, + "learning_rate": 1.0416327112643143e-05, + "loss": 1.3919, + "step": 5668 + }, + { + "epoch": 0.9780039679116709, + "grad_norm": 0.62109375, + "learning_rate": 1.0413607578246886e-05, + "loss": 1.4496, + "step": 5669 + }, + { + "epoch": 0.9781764858104028, + "grad_norm": 0.55859375, + "learning_rate": 1.0410888013208001e-05, + "loss": 1.4421, + "step": 5670 + }, + { + "epoch": 0.9783490037091348, + "grad_norm": 0.7578125, + "learning_rate": 1.040816841772796e-05, + "loss": 1.5079, + "step": 5671 + }, + { + "epoch": 0.9785215216078668, + "grad_norm": 0.6953125, + "learning_rate": 1.0405448792008252e-05, + "loss": 1.4715, + "step": 5672 + }, + { + "epoch": 0.9786940395065988, + "grad_norm": 0.58203125, + "learning_rate": 1.0402729136250364e-05, + "loss": 1.4881, + "step": 5673 + }, + { + "epoch": 0.9788665574053308, + "grad_norm": 0.64453125, + "learning_rate": 1.0400009450655789e-05, + "loss": 1.3597, + "step": 5674 + }, + { + "epoch": 0.9790390753040628, + "grad_norm": 0.67578125, + "learning_rate": 1.0397289735426012e-05, + "loss": 1.5159, + "step": 5675 + }, + { + "epoch": 0.9792115932027948, + "grad_norm": 0.6796875, + "learning_rate": 1.0394569990762528e-05, + "loss": 1.4996, + "step": 5676 + }, + { + "epoch": 0.9793841111015268, + "grad_norm": 0.68359375, + "learning_rate": 1.0391850216866834e-05, + "loss": 1.4435, + "step": 5677 + }, + { + "epoch": 0.9795566290002587, + "grad_norm": 0.58984375, + "learning_rate": 1.038913041394043e-05, + "loss": 1.335, + "step": 5678 + }, + { + "epoch": 0.9797291468989908, + "grad_norm": 0.609375, + "learning_rate": 1.0386410582184813e-05, + "loss": 1.3793, + "step": 5679 + }, + { + "epoch": 0.9799016647977228, + "grad_norm": 0.7109375, + "learning_rate": 1.0383690721801485e-05, + "loss": 1.4007, + "step": 5680 + }, + { + "epoch": 0.9800741826964547, + "grad_norm": 0.62109375, + "learning_rate": 1.038097083299195e-05, + "loss": 1.3936, + "step": 5681 + }, + { + "epoch": 0.9802467005951867, + "grad_norm": 0.609375, + "learning_rate": 1.0378250915957716e-05, + "loss": 1.5647, + "step": 5682 + }, + { + "epoch": 0.9804192184939188, + "grad_norm": 0.5703125, + "learning_rate": 1.0375530970900292e-05, + "loss": 1.3682, + "step": 5683 + }, + { + "epoch": 0.9805917363926507, + "grad_norm": 0.62890625, + "learning_rate": 1.0372810998021185e-05, + "loss": 1.4399, + "step": 5684 + }, + { + "epoch": 0.9807642542913827, + "grad_norm": 0.6484375, + "learning_rate": 1.0370090997521906e-05, + "loss": 1.4918, + "step": 5685 + }, + { + "epoch": 0.9809367721901148, + "grad_norm": 0.80078125, + "learning_rate": 1.0367370969603981e-05, + "loss": 1.4271, + "step": 5686 + }, + { + "epoch": 0.9811092900888467, + "grad_norm": 0.60546875, + "learning_rate": 1.0364650914468917e-05, + "loss": 1.4676, + "step": 5687 + }, + { + "epoch": 0.9812818079875787, + "grad_norm": 0.6015625, + "learning_rate": 1.0361930832318232e-05, + "loss": 1.3725, + "step": 5688 + }, + { + "epoch": 0.9814543258863107, + "grad_norm": 0.65625, + "learning_rate": 1.0359210723353451e-05, + "loss": 1.3957, + "step": 5689 + }, + { + "epoch": 0.9816268437850427, + "grad_norm": 1.25, + "learning_rate": 1.0356490587776095e-05, + "loss": 1.4496, + "step": 5690 + }, + { + "epoch": 0.9817993616837747, + "grad_norm": 0.71484375, + "learning_rate": 1.0353770425787693e-05, + "loss": 1.4051, + "step": 5691 + }, + { + "epoch": 0.9819718795825066, + "grad_norm": 0.63671875, + "learning_rate": 1.0351050237589763e-05, + "loss": 1.4122, + "step": 5692 + }, + { + "epoch": 0.9821443974812387, + "grad_norm": 0.640625, + "learning_rate": 1.034833002338384e-05, + "loss": 1.4373, + "step": 5693 + }, + { + "epoch": 0.9823169153799707, + "grad_norm": 0.60546875, + "learning_rate": 1.0345609783371448e-05, + "loss": 1.4795, + "step": 5694 + }, + { + "epoch": 0.9824894332787026, + "grad_norm": 0.78125, + "learning_rate": 1.0342889517754131e-05, + "loss": 1.4264, + "step": 5695 + }, + { + "epoch": 0.9826619511774347, + "grad_norm": 0.58984375, + "learning_rate": 1.034016922673341e-05, + "loss": 1.3468, + "step": 5696 + }, + { + "epoch": 0.9828344690761667, + "grad_norm": 0.59765625, + "learning_rate": 1.033744891051083e-05, + "loss": 1.3907, + "step": 5697 + }, + { + "epoch": 0.9830069869748986, + "grad_norm": 0.63671875, + "learning_rate": 1.0334728569287924e-05, + "loss": 1.4186, + "step": 5698 + }, + { + "epoch": 0.9831795048736306, + "grad_norm": 0.5859375, + "learning_rate": 1.0332008203266237e-05, + "loss": 1.4321, + "step": 5699 + }, + { + "epoch": 0.9833520227723627, + "grad_norm": 0.609375, + "learning_rate": 1.0329287812647307e-05, + "loss": 1.3987, + "step": 5700 + }, + { + "epoch": 0.9833520227723627, + "eval_loss": 1.4110112190246582, + "eval_runtime": 10.8773, + "eval_samples_per_second": 94.141, + "eval_steps_per_second": 23.535, + "step": 5700 + }, + { + "epoch": 0.9835245406710946, + "grad_norm": 0.65234375, + "learning_rate": 1.0326567397632675e-05, + "loss": 1.449, + "step": 5701 + }, + { + "epoch": 0.9836970585698266, + "grad_norm": 0.7890625, + "learning_rate": 1.0323846958423894e-05, + "loss": 1.4914, + "step": 5702 + }, + { + "epoch": 0.9838695764685587, + "grad_norm": 0.55859375, + "learning_rate": 1.0321126495222505e-05, + "loss": 1.4824, + "step": 5703 + }, + { + "epoch": 0.9840420943672906, + "grad_norm": 0.58984375, + "learning_rate": 1.031840600823006e-05, + "loss": 1.4821, + "step": 5704 + }, + { + "epoch": 0.9842146122660226, + "grad_norm": 0.5859375, + "learning_rate": 1.0315685497648107e-05, + "loss": 1.4482, + "step": 5705 + }, + { + "epoch": 0.9843871301647545, + "grad_norm": 0.55078125, + "learning_rate": 1.0312964963678198e-05, + "loss": 1.5063, + "step": 5706 + }, + { + "epoch": 0.9845596480634866, + "grad_norm": 0.578125, + "learning_rate": 1.0310244406521895e-05, + "loss": 1.4312, + "step": 5707 + }, + { + "epoch": 0.9847321659622186, + "grad_norm": 0.59375, + "learning_rate": 1.0307523826380743e-05, + "loss": 1.4777, + "step": 5708 + }, + { + "epoch": 0.9849046838609505, + "grad_norm": 0.671875, + "learning_rate": 1.0304803223456305e-05, + "loss": 1.4131, + "step": 5709 + }, + { + "epoch": 0.9850772017596826, + "grad_norm": 0.62109375, + "learning_rate": 1.0302082597950141e-05, + "loss": 1.4888, + "step": 5710 + }, + { + "epoch": 0.9852497196584146, + "grad_norm": 0.69140625, + "learning_rate": 1.029936195006381e-05, + "loss": 1.5442, + "step": 5711 + }, + { + "epoch": 0.9854222375571465, + "grad_norm": 0.6328125, + "learning_rate": 1.0296641279998876e-05, + "loss": 1.4095, + "step": 5712 + }, + { + "epoch": 0.9855947554558786, + "grad_norm": 0.6875, + "learning_rate": 1.02939205879569e-05, + "loss": 1.4457, + "step": 5713 + }, + { + "epoch": 0.9857672733546106, + "grad_norm": 0.68359375, + "learning_rate": 1.0291199874139453e-05, + "loss": 1.4679, + "step": 5714 + }, + { + "epoch": 0.9859397912533425, + "grad_norm": 0.65625, + "learning_rate": 1.02884791387481e-05, + "loss": 1.4651, + "step": 5715 + }, + { + "epoch": 0.9861123091520745, + "grad_norm": 0.6015625, + "learning_rate": 1.0285758381984408e-05, + "loss": 1.5139, + "step": 5716 + }, + { + "epoch": 0.9862848270508066, + "grad_norm": 0.75, + "learning_rate": 1.0283037604049948e-05, + "loss": 1.4568, + "step": 5717 + }, + { + "epoch": 0.9864573449495385, + "grad_norm": 0.671875, + "learning_rate": 1.0280316805146295e-05, + "loss": 1.4287, + "step": 5718 + }, + { + "epoch": 0.9866298628482705, + "grad_norm": 0.63671875, + "learning_rate": 1.0277595985475024e-05, + "loss": 1.5319, + "step": 5719 + }, + { + "epoch": 0.9868023807470025, + "grad_norm": 0.64453125, + "learning_rate": 1.0274875145237706e-05, + "loss": 1.4455, + "step": 5720 + }, + { + "epoch": 0.9869748986457345, + "grad_norm": 0.62890625, + "learning_rate": 1.027215428463592e-05, + "loss": 1.4713, + "step": 5721 + }, + { + "epoch": 0.9871474165444665, + "grad_norm": 0.65234375, + "learning_rate": 1.0269433403871246e-05, + "loss": 1.4784, + "step": 5722 + }, + { + "epoch": 0.9873199344431984, + "grad_norm": 0.609375, + "learning_rate": 1.0266712503145262e-05, + "loss": 1.5312, + "step": 5723 + }, + { + "epoch": 0.9874924523419305, + "grad_norm": 0.66796875, + "learning_rate": 1.0263991582659547e-05, + "loss": 1.5036, + "step": 5724 + }, + { + "epoch": 0.9876649702406625, + "grad_norm": 0.65234375, + "learning_rate": 1.0261270642615687e-05, + "loss": 1.3068, + "step": 5725 + }, + { + "epoch": 0.9878374881393944, + "grad_norm": 0.609375, + "learning_rate": 1.0258549683215262e-05, + "loss": 1.4302, + "step": 5726 + }, + { + "epoch": 0.9880100060381265, + "grad_norm": 0.796875, + "learning_rate": 1.0255828704659868e-05, + "loss": 1.4892, + "step": 5727 + }, + { + "epoch": 0.9881825239368585, + "grad_norm": 0.796875, + "learning_rate": 1.025310770715108e-05, + "loss": 1.4428, + "step": 5728 + }, + { + "epoch": 0.9883550418355904, + "grad_norm": 0.6015625, + "learning_rate": 1.0250386690890493e-05, + "loss": 1.4245, + "step": 5729 + }, + { + "epoch": 0.9885275597343224, + "grad_norm": 0.64453125, + "learning_rate": 1.0247665656079692e-05, + "loss": 1.4714, + "step": 5730 + }, + { + "epoch": 0.9887000776330545, + "grad_norm": 0.6015625, + "learning_rate": 1.0244944602920277e-05, + "loss": 1.3732, + "step": 5731 + }, + { + "epoch": 0.9888725955317864, + "grad_norm": 0.609375, + "learning_rate": 1.0242223531613834e-05, + "loss": 1.3682, + "step": 5732 + }, + { + "epoch": 0.9890451134305184, + "grad_norm": 0.64453125, + "learning_rate": 1.0239502442361955e-05, + "loss": 1.433, + "step": 5733 + }, + { + "epoch": 0.9892176313292504, + "grad_norm": 0.64453125, + "learning_rate": 1.0236781335366239e-05, + "loss": 1.4126, + "step": 5734 + }, + { + "epoch": 0.9893901492279824, + "grad_norm": 0.6015625, + "learning_rate": 1.0234060210828288e-05, + "loss": 1.3825, + "step": 5735 + }, + { + "epoch": 0.9895626671267144, + "grad_norm": 0.76171875, + "learning_rate": 1.0231339068949688e-05, + "loss": 1.4591, + "step": 5736 + }, + { + "epoch": 0.9897351850254464, + "grad_norm": 0.734375, + "learning_rate": 1.0228617909932046e-05, + "loss": 1.3771, + "step": 5737 + }, + { + "epoch": 0.9899077029241784, + "grad_norm": 0.57421875, + "learning_rate": 1.0225896733976962e-05, + "loss": 1.4394, + "step": 5738 + }, + { + "epoch": 0.9900802208229104, + "grad_norm": 0.59765625, + "learning_rate": 1.0223175541286036e-05, + "loss": 1.4656, + "step": 5739 + }, + { + "epoch": 0.9902527387216423, + "grad_norm": 0.6953125, + "learning_rate": 1.0220454332060872e-05, + "loss": 1.5557, + "step": 5740 + }, + { + "epoch": 0.9904252566203744, + "grad_norm": 0.5859375, + "learning_rate": 1.0217733106503072e-05, + "loss": 1.5004, + "step": 5741 + }, + { + "epoch": 0.9905977745191064, + "grad_norm": 0.58203125, + "learning_rate": 1.0215011864814244e-05, + "loss": 1.4148, + "step": 5742 + }, + { + "epoch": 0.9907702924178383, + "grad_norm": 0.609375, + "learning_rate": 1.0212290607195997e-05, + "loss": 1.4913, + "step": 5743 + }, + { + "epoch": 0.9909428103165704, + "grad_norm": 0.7265625, + "learning_rate": 1.020956933384993e-05, + "loss": 1.4065, + "step": 5744 + }, + { + "epoch": 0.9911153282153023, + "grad_norm": 0.6484375, + "learning_rate": 1.0206848044977658e-05, + "loss": 1.4257, + "step": 5745 + }, + { + "epoch": 0.9912878461140343, + "grad_norm": 0.66796875, + "learning_rate": 1.0204126740780791e-05, + "loss": 1.4675, + "step": 5746 + }, + { + "epoch": 0.9914603640127663, + "grad_norm": 0.5703125, + "learning_rate": 1.0201405421460942e-05, + "loss": 1.508, + "step": 5747 + }, + { + "epoch": 0.9916328819114983, + "grad_norm": 0.58984375, + "learning_rate": 1.0198684087219718e-05, + "loss": 1.472, + "step": 5748 + }, + { + "epoch": 0.9918053998102303, + "grad_norm": 0.59765625, + "learning_rate": 1.0195962738258736e-05, + "loss": 1.4882, + "step": 5749 + }, + { + "epoch": 0.9919779177089623, + "grad_norm": 0.625, + "learning_rate": 1.0193241374779607e-05, + "loss": 1.4459, + "step": 5750 + }, + { + "epoch": 0.9921504356076943, + "grad_norm": 0.58984375, + "learning_rate": 1.0190519996983956e-05, + "loss": 1.4201, + "step": 5751 + }, + { + "epoch": 0.9923229535064263, + "grad_norm": 0.859375, + "learning_rate": 1.0187798605073389e-05, + "loss": 1.4262, + "step": 5752 + }, + { + "epoch": 0.9924954714051583, + "grad_norm": 0.69921875, + "learning_rate": 1.018507719924953e-05, + "loss": 1.3847, + "step": 5753 + }, + { + "epoch": 0.9926679893038903, + "grad_norm": 0.66796875, + "learning_rate": 1.0182355779713992e-05, + "loss": 1.4505, + "step": 5754 + }, + { + "epoch": 0.9928405072026223, + "grad_norm": 0.6171875, + "learning_rate": 1.0179634346668406e-05, + "loss": 1.4877, + "step": 5755 + }, + { + "epoch": 0.9930130251013543, + "grad_norm": 0.59375, + "learning_rate": 1.0176912900314378e-05, + "loss": 1.4688, + "step": 5756 + }, + { + "epoch": 0.9931855430000862, + "grad_norm": 0.94921875, + "learning_rate": 1.0174191440853541e-05, + "loss": 1.5153, + "step": 5757 + }, + { + "epoch": 0.9933580608988183, + "grad_norm": 0.6328125, + "learning_rate": 1.017146996848751e-05, + "loss": 1.5, + "step": 5758 + }, + { + "epoch": 0.9935305787975502, + "grad_norm": 0.71484375, + "learning_rate": 1.0168748483417916e-05, + "loss": 1.4298, + "step": 5759 + }, + { + "epoch": 0.9937030966962822, + "grad_norm": 0.5546875, + "learning_rate": 1.0166026985846377e-05, + "loss": 1.4473, + "step": 5760 + }, + { + "epoch": 0.9938756145950143, + "grad_norm": 0.66015625, + "learning_rate": 1.0163305475974523e-05, + "loss": 1.3173, + "step": 5761 + }, + { + "epoch": 0.9940481324937462, + "grad_norm": 0.6953125, + "learning_rate": 1.0160583954003978e-05, + "loss": 1.4909, + "step": 5762 + }, + { + "epoch": 0.9942206503924782, + "grad_norm": 0.6171875, + "learning_rate": 1.0157862420136371e-05, + "loss": 1.4453, + "step": 5763 + }, + { + "epoch": 0.9943931682912102, + "grad_norm": 0.94921875, + "learning_rate": 1.015514087457333e-05, + "loss": 1.5272, + "step": 5764 + }, + { + "epoch": 0.9945656861899422, + "grad_norm": 0.5859375, + "learning_rate": 1.0152419317516482e-05, + "loss": 1.4821, + "step": 5765 + }, + { + "epoch": 0.9947382040886742, + "grad_norm": 0.62109375, + "learning_rate": 1.0149697749167459e-05, + "loss": 1.3855, + "step": 5766 + }, + { + "epoch": 0.9949107219874062, + "grad_norm": 0.58984375, + "learning_rate": 1.0146976169727893e-05, + "loss": 1.4995, + "step": 5767 + }, + { + "epoch": 0.9950832398861382, + "grad_norm": 0.60546875, + "learning_rate": 1.0144254579399413e-05, + "loss": 1.4256, + "step": 5768 + }, + { + "epoch": 0.9952557577848702, + "grad_norm": 0.5703125, + "learning_rate": 1.0141532978383653e-05, + "loss": 1.35, + "step": 5769 + }, + { + "epoch": 0.9954282756836021, + "grad_norm": 0.625, + "learning_rate": 1.0138811366882243e-05, + "loss": 1.4445, + "step": 5770 + }, + { + "epoch": 0.9956007935823342, + "grad_norm": 0.59375, + "learning_rate": 1.0136089745096824e-05, + "loss": 1.4354, + "step": 5771 + }, + { + "epoch": 0.9957733114810662, + "grad_norm": 0.58984375, + "learning_rate": 1.0133368113229026e-05, + "loss": 1.3907, + "step": 5772 + }, + { + "epoch": 0.9959458293797981, + "grad_norm": 0.6328125, + "learning_rate": 1.013064647148048e-05, + "loss": 1.4249, + "step": 5773 + }, + { + "epoch": 0.9961183472785301, + "grad_norm": 0.609375, + "learning_rate": 1.0127924820052831e-05, + "loss": 1.4933, + "step": 5774 + }, + { + "epoch": 0.9962908651772622, + "grad_norm": 0.61328125, + "learning_rate": 1.0125203159147712e-05, + "loss": 1.4064, + "step": 5775 + }, + { + "epoch": 0.9964633830759941, + "grad_norm": 0.66796875, + "learning_rate": 1.012248148896676e-05, + "loss": 1.3864, + "step": 5776 + }, + { + "epoch": 0.9966359009747261, + "grad_norm": 0.671875, + "learning_rate": 1.0119759809711614e-05, + "loss": 1.3759, + "step": 5777 + }, + { + "epoch": 0.9968084188734582, + "grad_norm": 0.64453125, + "learning_rate": 1.0117038121583911e-05, + "loss": 1.4703, + "step": 5778 + }, + { + "epoch": 0.9969809367721901, + "grad_norm": 0.7421875, + "learning_rate": 1.0114316424785295e-05, + "loss": 1.3742, + "step": 5779 + }, + { + "epoch": 0.9971534546709221, + "grad_norm": 0.6484375, + "learning_rate": 1.0111594719517406e-05, + "loss": 1.4711, + "step": 5780 + }, + { + "epoch": 0.997325972569654, + "grad_norm": 0.60546875, + "learning_rate": 1.0108873005981876e-05, + "loss": 1.3791, + "step": 5781 + }, + { + "epoch": 0.9974984904683861, + "grad_norm": 0.58203125, + "learning_rate": 1.0106151284380359e-05, + "loss": 1.4406, + "step": 5782 + }, + { + "epoch": 0.9976710083671181, + "grad_norm": 0.66796875, + "learning_rate": 1.010342955491449e-05, + "loss": 1.3951, + "step": 5783 + }, + { + "epoch": 0.99784352626585, + "grad_norm": 0.58203125, + "learning_rate": 1.0100707817785915e-05, + "loss": 1.3135, + "step": 5784 + }, + { + "epoch": 0.9980160441645821, + "grad_norm": 0.62890625, + "learning_rate": 1.0097986073196273e-05, + "loss": 1.4802, + "step": 5785 + }, + { + "epoch": 0.9981885620633141, + "grad_norm": 0.75390625, + "learning_rate": 1.0095264321347212e-05, + "loss": 1.5621, + "step": 5786 + }, + { + "epoch": 0.998361079962046, + "grad_norm": 0.64453125, + "learning_rate": 1.0092542562440375e-05, + "loss": 1.3868, + "step": 5787 + }, + { + "epoch": 0.9985335978607781, + "grad_norm": 0.58984375, + "learning_rate": 1.0089820796677407e-05, + "loss": 1.4492, + "step": 5788 + }, + { + "epoch": 0.9987061157595101, + "grad_norm": 0.6484375, + "learning_rate": 1.0087099024259951e-05, + "loss": 1.3685, + "step": 5789 + }, + { + "epoch": 0.998878633658242, + "grad_norm": 0.6484375, + "learning_rate": 1.0084377245389656e-05, + "loss": 1.4912, + "step": 5790 + }, + { + "epoch": 0.999051151556974, + "grad_norm": 0.8828125, + "learning_rate": 1.008165546026817e-05, + "loss": 1.463, + "step": 5791 + }, + { + "epoch": 0.9992236694557061, + "grad_norm": 0.6171875, + "learning_rate": 1.0078933669097135e-05, + "loss": 1.4201, + "step": 5792 + }, + { + "epoch": 0.999396187354438, + "grad_norm": 0.61328125, + "learning_rate": 1.00762118720782e-05, + "loss": 1.5223, + "step": 5793 + }, + { + "epoch": 0.99956870525317, + "grad_norm": 0.61328125, + "learning_rate": 1.0073490069413014e-05, + "loss": 1.4212, + "step": 5794 + }, + { + "epoch": 0.9997412231519021, + "grad_norm": 0.89453125, + "learning_rate": 1.0070768261303226e-05, + "loss": 1.4884, + "step": 5795 + }, + { + "epoch": 0.999913741050634, + "grad_norm": 0.640625, + "learning_rate": 1.006804644795048e-05, + "loss": 1.4197, + "step": 5796 + }, + { + "epoch": 1.000086258949366, + "grad_norm": 0.60546875, + "learning_rate": 1.006532462955643e-05, + "loss": 1.4797, + "step": 5797 + }, + { + "epoch": 1.000258776848098, + "grad_norm": 0.59375, + "learning_rate": 1.006260280632272e-05, + "loss": 1.4291, + "step": 5798 + }, + { + "epoch": 1.00043129474683, + "grad_norm": 0.66796875, + "learning_rate": 1.0059880978451009e-05, + "loss": 1.4776, + "step": 5799 + }, + { + "epoch": 1.000603812645562, + "grad_norm": 0.765625, + "learning_rate": 1.0057159146142937e-05, + "loss": 1.466, + "step": 5800 + }, + { + "epoch": 1.000603812645562, + "eval_loss": 1.4107141494750977, + "eval_runtime": 10.9354, + "eval_samples_per_second": 93.641, + "eval_steps_per_second": 23.41, + "step": 5800 + }, + { + "epoch": 1.000776330544294, + "grad_norm": 0.6171875, + "learning_rate": 1.0054437309600159e-05, + "loss": 1.5335, + "step": 5801 + }, + { + "epoch": 1.000948848443026, + "grad_norm": 0.61328125, + "learning_rate": 1.0051715469024325e-05, + "loss": 1.3836, + "step": 5802 + }, + { + "epoch": 1.001121366341758, + "grad_norm": 0.61328125, + "learning_rate": 1.0048993624617087e-05, + "loss": 1.5043, + "step": 5803 + }, + { + "epoch": 1.00129388424049, + "grad_norm": 0.6171875, + "learning_rate": 1.0046271776580094e-05, + "loss": 1.4079, + "step": 5804 + }, + { + "epoch": 1.001466402139222, + "grad_norm": 0.58203125, + "learning_rate": 1.0043549925115e-05, + "loss": 1.4617, + "step": 5805 + }, + { + "epoch": 1.0016389200379539, + "grad_norm": 0.58984375, + "learning_rate": 1.0040828070423451e-05, + "loss": 1.3933, + "step": 5806 + }, + { + "epoch": 1.001811437936686, + "grad_norm": 0.58984375, + "learning_rate": 1.0038106212707108e-05, + "loss": 1.4361, + "step": 5807 + }, + { + "epoch": 1.001983955835418, + "grad_norm": 0.5859375, + "learning_rate": 1.0035384352167619e-05, + "loss": 1.5073, + "step": 5808 + }, + { + "epoch": 1.00215647373415, + "grad_norm": 0.58203125, + "learning_rate": 1.0032662489006634e-05, + "loss": 1.3779, + "step": 5809 + }, + { + "epoch": 1.002328991632882, + "grad_norm": 0.5703125, + "learning_rate": 1.0029940623425807e-05, + "loss": 1.4819, + "step": 5810 + }, + { + "epoch": 1.0025015095316139, + "grad_norm": 0.6953125, + "learning_rate": 1.0027218755626793e-05, + "loss": 1.3348, + "step": 5811 + }, + { + "epoch": 1.0026740274303458, + "grad_norm": 0.5703125, + "learning_rate": 1.0024496885811245e-05, + "loss": 1.3513, + "step": 5812 + }, + { + "epoch": 1.0028465453290778, + "grad_norm": 0.6015625, + "learning_rate": 1.002177501418081e-05, + "loss": 1.4489, + "step": 5813 + }, + { + "epoch": 1.00301906322781, + "grad_norm": 0.5703125, + "learning_rate": 1.001905314093715e-05, + "loss": 1.4161, + "step": 5814 + }, + { + "epoch": 1.003191581126542, + "grad_norm": 0.52734375, + "learning_rate": 1.0016331266281913e-05, + "loss": 1.3771, + "step": 5815 + }, + { + "epoch": 1.003364099025274, + "grad_norm": 0.69140625, + "learning_rate": 1.0013609390416753e-05, + "loss": 1.2211, + "step": 5816 + }, + { + "epoch": 1.0035366169240059, + "grad_norm": 0.62890625, + "learning_rate": 1.0010887513543325e-05, + "loss": 1.5178, + "step": 5817 + }, + { + "epoch": 1.0037091348227378, + "grad_norm": 0.60546875, + "learning_rate": 1.0008165635863283e-05, + "loss": 1.5569, + "step": 5818 + }, + { + "epoch": 1.0038816527214698, + "grad_norm": 0.6015625, + "learning_rate": 1.0005443757578279e-05, + "loss": 1.3907, + "step": 5819 + }, + { + "epoch": 1.0040541706202017, + "grad_norm": 0.58984375, + "learning_rate": 1.0002721878889967e-05, + "loss": 1.4906, + "step": 5820 + }, + { + "epoch": 1.004226688518934, + "grad_norm": 0.6015625, + "learning_rate": 1e-05, + "loss": 1.4273, + "step": 5821 + }, + { + "epoch": 1.004399206417666, + "grad_norm": 0.6328125, + "learning_rate": 9.997278121110033e-06, + "loss": 1.5296, + "step": 5822 + }, + { + "epoch": 1.0045717243163979, + "grad_norm": 0.6015625, + "learning_rate": 9.994556242421723e-06, + "loss": 1.3968, + "step": 5823 + }, + { + "epoch": 1.0047442422151298, + "grad_norm": 0.6328125, + "learning_rate": 9.991834364136719e-06, + "loss": 1.5477, + "step": 5824 + }, + { + "epoch": 1.0049167601138618, + "grad_norm": 0.61328125, + "learning_rate": 9.989112486456677e-06, + "loss": 1.5011, + "step": 5825 + }, + { + "epoch": 1.0050892780125937, + "grad_norm": 0.58203125, + "learning_rate": 9.986390609583246e-06, + "loss": 1.5186, + "step": 5826 + }, + { + "epoch": 1.0052617959113257, + "grad_norm": 0.60546875, + "learning_rate": 9.983668733718089e-06, + "loss": 1.5327, + "step": 5827 + }, + { + "epoch": 1.0054343138100579, + "grad_norm": 0.6953125, + "learning_rate": 9.980946859062852e-06, + "loss": 1.4618, + "step": 5828 + }, + { + "epoch": 1.0056068317087898, + "grad_norm": 0.59765625, + "learning_rate": 9.978224985819193e-06, + "loss": 1.3387, + "step": 5829 + }, + { + "epoch": 1.0057793496075218, + "grad_norm": 0.58984375, + "learning_rate": 9.975503114188758e-06, + "loss": 1.4009, + "step": 5830 + }, + { + "epoch": 1.0059518675062538, + "grad_norm": 0.61328125, + "learning_rate": 9.97278124437321e-06, + "loss": 1.3932, + "step": 5831 + }, + { + "epoch": 1.0061243854049857, + "grad_norm": 0.578125, + "learning_rate": 9.970059376574195e-06, + "loss": 1.4174, + "step": 5832 + }, + { + "epoch": 1.0062969033037177, + "grad_norm": 0.59765625, + "learning_rate": 9.96733751099337e-06, + "loss": 1.4151, + "step": 5833 + }, + { + "epoch": 1.0064694212024499, + "grad_norm": 0.5625, + "learning_rate": 9.964615647832384e-06, + "loss": 1.4401, + "step": 5834 + }, + { + "epoch": 1.0066419391011818, + "grad_norm": 0.60546875, + "learning_rate": 9.961893787292895e-06, + "loss": 1.4587, + "step": 5835 + }, + { + "epoch": 1.0068144569999138, + "grad_norm": 0.59375, + "learning_rate": 9.95917192957655e-06, + "loss": 1.4193, + "step": 5836 + }, + { + "epoch": 1.0069869748986457, + "grad_norm": 0.578125, + "learning_rate": 9.956450074885007e-06, + "loss": 1.4307, + "step": 5837 + }, + { + "epoch": 1.0071594927973777, + "grad_norm": 0.60546875, + "learning_rate": 9.953728223419908e-06, + "loss": 1.397, + "step": 5838 + }, + { + "epoch": 1.0073320106961097, + "grad_norm": 0.6953125, + "learning_rate": 9.951006375382915e-06, + "loss": 1.5258, + "step": 5839 + }, + { + "epoch": 1.0075045285948416, + "grad_norm": 0.578125, + "learning_rate": 9.948284530975678e-06, + "loss": 1.3965, + "step": 5840 + }, + { + "epoch": 1.0076770464935738, + "grad_norm": 0.60546875, + "learning_rate": 9.945562690399841e-06, + "loss": 1.4365, + "step": 5841 + }, + { + "epoch": 1.0078495643923058, + "grad_norm": 0.703125, + "learning_rate": 9.942840853857065e-06, + "loss": 1.5303, + "step": 5842 + }, + { + "epoch": 1.0080220822910377, + "grad_norm": 0.64453125, + "learning_rate": 9.940119021548994e-06, + "loss": 1.45, + "step": 5843 + }, + { + "epoch": 1.0081946001897697, + "grad_norm": 0.58203125, + "learning_rate": 9.937397193677281e-06, + "loss": 1.4291, + "step": 5844 + }, + { + "epoch": 1.0083671180885017, + "grad_norm": 0.63671875, + "learning_rate": 9.934675370443571e-06, + "loss": 1.3639, + "step": 5845 + }, + { + "epoch": 1.0085396359872336, + "grad_norm": 0.6328125, + "learning_rate": 9.931953552049524e-06, + "loss": 1.316, + "step": 5846 + }, + { + "epoch": 1.0087121538859656, + "grad_norm": 0.70703125, + "learning_rate": 9.929231738696779e-06, + "loss": 1.377, + "step": 5847 + }, + { + "epoch": 1.0088846717846978, + "grad_norm": 0.6640625, + "learning_rate": 9.926509930586991e-06, + "loss": 1.4196, + "step": 5848 + }, + { + "epoch": 1.0090571896834297, + "grad_norm": 0.67578125, + "learning_rate": 9.923788127921801e-06, + "loss": 1.386, + "step": 5849 + }, + { + "epoch": 1.0092297075821617, + "grad_norm": 0.6015625, + "learning_rate": 9.92106633090287e-06, + "loss": 1.3856, + "step": 5850 + }, + { + "epoch": 1.0094022254808936, + "grad_norm": 0.66015625, + "learning_rate": 9.918344539731832e-06, + "loss": 1.4346, + "step": 5851 + }, + { + "epoch": 1.0095747433796256, + "grad_norm": 0.765625, + "learning_rate": 9.915622754610349e-06, + "loss": 1.4435, + "step": 5852 + }, + { + "epoch": 1.0097472612783576, + "grad_norm": 0.73046875, + "learning_rate": 9.912900975740052e-06, + "loss": 1.3264, + "step": 5853 + }, + { + "epoch": 1.0099197791770895, + "grad_norm": 0.57421875, + "learning_rate": 9.910179203322595e-06, + "loss": 1.3786, + "step": 5854 + }, + { + "epoch": 1.0100922970758217, + "grad_norm": 0.578125, + "learning_rate": 9.907457437559626e-06, + "loss": 1.3746, + "step": 5855 + }, + { + "epoch": 1.0102648149745537, + "grad_norm": 0.65625, + "learning_rate": 9.90473567865279e-06, + "loss": 1.5273, + "step": 5856 + }, + { + "epoch": 1.0104373328732856, + "grad_norm": 0.65625, + "learning_rate": 9.90201392680373e-06, + "loss": 1.3937, + "step": 5857 + }, + { + "epoch": 1.0106098507720176, + "grad_norm": 0.66015625, + "learning_rate": 9.899292182214087e-06, + "loss": 1.4236, + "step": 5858 + }, + { + "epoch": 1.0107823686707496, + "grad_norm": 0.62109375, + "learning_rate": 9.896570445085511e-06, + "loss": 1.391, + "step": 5859 + }, + { + "epoch": 1.0109548865694815, + "grad_norm": 0.70703125, + "learning_rate": 9.893848715619643e-06, + "loss": 1.4368, + "step": 5860 + }, + { + "epoch": 1.0111274044682135, + "grad_norm": 0.61328125, + "learning_rate": 9.891126994018126e-06, + "loss": 1.4629, + "step": 5861 + }, + { + "epoch": 1.0112999223669457, + "grad_norm": 1.015625, + "learning_rate": 9.888405280482598e-06, + "loss": 1.4976, + "step": 5862 + }, + { + "epoch": 1.0114724402656776, + "grad_norm": 0.58984375, + "learning_rate": 9.885683575214709e-06, + "loss": 1.4451, + "step": 5863 + }, + { + "epoch": 1.0116449581644096, + "grad_norm": 0.56640625, + "learning_rate": 9.88296187841609e-06, + "loss": 1.4385, + "step": 5864 + }, + { + "epoch": 1.0118174760631415, + "grad_norm": 0.58203125, + "learning_rate": 9.880240190288391e-06, + "loss": 1.4685, + "step": 5865 + }, + { + "epoch": 1.0119899939618735, + "grad_norm": 0.5703125, + "learning_rate": 9.877518511033243e-06, + "loss": 1.4252, + "step": 5866 + }, + { + "epoch": 1.0121625118606055, + "grad_norm": 0.60546875, + "learning_rate": 9.874796840852292e-06, + "loss": 1.416, + "step": 5867 + }, + { + "epoch": 1.0123350297593374, + "grad_norm": 0.671875, + "learning_rate": 9.87207517994717e-06, + "loss": 1.3435, + "step": 5868 + }, + { + "epoch": 1.0125075476580696, + "grad_norm": 0.57421875, + "learning_rate": 9.869353528519523e-06, + "loss": 1.4463, + "step": 5869 + }, + { + "epoch": 1.0126800655568016, + "grad_norm": 0.8203125, + "learning_rate": 9.866631886770979e-06, + "loss": 1.389, + "step": 5870 + }, + { + "epoch": 1.0128525834555335, + "grad_norm": 0.61328125, + "learning_rate": 9.86391025490318e-06, + "loss": 1.3701, + "step": 5871 + }, + { + "epoch": 1.0130251013542655, + "grad_norm": 0.58203125, + "learning_rate": 9.861188633117758e-06, + "loss": 1.4432, + "step": 5872 + }, + { + "epoch": 1.0131976192529975, + "grad_norm": 0.60546875, + "learning_rate": 9.858467021616349e-06, + "loss": 1.3332, + "step": 5873 + }, + { + "epoch": 1.0133701371517294, + "grad_norm": 0.6171875, + "learning_rate": 9.855745420600589e-06, + "loss": 1.3983, + "step": 5874 + }, + { + "epoch": 1.0135426550504616, + "grad_norm": 0.6953125, + "learning_rate": 9.85302383027211e-06, + "loss": 1.4667, + "step": 5875 + }, + { + "epoch": 1.0137151729491936, + "grad_norm": 0.56640625, + "learning_rate": 9.850302250832544e-06, + "loss": 1.3457, + "step": 5876 + }, + { + "epoch": 1.0138876908479255, + "grad_norm": 0.62109375, + "learning_rate": 9.84758068248352e-06, + "loss": 1.4847, + "step": 5877 + }, + { + "epoch": 1.0140602087466575, + "grad_norm": 0.59375, + "learning_rate": 9.844859125426674e-06, + "loss": 1.4181, + "step": 5878 + }, + { + "epoch": 1.0142327266453894, + "grad_norm": 0.5703125, + "learning_rate": 9.842137579863632e-06, + "loss": 1.3819, + "step": 5879 + }, + { + "epoch": 1.0144052445441214, + "grad_norm": 0.76953125, + "learning_rate": 9.839416045996027e-06, + "loss": 1.4222, + "step": 5880 + }, + { + "epoch": 1.0145777624428534, + "grad_norm": 0.57421875, + "learning_rate": 9.83669452402548e-06, + "loss": 1.551, + "step": 5881 + }, + { + "epoch": 1.0147502803415855, + "grad_norm": 0.6953125, + "learning_rate": 9.833973014153628e-06, + "loss": 1.5052, + "step": 5882 + }, + { + "epoch": 1.0149227982403175, + "grad_norm": 0.6171875, + "learning_rate": 9.831251516582087e-06, + "loss": 1.4471, + "step": 5883 + }, + { + "epoch": 1.0150953161390495, + "grad_norm": 0.5703125, + "learning_rate": 9.828530031512493e-06, + "loss": 1.426, + "step": 5884 + }, + { + "epoch": 1.0152678340377814, + "grad_norm": 0.61328125, + "learning_rate": 9.825808559146464e-06, + "loss": 1.3441, + "step": 5885 + }, + { + "epoch": 1.0154403519365134, + "grad_norm": 0.57421875, + "learning_rate": 9.823087099685622e-06, + "loss": 1.48, + "step": 5886 + }, + { + "epoch": 1.0156128698352453, + "grad_norm": 0.67578125, + "learning_rate": 9.820365653331599e-06, + "loss": 1.5439, + "step": 5887 + }, + { + "epoch": 1.0157853877339773, + "grad_norm": 0.625, + "learning_rate": 9.817644220286006e-06, + "loss": 1.381, + "step": 5888 + }, + { + "epoch": 1.0159579056327095, + "grad_norm": 0.7265625, + "learning_rate": 9.814922800750474e-06, + "loss": 1.4234, + "step": 5889 + }, + { + "epoch": 1.0161304235314415, + "grad_norm": 0.59375, + "learning_rate": 9.81220139492661e-06, + "loss": 1.5136, + "step": 5890 + }, + { + "epoch": 1.0163029414301734, + "grad_norm": 0.5703125, + "learning_rate": 9.809480003016046e-06, + "loss": 1.3156, + "step": 5891 + }, + { + "epoch": 1.0164754593289054, + "grad_norm": 0.58984375, + "learning_rate": 9.806758625220391e-06, + "loss": 1.3625, + "step": 5892 + }, + { + "epoch": 1.0166479772276373, + "grad_norm": 0.58984375, + "learning_rate": 9.804037261741269e-06, + "loss": 1.3784, + "step": 5893 + }, + { + "epoch": 1.0168204951263693, + "grad_norm": 0.6171875, + "learning_rate": 9.801315912780283e-06, + "loss": 1.4707, + "step": 5894 + }, + { + "epoch": 1.0169930130251013, + "grad_norm": 0.640625, + "learning_rate": 9.798594578539063e-06, + "loss": 1.4436, + "step": 5895 + }, + { + "epoch": 1.0171655309238334, + "grad_norm": 0.6015625, + "learning_rate": 9.795873259219212e-06, + "loss": 1.4904, + "step": 5896 + }, + { + "epoch": 1.0173380488225654, + "grad_norm": 0.56640625, + "learning_rate": 9.793151955022346e-06, + "loss": 1.4208, + "step": 5897 + }, + { + "epoch": 1.0175105667212974, + "grad_norm": 0.75390625, + "learning_rate": 9.790430666150073e-06, + "loss": 1.4334, + "step": 5898 + }, + { + "epoch": 1.0176830846200293, + "grad_norm": 0.609375, + "learning_rate": 9.78770939280401e-06, + "loss": 1.4077, + "step": 5899 + }, + { + "epoch": 1.0178556025187613, + "grad_norm": 0.55078125, + "learning_rate": 9.78498813518576e-06, + "loss": 1.425, + "step": 5900 + }, + { + "epoch": 1.0178556025187613, + "eval_loss": 1.410369634628296, + "eval_runtime": 10.8142, + "eval_samples_per_second": 94.69, + "eval_steps_per_second": 23.673, + "step": 5900 + }, + { + "epoch": 1.0180281204174932, + "grad_norm": 0.62109375, + "learning_rate": 9.782266893496933e-06, + "loss": 1.3682, + "step": 5901 + }, + { + "epoch": 1.0182006383162252, + "grad_norm": 0.58203125, + "learning_rate": 9.779545667939132e-06, + "loss": 1.4094, + "step": 5902 + }, + { + "epoch": 1.0183731562149574, + "grad_norm": 0.6328125, + "learning_rate": 9.776824458713965e-06, + "loss": 1.4804, + "step": 5903 + }, + { + "epoch": 1.0185456741136893, + "grad_norm": 0.66015625, + "learning_rate": 9.774103266023042e-06, + "loss": 1.4789, + "step": 5904 + }, + { + "epoch": 1.0187181920124213, + "grad_norm": 0.578125, + "learning_rate": 9.771382090067954e-06, + "loss": 1.4821, + "step": 5905 + }, + { + "epoch": 1.0188907099111533, + "grad_norm": 0.59765625, + "learning_rate": 9.768660931050314e-06, + "loss": 1.2989, + "step": 5906 + }, + { + "epoch": 1.0190632278098852, + "grad_norm": 0.703125, + "learning_rate": 9.765939789171717e-06, + "loss": 1.456, + "step": 5907 + }, + { + "epoch": 1.0192357457086172, + "grad_norm": 0.62109375, + "learning_rate": 9.763218664633763e-06, + "loss": 1.4041, + "step": 5908 + }, + { + "epoch": 1.0194082636073492, + "grad_norm": 0.61328125, + "learning_rate": 9.760497557638047e-06, + "loss": 1.5247, + "step": 5909 + }, + { + "epoch": 1.0195807815060813, + "grad_norm": 0.56640625, + "learning_rate": 9.757776468386171e-06, + "loss": 1.3292, + "step": 5910 + }, + { + "epoch": 1.0197532994048133, + "grad_norm": 0.609375, + "learning_rate": 9.755055397079724e-06, + "loss": 1.5492, + "step": 5911 + }, + { + "epoch": 1.0199258173035453, + "grad_norm": 0.578125, + "learning_rate": 9.75233434392031e-06, + "loss": 1.3097, + "step": 5912 + }, + { + "epoch": 1.0200983352022772, + "grad_norm": 0.70703125, + "learning_rate": 9.749613309109512e-06, + "loss": 1.3986, + "step": 5913 + }, + { + "epoch": 1.0202708531010092, + "grad_norm": 0.58203125, + "learning_rate": 9.746892292848925e-06, + "loss": 1.4641, + "step": 5914 + }, + { + "epoch": 1.0204433709997411, + "grad_norm": 0.6015625, + "learning_rate": 9.744171295340136e-06, + "loss": 1.5802, + "step": 5915 + }, + { + "epoch": 1.0206158888984733, + "grad_norm": 0.64453125, + "learning_rate": 9.74145031678474e-06, + "loss": 1.4543, + "step": 5916 + }, + { + "epoch": 1.0207884067972053, + "grad_norm": 0.58984375, + "learning_rate": 9.738729357384318e-06, + "loss": 1.4212, + "step": 5917 + }, + { + "epoch": 1.0209609246959372, + "grad_norm": 0.62109375, + "learning_rate": 9.736008417340455e-06, + "loss": 1.3583, + "step": 5918 + }, + { + "epoch": 1.0211334425946692, + "grad_norm": 0.5859375, + "learning_rate": 9.733287496854743e-06, + "loss": 1.5124, + "step": 5919 + }, + { + "epoch": 1.0213059604934012, + "grad_norm": 0.6484375, + "learning_rate": 9.730566596128756e-06, + "loss": 1.4214, + "step": 5920 + }, + { + "epoch": 1.0214784783921331, + "grad_norm": 0.62890625, + "learning_rate": 9.727845715364081e-06, + "loss": 1.4619, + "step": 5921 + }, + { + "epoch": 1.021650996290865, + "grad_norm": 0.58984375, + "learning_rate": 9.725124854762294e-06, + "loss": 1.329, + "step": 5922 + }, + { + "epoch": 1.0218235141895973, + "grad_norm": 0.6015625, + "learning_rate": 9.722404014524978e-06, + "loss": 1.4048, + "step": 5923 + }, + { + "epoch": 1.0219960320883292, + "grad_norm": 0.59375, + "learning_rate": 9.719683194853705e-06, + "loss": 1.3779, + "step": 5924 + }, + { + "epoch": 1.0221685499870612, + "grad_norm": 0.6328125, + "learning_rate": 9.716962395950055e-06, + "loss": 1.3623, + "step": 5925 + }, + { + "epoch": 1.0223410678857932, + "grad_norm": 0.59375, + "learning_rate": 9.714241618015596e-06, + "loss": 1.3491, + "step": 5926 + }, + { + "epoch": 1.0225135857845251, + "grad_norm": 1.109375, + "learning_rate": 9.711520861251904e-06, + "loss": 1.4558, + "step": 5927 + }, + { + "epoch": 1.022686103683257, + "grad_norm": 0.62890625, + "learning_rate": 9.708800125860552e-06, + "loss": 1.3601, + "step": 5928 + }, + { + "epoch": 1.022858621581989, + "grad_norm": 1.0546875, + "learning_rate": 9.706079412043105e-06, + "loss": 1.4023, + "step": 5929 + }, + { + "epoch": 1.0230311394807212, + "grad_norm": 0.60546875, + "learning_rate": 9.703358720001129e-06, + "loss": 1.5388, + "step": 5930 + }, + { + "epoch": 1.0232036573794532, + "grad_norm": 0.578125, + "learning_rate": 9.700638049936194e-06, + "loss": 1.4385, + "step": 5931 + }, + { + "epoch": 1.0233761752781851, + "grad_norm": 0.58984375, + "learning_rate": 9.697917402049864e-06, + "loss": 1.5253, + "step": 5932 + }, + { + "epoch": 1.023548693176917, + "grad_norm": 0.73046875, + "learning_rate": 9.695196776543701e-06, + "loss": 1.4444, + "step": 5933 + }, + { + "epoch": 1.023721211075649, + "grad_norm": 0.60546875, + "learning_rate": 9.69247617361926e-06, + "loss": 1.3254, + "step": 5934 + }, + { + "epoch": 1.023893728974381, + "grad_norm": 0.61328125, + "learning_rate": 9.689755593478109e-06, + "loss": 1.3835, + "step": 5935 + }, + { + "epoch": 1.024066246873113, + "grad_norm": 0.65625, + "learning_rate": 9.687035036321804e-06, + "loss": 1.3976, + "step": 5936 + }, + { + "epoch": 1.0242387647718452, + "grad_norm": 0.65234375, + "learning_rate": 9.684314502351895e-06, + "loss": 1.5053, + "step": 5937 + }, + { + "epoch": 1.0244112826705771, + "grad_norm": 0.59765625, + "learning_rate": 9.681593991769944e-06, + "loss": 1.423, + "step": 5938 + }, + { + "epoch": 1.024583800569309, + "grad_norm": 0.5703125, + "learning_rate": 9.678873504777495e-06, + "loss": 1.4018, + "step": 5939 + }, + { + "epoch": 1.024756318468041, + "grad_norm": 0.61328125, + "learning_rate": 9.676153041576111e-06, + "loss": 1.4337, + "step": 5940 + }, + { + "epoch": 1.024928836366773, + "grad_norm": 0.6015625, + "learning_rate": 9.673432602367325e-06, + "loss": 1.4924, + "step": 5941 + }, + { + "epoch": 1.025101354265505, + "grad_norm": 0.7890625, + "learning_rate": 9.670712187352698e-06, + "loss": 1.4626, + "step": 5942 + }, + { + "epoch": 1.025273872164237, + "grad_norm": 0.63671875, + "learning_rate": 9.667991796733764e-06, + "loss": 1.5558, + "step": 5943 + }, + { + "epoch": 1.0254463900629691, + "grad_norm": 0.578125, + "learning_rate": 9.665271430712079e-06, + "loss": 1.4, + "step": 5944 + }, + { + "epoch": 1.025618907961701, + "grad_norm": 1.359375, + "learning_rate": 9.662551089489174e-06, + "loss": 1.474, + "step": 5945 + }, + { + "epoch": 1.025791425860433, + "grad_norm": 0.63671875, + "learning_rate": 9.659830773266593e-06, + "loss": 1.4282, + "step": 5946 + }, + { + "epoch": 1.025963943759165, + "grad_norm": 0.60546875, + "learning_rate": 9.657110482245872e-06, + "loss": 1.4422, + "step": 5947 + }, + { + "epoch": 1.026136461657897, + "grad_norm": 0.60546875, + "learning_rate": 9.654390216628554e-06, + "loss": 1.3946, + "step": 5948 + }, + { + "epoch": 1.026308979556629, + "grad_norm": 0.6796875, + "learning_rate": 9.651669976616165e-06, + "loss": 1.547, + "step": 5949 + }, + { + "epoch": 1.026481497455361, + "grad_norm": 0.65234375, + "learning_rate": 9.648949762410238e-06, + "loss": 1.449, + "step": 5950 + }, + { + "epoch": 1.026654015354093, + "grad_norm": 0.58203125, + "learning_rate": 9.64622957421231e-06, + "loss": 1.4005, + "step": 5951 + }, + { + "epoch": 1.026826533252825, + "grad_norm": 0.75, + "learning_rate": 9.643509412223905e-06, + "loss": 1.3911, + "step": 5952 + }, + { + "epoch": 1.026999051151557, + "grad_norm": 0.640625, + "learning_rate": 9.64078927664655e-06, + "loss": 1.5121, + "step": 5953 + }, + { + "epoch": 1.027171569050289, + "grad_norm": 0.6171875, + "learning_rate": 9.638069167681768e-06, + "loss": 1.3723, + "step": 5954 + }, + { + "epoch": 1.027344086949021, + "grad_norm": 0.703125, + "learning_rate": 9.635349085531088e-06, + "loss": 1.3944, + "step": 5955 + }, + { + "epoch": 1.0275166048477529, + "grad_norm": 0.62109375, + "learning_rate": 9.63262903039602e-06, + "loss": 1.4605, + "step": 5956 + }, + { + "epoch": 1.027689122746485, + "grad_norm": 0.58984375, + "learning_rate": 9.629909002478095e-06, + "loss": 1.5529, + "step": 5957 + }, + { + "epoch": 1.027861640645217, + "grad_norm": 0.61328125, + "learning_rate": 9.627189001978818e-06, + "loss": 1.5283, + "step": 5958 + }, + { + "epoch": 1.028034158543949, + "grad_norm": 0.6328125, + "learning_rate": 9.624469029099713e-06, + "loss": 1.4798, + "step": 5959 + }, + { + "epoch": 1.028206676442681, + "grad_norm": 0.73046875, + "learning_rate": 9.621749084042288e-06, + "loss": 1.4375, + "step": 5960 + }, + { + "epoch": 1.028379194341413, + "grad_norm": 0.7109375, + "learning_rate": 9.619029167008056e-06, + "loss": 1.4525, + "step": 5961 + }, + { + "epoch": 1.0285517122401449, + "grad_norm": 0.7265625, + "learning_rate": 9.61630927819852e-06, + "loss": 1.4167, + "step": 5962 + }, + { + "epoch": 1.0287242301388768, + "grad_norm": 0.5859375, + "learning_rate": 9.613589417815192e-06, + "loss": 1.4118, + "step": 5963 + }, + { + "epoch": 1.028896748037609, + "grad_norm": 0.6328125, + "learning_rate": 9.610869586059574e-06, + "loss": 1.4564, + "step": 5964 + }, + { + "epoch": 1.029069265936341, + "grad_norm": 0.578125, + "learning_rate": 9.608149783133169e-06, + "loss": 1.4491, + "step": 5965 + }, + { + "epoch": 1.029241783835073, + "grad_norm": 0.5703125, + "learning_rate": 9.605430009237474e-06, + "loss": 1.4347, + "step": 5966 + }, + { + "epoch": 1.0294143017338049, + "grad_norm": 0.6015625, + "learning_rate": 9.602710264573991e-06, + "loss": 1.4235, + "step": 5967 + }, + { + "epoch": 1.0295868196325368, + "grad_norm": 0.5703125, + "learning_rate": 9.599990549344216e-06, + "loss": 1.3666, + "step": 5968 + }, + { + "epoch": 1.0297593375312688, + "grad_norm": 0.58984375, + "learning_rate": 9.597270863749635e-06, + "loss": 1.4757, + "step": 5969 + }, + { + "epoch": 1.0299318554300008, + "grad_norm": 0.55859375, + "learning_rate": 9.59455120799175e-06, + "loss": 1.4431, + "step": 5970 + }, + { + "epoch": 1.030104373328733, + "grad_norm": 0.5859375, + "learning_rate": 9.59183158227204e-06, + "loss": 1.4592, + "step": 5971 + }, + { + "epoch": 1.030276891227465, + "grad_norm": 0.6328125, + "learning_rate": 9.589111986792004e-06, + "loss": 1.5002, + "step": 5972 + }, + { + "epoch": 1.0304494091261969, + "grad_norm": 0.6484375, + "learning_rate": 9.586392421753115e-06, + "loss": 1.4541, + "step": 5973 + }, + { + "epoch": 1.0306219270249288, + "grad_norm": 1.0625, + "learning_rate": 9.583672887356862e-06, + "loss": 1.3925, + "step": 5974 + }, + { + "epoch": 1.0307944449236608, + "grad_norm": 0.6484375, + "learning_rate": 9.580953383804718e-06, + "loss": 1.5073, + "step": 5975 + }, + { + "epoch": 1.0309669628223928, + "grad_norm": 0.6328125, + "learning_rate": 9.57823391129817e-06, + "loss": 1.4615, + "step": 5976 + }, + { + "epoch": 1.0311394807211247, + "grad_norm": 0.61328125, + "learning_rate": 9.575514470038688e-06, + "loss": 1.525, + "step": 5977 + }, + { + "epoch": 1.031311998619857, + "grad_norm": 0.62890625, + "learning_rate": 9.572795060227748e-06, + "loss": 1.4352, + "step": 5978 + }, + { + "epoch": 1.0314845165185889, + "grad_norm": 0.765625, + "learning_rate": 9.570075682066815e-06, + "loss": 1.4863, + "step": 5979 + }, + { + "epoch": 1.0316570344173208, + "grad_norm": 0.609375, + "learning_rate": 9.567356335757366e-06, + "loss": 1.4522, + "step": 5980 + }, + { + "epoch": 1.0318295523160528, + "grad_norm": 0.66015625, + "learning_rate": 9.56463702150086e-06, + "loss": 1.4746, + "step": 5981 + }, + { + "epoch": 1.0320020702147847, + "grad_norm": 0.60546875, + "learning_rate": 9.561917739498761e-06, + "loss": 1.4016, + "step": 5982 + }, + { + "epoch": 1.0321745881135167, + "grad_norm": 0.61328125, + "learning_rate": 9.55919848995254e-06, + "loss": 1.4406, + "step": 5983 + }, + { + "epoch": 1.0323471060122489, + "grad_norm": 0.6484375, + "learning_rate": 9.556479273063644e-06, + "loss": 1.4135, + "step": 5984 + }, + { + "epoch": 1.0325196239109808, + "grad_norm": 0.75390625, + "learning_rate": 9.55376008903354e-06, + "loss": 1.4579, + "step": 5985 + }, + { + "epoch": 1.0326921418097128, + "grad_norm": 0.625, + "learning_rate": 9.551040938063671e-06, + "loss": 1.4233, + "step": 5986 + }, + { + "epoch": 1.0328646597084448, + "grad_norm": 0.6953125, + "learning_rate": 9.548321820355498e-06, + "loss": 1.4576, + "step": 5987 + }, + { + "epoch": 1.0330371776071767, + "grad_norm": 0.6953125, + "learning_rate": 9.545602736110467e-06, + "loss": 1.4506, + "step": 5988 + }, + { + "epoch": 1.0332096955059087, + "grad_norm": 0.61328125, + "learning_rate": 9.542883685530024e-06, + "loss": 1.4305, + "step": 5989 + }, + { + "epoch": 1.0333822134046406, + "grad_norm": 0.6015625, + "learning_rate": 9.54016466881561e-06, + "loss": 1.4593, + "step": 5990 + }, + { + "epoch": 1.0335547313033728, + "grad_norm": 0.58984375, + "learning_rate": 9.537445686168676e-06, + "loss": 1.3949, + "step": 5991 + }, + { + "epoch": 1.0337272492021048, + "grad_norm": 0.64453125, + "learning_rate": 9.534726737790652e-06, + "loss": 1.3817, + "step": 5992 + }, + { + "epoch": 1.0338997671008368, + "grad_norm": 0.62109375, + "learning_rate": 9.532007823882983e-06, + "loss": 1.3964, + "step": 5993 + }, + { + "epoch": 1.0340722849995687, + "grad_norm": 0.70703125, + "learning_rate": 9.529288944647093e-06, + "loss": 1.4328, + "step": 5994 + }, + { + "epoch": 1.0342448028983007, + "grad_norm": 0.578125, + "learning_rate": 9.526570100284423e-06, + "loss": 1.4962, + "step": 5995 + }, + { + "epoch": 1.0344173207970326, + "grad_norm": 0.63671875, + "learning_rate": 9.523851290996397e-06, + "loss": 1.4092, + "step": 5996 + }, + { + "epoch": 1.0345898386957646, + "grad_norm": 0.6328125, + "learning_rate": 9.521132516984442e-06, + "loss": 1.3904, + "step": 5997 + }, + { + "epoch": 1.0347623565944968, + "grad_norm": 0.58203125, + "learning_rate": 9.518413778449981e-06, + "loss": 1.3421, + "step": 5998 + }, + { + "epoch": 1.0349348744932287, + "grad_norm": 0.859375, + "learning_rate": 9.515695075594434e-06, + "loss": 1.3831, + "step": 5999 + }, + { + "epoch": 1.0351073923919607, + "grad_norm": 0.66796875, + "learning_rate": 9.512976408619227e-06, + "loss": 1.4474, + "step": 6000 + }, + { + "epoch": 1.0351073923919607, + "eval_loss": 1.4101243019104004, + "eval_runtime": 10.873, + "eval_samples_per_second": 94.178, + "eval_steps_per_second": 23.545, + "step": 6000 + }, + { + "epoch": 1.0352799102906927, + "grad_norm": 0.86328125, + "learning_rate": 9.510257777725769e-06, + "loss": 1.3563, + "step": 6001 + }, + { + "epoch": 1.0354524281894246, + "grad_norm": 0.6015625, + "learning_rate": 9.507539183115479e-06, + "loss": 1.3893, + "step": 6002 + }, + { + "epoch": 1.0356249460881566, + "grad_norm": 0.62890625, + "learning_rate": 9.504820624989756e-06, + "loss": 1.3792, + "step": 6003 + }, + { + "epoch": 1.0357974639868885, + "grad_norm": 0.61328125, + "learning_rate": 9.50210210355002e-06, + "loss": 1.5117, + "step": 6004 + }, + { + "epoch": 1.0359699818856207, + "grad_norm": 0.578125, + "learning_rate": 9.499383618997669e-06, + "loss": 1.3749, + "step": 6005 + }, + { + "epoch": 1.0361424997843527, + "grad_norm": 0.58984375, + "learning_rate": 9.49666517153411e-06, + "loss": 1.5192, + "step": 6006 + }, + { + "epoch": 1.0363150176830846, + "grad_norm": 0.5703125, + "learning_rate": 9.493946761360736e-06, + "loss": 1.4583, + "step": 6007 + }, + { + "epoch": 1.0364875355818166, + "grad_norm": 0.55078125, + "learning_rate": 9.491228388678952e-06, + "loss": 1.4411, + "step": 6008 + }, + { + "epoch": 1.0366600534805486, + "grad_norm": 0.66796875, + "learning_rate": 9.488510053690147e-06, + "loss": 1.4219, + "step": 6009 + }, + { + "epoch": 1.0368325713792805, + "grad_norm": 0.58984375, + "learning_rate": 9.485791756595714e-06, + "loss": 1.4116, + "step": 6010 + }, + { + "epoch": 1.0370050892780125, + "grad_norm": 0.64453125, + "learning_rate": 9.483073497597037e-06, + "loss": 1.4544, + "step": 6011 + }, + { + "epoch": 1.0371776071767447, + "grad_norm": 0.6875, + "learning_rate": 9.480355276895508e-06, + "loss": 1.381, + "step": 6012 + }, + { + "epoch": 1.0373501250754766, + "grad_norm": 0.62109375, + "learning_rate": 9.477637094692505e-06, + "loss": 1.5196, + "step": 6013 + }, + { + "epoch": 1.0375226429742086, + "grad_norm": 0.734375, + "learning_rate": 9.47491895118941e-06, + "loss": 1.4163, + "step": 6014 + }, + { + "epoch": 1.0376951608729406, + "grad_norm": 0.6484375, + "learning_rate": 9.472200846587603e-06, + "loss": 1.4176, + "step": 6015 + }, + { + "epoch": 1.0378676787716725, + "grad_norm": 0.5859375, + "learning_rate": 9.469482781088451e-06, + "loss": 1.435, + "step": 6016 + }, + { + "epoch": 1.0380401966704045, + "grad_norm": 0.6328125, + "learning_rate": 9.466764754893334e-06, + "loss": 1.4506, + "step": 6017 + }, + { + "epoch": 1.0382127145691364, + "grad_norm": 0.8671875, + "learning_rate": 9.464046768203611e-06, + "loss": 1.3685, + "step": 6018 + }, + { + "epoch": 1.0383852324678686, + "grad_norm": 0.81640625, + "learning_rate": 9.461328821220657e-06, + "loss": 1.4066, + "step": 6019 + }, + { + "epoch": 1.0385577503666006, + "grad_norm": 0.64453125, + "learning_rate": 9.458610914145826e-06, + "loss": 1.3749, + "step": 6020 + }, + { + "epoch": 1.0387302682653325, + "grad_norm": 0.6015625, + "learning_rate": 9.455893047180485e-06, + "loss": 1.3851, + "step": 6021 + }, + { + "epoch": 1.0389027861640645, + "grad_norm": 0.60546875, + "learning_rate": 9.453175220525981e-06, + "loss": 1.4537, + "step": 6022 + }, + { + "epoch": 1.0390753040627965, + "grad_norm": 0.5859375, + "learning_rate": 9.450457434383679e-06, + "loss": 1.3407, + "step": 6023 + }, + { + "epoch": 1.0392478219615284, + "grad_norm": 0.65625, + "learning_rate": 9.44773968895492e-06, + "loss": 1.4205, + "step": 6024 + }, + { + "epoch": 1.0394203398602606, + "grad_norm": 0.609375, + "learning_rate": 9.445021984441059e-06, + "loss": 1.4163, + "step": 6025 + }, + { + "epoch": 1.0395928577589926, + "grad_norm": 0.57421875, + "learning_rate": 9.442304321043433e-06, + "loss": 1.4961, + "step": 6026 + }, + { + "epoch": 1.0397653756577245, + "grad_norm": 0.609375, + "learning_rate": 9.43958669896339e-06, + "loss": 1.3901, + "step": 6027 + }, + { + "epoch": 1.0399378935564565, + "grad_norm": 0.578125, + "learning_rate": 9.436869118402265e-06, + "loss": 1.4272, + "step": 6028 + }, + { + "epoch": 1.0401104114551885, + "grad_norm": 0.68359375, + "learning_rate": 9.434151579561397e-06, + "loss": 1.4163, + "step": 6029 + }, + { + "epoch": 1.0402829293539204, + "grad_norm": 0.6328125, + "learning_rate": 9.431434082642112e-06, + "loss": 1.5161, + "step": 6030 + }, + { + "epoch": 1.0404554472526524, + "grad_norm": 0.5859375, + "learning_rate": 9.428716627845742e-06, + "loss": 1.3931, + "step": 6031 + }, + { + "epoch": 1.0406279651513846, + "grad_norm": 0.6328125, + "learning_rate": 9.425999215373617e-06, + "loss": 1.4832, + "step": 6032 + }, + { + "epoch": 1.0408004830501165, + "grad_norm": 0.63671875, + "learning_rate": 9.423281845427056e-06, + "loss": 1.4823, + "step": 6033 + }, + { + "epoch": 1.0409730009488485, + "grad_norm": 0.58984375, + "learning_rate": 9.420564518207382e-06, + "loss": 1.4338, + "step": 6034 + }, + { + "epoch": 1.0411455188475804, + "grad_norm": 0.609375, + "learning_rate": 9.417847233915902e-06, + "loss": 1.5086, + "step": 6035 + }, + { + "epoch": 1.0413180367463124, + "grad_norm": 0.61328125, + "learning_rate": 9.415129992753943e-06, + "loss": 1.4712, + "step": 6036 + }, + { + "epoch": 1.0414905546450444, + "grad_norm": 0.60546875, + "learning_rate": 9.412412794922805e-06, + "loss": 1.4372, + "step": 6037 + }, + { + "epoch": 1.0416630725437763, + "grad_norm": 0.5390625, + "learning_rate": 9.409695640623802e-06, + "loss": 1.3527, + "step": 6038 + }, + { + "epoch": 1.0418355904425085, + "grad_norm": 0.6484375, + "learning_rate": 9.406978530058229e-06, + "loss": 1.4659, + "step": 6039 + }, + { + "epoch": 1.0420081083412405, + "grad_norm": 0.80859375, + "learning_rate": 9.404261463427396e-06, + "loss": 1.3717, + "step": 6040 + }, + { + "epoch": 1.0421806262399724, + "grad_norm": 0.91015625, + "learning_rate": 9.401544440932596e-06, + "loss": 1.3971, + "step": 6041 + }, + { + "epoch": 1.0423531441387044, + "grad_norm": 0.6796875, + "learning_rate": 9.398827462775122e-06, + "loss": 1.408, + "step": 6042 + }, + { + "epoch": 1.0425256620374364, + "grad_norm": 0.70703125, + "learning_rate": 9.396110529156263e-06, + "loss": 1.4351, + "step": 6043 + }, + { + "epoch": 1.0426981799361683, + "grad_norm": 0.609375, + "learning_rate": 9.393393640277316e-06, + "loss": 1.4595, + "step": 6044 + }, + { + "epoch": 1.0428706978349003, + "grad_norm": 0.57421875, + "learning_rate": 9.390676796339552e-06, + "loss": 1.4187, + "step": 6045 + }, + { + "epoch": 1.0430432157336325, + "grad_norm": 0.59375, + "learning_rate": 9.387959997544263e-06, + "loss": 1.3626, + "step": 6046 + }, + { + "epoch": 1.0432157336323644, + "grad_norm": 0.5859375, + "learning_rate": 9.385243244092717e-06, + "loss": 1.6359, + "step": 6047 + }, + { + "epoch": 1.0433882515310964, + "grad_norm": 0.6484375, + "learning_rate": 9.382526536186194e-06, + "loss": 1.3945, + "step": 6048 + }, + { + "epoch": 1.0435607694298283, + "grad_norm": 0.70703125, + "learning_rate": 9.379809874025967e-06, + "loss": 1.5335, + "step": 6049 + }, + { + "epoch": 1.0437332873285603, + "grad_norm": 0.57421875, + "learning_rate": 9.377093257813293e-06, + "loss": 1.3544, + "step": 6050 + }, + { + "epoch": 1.0439058052272923, + "grad_norm": 0.62109375, + "learning_rate": 9.374376687749449e-06, + "loss": 1.4395, + "step": 6051 + }, + { + "epoch": 1.0440783231260242, + "grad_norm": 0.63671875, + "learning_rate": 9.371660164035687e-06, + "loss": 1.4785, + "step": 6052 + }, + { + "epoch": 1.0442508410247564, + "grad_norm": 0.54296875, + "learning_rate": 9.368943686873266e-06, + "loss": 1.4134, + "step": 6053 + }, + { + "epoch": 1.0444233589234884, + "grad_norm": 0.65234375, + "learning_rate": 9.366227256463437e-06, + "loss": 1.4454, + "step": 6054 + }, + { + "epoch": 1.0445958768222203, + "grad_norm": 0.5859375, + "learning_rate": 9.363510873007458e-06, + "loss": 1.4406, + "step": 6055 + }, + { + "epoch": 1.0447683947209523, + "grad_norm": 0.69921875, + "learning_rate": 9.360794536706566e-06, + "loss": 1.5395, + "step": 6056 + }, + { + "epoch": 1.0449409126196842, + "grad_norm": 0.7109375, + "learning_rate": 9.358078247762012e-06, + "loss": 1.4343, + "step": 6057 + }, + { + "epoch": 1.0451134305184162, + "grad_norm": 0.5703125, + "learning_rate": 9.355362006375029e-06, + "loss": 1.4209, + "step": 6058 + }, + { + "epoch": 1.0452859484171482, + "grad_norm": 0.69921875, + "learning_rate": 9.35264581274686e-06, + "loss": 1.4438, + "step": 6059 + }, + { + "epoch": 1.0454584663158804, + "grad_norm": 0.6015625, + "learning_rate": 9.349929667078729e-06, + "loss": 1.4041, + "step": 6060 + }, + { + "epoch": 1.0456309842146123, + "grad_norm": 0.640625, + "learning_rate": 9.347213569571876e-06, + "loss": 1.4805, + "step": 6061 + }, + { + "epoch": 1.0458035021133443, + "grad_norm": 0.6015625, + "learning_rate": 9.344497520427517e-06, + "loss": 1.4596, + "step": 6062 + }, + { + "epoch": 1.0459760200120762, + "grad_norm": 0.6015625, + "learning_rate": 9.341781519846875e-06, + "loss": 1.4914, + "step": 6063 + }, + { + "epoch": 1.0461485379108082, + "grad_norm": 0.65625, + "learning_rate": 9.339065568031176e-06, + "loss": 1.4308, + "step": 6064 + }, + { + "epoch": 1.0463210558095402, + "grad_norm": 0.640625, + "learning_rate": 9.336349665181628e-06, + "loss": 1.3531, + "step": 6065 + }, + { + "epoch": 1.0464935737082723, + "grad_norm": 0.58984375, + "learning_rate": 9.333633811499444e-06, + "loss": 1.4219, + "step": 6066 + }, + { + "epoch": 1.0466660916070043, + "grad_norm": 0.6015625, + "learning_rate": 9.330918007185828e-06, + "loss": 1.4653, + "step": 6067 + }, + { + "epoch": 1.0468386095057363, + "grad_norm": 0.546875, + "learning_rate": 9.328202252441989e-06, + "loss": 1.332, + "step": 6068 + }, + { + "epoch": 1.0470111274044682, + "grad_norm": 0.69140625, + "learning_rate": 9.325486547469124e-06, + "loss": 1.3493, + "step": 6069 + }, + { + "epoch": 1.0471836453032002, + "grad_norm": 0.65234375, + "learning_rate": 9.322770892468433e-06, + "loss": 1.4375, + "step": 6070 + }, + { + "epoch": 1.0473561632019321, + "grad_norm": 0.6640625, + "learning_rate": 9.3200552876411e-06, + "loss": 1.3934, + "step": 6071 + }, + { + "epoch": 1.047528681100664, + "grad_norm": 0.60546875, + "learning_rate": 9.317339733188324e-06, + "loss": 1.4507, + "step": 6072 + }, + { + "epoch": 1.0477011989993963, + "grad_norm": 0.56640625, + "learning_rate": 9.314624229311285e-06, + "loss": 1.4125, + "step": 6073 + }, + { + "epoch": 1.0478737168981282, + "grad_norm": 0.609375, + "learning_rate": 9.311908776211167e-06, + "loss": 1.4712, + "step": 6074 + }, + { + "epoch": 1.0480462347968602, + "grad_norm": 0.59765625, + "learning_rate": 9.309193374089141e-06, + "loss": 1.4621, + "step": 6075 + }, + { + "epoch": 1.0482187526955922, + "grad_norm": 0.609375, + "learning_rate": 9.306478023146393e-06, + "loss": 1.5664, + "step": 6076 + }, + { + "epoch": 1.0483912705943241, + "grad_norm": 0.5625, + "learning_rate": 9.303762723584082e-06, + "loss": 1.4725, + "step": 6077 + }, + { + "epoch": 1.048563788493056, + "grad_norm": 0.63671875, + "learning_rate": 9.301047475603382e-06, + "loss": 1.3968, + "step": 6078 + }, + { + "epoch": 1.048736306391788, + "grad_norm": 0.75390625, + "learning_rate": 9.298332279405447e-06, + "loss": 1.4459, + "step": 6079 + }, + { + "epoch": 1.0489088242905202, + "grad_norm": 0.6171875, + "learning_rate": 9.295617135191445e-06, + "loss": 1.353, + "step": 6080 + }, + { + "epoch": 1.0490813421892522, + "grad_norm": 0.62109375, + "learning_rate": 9.292902043162526e-06, + "loss": 1.428, + "step": 6081 + }, + { + "epoch": 1.0492538600879842, + "grad_norm": 0.59375, + "learning_rate": 9.290187003519841e-06, + "loss": 1.3785, + "step": 6082 + }, + { + "epoch": 1.0494263779867161, + "grad_norm": 0.5859375, + "learning_rate": 9.28747201646454e-06, + "loss": 1.4798, + "step": 6083 + }, + { + "epoch": 1.049598895885448, + "grad_norm": 0.59765625, + "learning_rate": 9.284757082197763e-06, + "loss": 1.3638, + "step": 6084 + }, + { + "epoch": 1.04977141378418, + "grad_norm": 0.6796875, + "learning_rate": 9.282042200920652e-06, + "loss": 1.3908, + "step": 6085 + }, + { + "epoch": 1.049943931682912, + "grad_norm": 0.6015625, + "learning_rate": 9.279327372834335e-06, + "loss": 1.3872, + "step": 6086 + }, + { + "epoch": 1.0501164495816442, + "grad_norm": 0.63671875, + "learning_rate": 9.276612598139956e-06, + "loss": 1.5071, + "step": 6087 + }, + { + "epoch": 1.0502889674803761, + "grad_norm": 0.58203125, + "learning_rate": 9.27389787703863e-06, + "loss": 1.454, + "step": 6088 + }, + { + "epoch": 1.050461485379108, + "grad_norm": 0.59765625, + "learning_rate": 9.27118320973149e-06, + "loss": 1.4289, + "step": 6089 + }, + { + "epoch": 1.05063400327784, + "grad_norm": 0.6875, + "learning_rate": 9.268468596419651e-06, + "loss": 1.473, + "step": 6090 + }, + { + "epoch": 1.050806521176572, + "grad_norm": 1.1171875, + "learning_rate": 9.265754037304232e-06, + "loss": 1.4485, + "step": 6091 + }, + { + "epoch": 1.050979039075304, + "grad_norm": 0.62890625, + "learning_rate": 9.263039532586336e-06, + "loss": 1.4458, + "step": 6092 + }, + { + "epoch": 1.051151556974036, + "grad_norm": 0.578125, + "learning_rate": 9.26032508246708e-06, + "loss": 1.3861, + "step": 6093 + }, + { + "epoch": 1.0513240748727681, + "grad_norm": 0.58984375, + "learning_rate": 9.257610687147562e-06, + "loss": 1.4952, + "step": 6094 + }, + { + "epoch": 1.0514965927715, + "grad_norm": 0.56640625, + "learning_rate": 9.25489634682888e-06, + "loss": 1.419, + "step": 6095 + }, + { + "epoch": 1.051669110670232, + "grad_norm": 0.53515625, + "learning_rate": 9.252182061712137e-06, + "loss": 1.3722, + "step": 6096 + }, + { + "epoch": 1.051841628568964, + "grad_norm": 0.59375, + "learning_rate": 9.249467831998417e-06, + "loss": 1.4752, + "step": 6097 + }, + { + "epoch": 1.052014146467696, + "grad_norm": 0.58984375, + "learning_rate": 9.246753657888814e-06, + "loss": 1.4241, + "step": 6098 + }, + { + "epoch": 1.052186664366428, + "grad_norm": 0.640625, + "learning_rate": 9.244039539584399e-06, + "loss": 1.3496, + "step": 6099 + }, + { + "epoch": 1.05235918226516, + "grad_norm": 0.58203125, + "learning_rate": 9.241325477286265e-06, + "loss": 1.4379, + "step": 6100 + }, + { + "epoch": 1.05235918226516, + "eval_loss": 1.4098589420318604, + "eval_runtime": 10.8253, + "eval_samples_per_second": 94.593, + "eval_steps_per_second": 23.648, + "step": 6100 + }, + { + "epoch": 1.052531700163892, + "grad_norm": 0.58203125, + "learning_rate": 9.238611471195477e-06, + "loss": 1.3604, + "step": 6101 + }, + { + "epoch": 1.052704218062624, + "grad_norm": 0.59765625, + "learning_rate": 9.235897521513111e-06, + "loss": 1.401, + "step": 6102 + }, + { + "epoch": 1.052876735961356, + "grad_norm": 0.609375, + "learning_rate": 9.233183628440227e-06, + "loss": 1.4434, + "step": 6103 + }, + { + "epoch": 1.053049253860088, + "grad_norm": 0.59765625, + "learning_rate": 9.230469792177894e-06, + "loss": 1.3778, + "step": 6104 + }, + { + "epoch": 1.05322177175882, + "grad_norm": 0.62109375, + "learning_rate": 9.227756012927166e-06, + "loss": 1.3925, + "step": 6105 + }, + { + "epoch": 1.0533942896575519, + "grad_norm": 0.62109375, + "learning_rate": 9.2250422908891e-06, + "loss": 1.4522, + "step": 6106 + }, + { + "epoch": 1.053566807556284, + "grad_norm": 0.5390625, + "learning_rate": 9.222328626264738e-06, + "loss": 1.3521, + "step": 6107 + }, + { + "epoch": 1.053739325455016, + "grad_norm": 0.59375, + "learning_rate": 9.219615019255136e-06, + "loss": 1.4829, + "step": 6108 + }, + { + "epoch": 1.053911843353748, + "grad_norm": 0.6796875, + "learning_rate": 9.216901470061326e-06, + "loss": 1.3929, + "step": 6109 + }, + { + "epoch": 1.05408436125248, + "grad_norm": 0.58984375, + "learning_rate": 9.21418797888435e-06, + "loss": 1.4325, + "step": 6110 + }, + { + "epoch": 1.054256879151212, + "grad_norm": 0.6015625, + "learning_rate": 9.211474545925237e-06, + "loss": 1.4691, + "step": 6111 + }, + { + "epoch": 1.0544293970499439, + "grad_norm": 0.59765625, + "learning_rate": 9.208761171385017e-06, + "loss": 1.4809, + "step": 6112 + }, + { + "epoch": 1.0546019149486758, + "grad_norm": 0.5859375, + "learning_rate": 9.206047855464715e-06, + "loss": 1.4147, + "step": 6113 + }, + { + "epoch": 1.054774432847408, + "grad_norm": 0.59765625, + "learning_rate": 9.203334598365345e-06, + "loss": 1.3983, + "step": 6114 + }, + { + "epoch": 1.05494695074614, + "grad_norm": 0.71875, + "learning_rate": 9.200621400287929e-06, + "loss": 1.4555, + "step": 6115 + }, + { + "epoch": 1.055119468644872, + "grad_norm": 0.5703125, + "learning_rate": 9.19790826143347e-06, + "loss": 1.4068, + "step": 6116 + }, + { + "epoch": 1.055291986543604, + "grad_norm": 0.65234375, + "learning_rate": 9.195195182002987e-06, + "loss": 1.381, + "step": 6117 + }, + { + "epoch": 1.0554645044423359, + "grad_norm": 0.58203125, + "learning_rate": 9.192482162197466e-06, + "loss": 1.4942, + "step": 6118 + }, + { + "epoch": 1.0556370223410678, + "grad_norm": 0.59375, + "learning_rate": 9.189769202217918e-06, + "loss": 1.4388, + "step": 6119 + }, + { + "epoch": 1.0558095402397998, + "grad_norm": 0.61328125, + "learning_rate": 9.187056302265324e-06, + "loss": 1.5254, + "step": 6120 + }, + { + "epoch": 1.055982058138532, + "grad_norm": 0.6171875, + "learning_rate": 9.184343462540683e-06, + "loss": 1.3021, + "step": 6121 + }, + { + "epoch": 1.056154576037264, + "grad_norm": 0.6015625, + "learning_rate": 9.181630683244972e-06, + "loss": 1.3728, + "step": 6122 + }, + { + "epoch": 1.0563270939359959, + "grad_norm": 0.609375, + "learning_rate": 9.178917964579176e-06, + "loss": 1.3719, + "step": 6123 + }, + { + "epoch": 1.0564996118347278, + "grad_norm": 0.5859375, + "learning_rate": 9.176205306744265e-06, + "loss": 1.4278, + "step": 6124 + }, + { + "epoch": 1.0566721297334598, + "grad_norm": 0.62109375, + "learning_rate": 9.173492709941215e-06, + "loss": 1.459, + "step": 6125 + }, + { + "epoch": 1.0568446476321918, + "grad_norm": 0.65625, + "learning_rate": 9.170780174370988e-06, + "loss": 1.4969, + "step": 6126 + }, + { + "epoch": 1.0570171655309237, + "grad_norm": 0.625, + "learning_rate": 9.168067700234542e-06, + "loss": 1.4081, + "step": 6127 + }, + { + "epoch": 1.057189683429656, + "grad_norm": 0.67578125, + "learning_rate": 9.165355287732846e-06, + "loss": 1.3269, + "step": 6128 + }, + { + "epoch": 1.0573622013283879, + "grad_norm": 0.609375, + "learning_rate": 9.162642937066843e-06, + "loss": 1.3879, + "step": 6129 + }, + { + "epoch": 1.0575347192271198, + "grad_norm": 0.60546875, + "learning_rate": 9.159930648437484e-06, + "loss": 1.5195, + "step": 6130 + }, + { + "epoch": 1.0577072371258518, + "grad_norm": 0.6328125, + "learning_rate": 9.157218422045708e-06, + "loss": 1.5369, + "step": 6131 + }, + { + "epoch": 1.0578797550245838, + "grad_norm": 0.69921875, + "learning_rate": 9.154506258092462e-06, + "loss": 1.4377, + "step": 6132 + }, + { + "epoch": 1.0580522729233157, + "grad_norm": 0.65234375, + "learning_rate": 9.151794156778673e-06, + "loss": 1.4826, + "step": 6133 + }, + { + "epoch": 1.058224790822048, + "grad_norm": 0.5546875, + "learning_rate": 9.149082118305274e-06, + "loss": 1.4615, + "step": 6134 + }, + { + "epoch": 1.0583973087207799, + "grad_norm": 0.58203125, + "learning_rate": 9.146370142873185e-06, + "loss": 1.5071, + "step": 6135 + }, + { + "epoch": 1.0585698266195118, + "grad_norm": 0.6015625, + "learning_rate": 9.143658230683335e-06, + "loss": 1.412, + "step": 6136 + }, + { + "epoch": 1.0587423445182438, + "grad_norm": 0.640625, + "learning_rate": 9.140946381936629e-06, + "loss": 1.322, + "step": 6137 + }, + { + "epoch": 1.0589148624169757, + "grad_norm": 0.625, + "learning_rate": 9.138234596833987e-06, + "loss": 1.4451, + "step": 6138 + }, + { + "epoch": 1.0590873803157077, + "grad_norm": 0.5703125, + "learning_rate": 9.135522875576305e-06, + "loss": 1.3518, + "step": 6139 + }, + { + "epoch": 1.0592598982144397, + "grad_norm": 0.74609375, + "learning_rate": 9.132811218364494e-06, + "loss": 1.46, + "step": 6140 + }, + { + "epoch": 1.0594324161131718, + "grad_norm": 0.8125, + "learning_rate": 9.130099625399446e-06, + "loss": 1.4321, + "step": 6141 + }, + { + "epoch": 1.0596049340119038, + "grad_norm": 0.63671875, + "learning_rate": 9.127388096882054e-06, + "loss": 1.4989, + "step": 6142 + }, + { + "epoch": 1.0597774519106358, + "grad_norm": 0.796875, + "learning_rate": 9.1246766330132e-06, + "loss": 1.4134, + "step": 6143 + }, + { + "epoch": 1.0599499698093677, + "grad_norm": 0.609375, + "learning_rate": 9.121965233993773e-06, + "loss": 1.3734, + "step": 6144 + }, + { + "epoch": 1.0601224877080997, + "grad_norm": 0.671875, + "learning_rate": 9.119253900024649e-06, + "loss": 1.4286, + "step": 6145 + }, + { + "epoch": 1.0602950056068317, + "grad_norm": 0.9609375, + "learning_rate": 9.116542631306695e-06, + "loss": 1.5147, + "step": 6146 + }, + { + "epoch": 1.0604675235055636, + "grad_norm": 0.6484375, + "learning_rate": 9.113831428040789e-06, + "loss": 1.4119, + "step": 6147 + }, + { + "epoch": 1.0606400414042958, + "grad_norm": 0.6171875, + "learning_rate": 9.11112029042778e-06, + "loss": 1.4222, + "step": 6148 + }, + { + "epoch": 1.0608125593030278, + "grad_norm": 0.65234375, + "learning_rate": 9.108409218668542e-06, + "loss": 1.3741, + "step": 6149 + }, + { + "epoch": 1.0609850772017597, + "grad_norm": 0.58984375, + "learning_rate": 9.105698212963915e-06, + "loss": 1.4079, + "step": 6150 + }, + { + "epoch": 1.0611575951004917, + "grad_norm": 0.6953125, + "learning_rate": 9.102987273514757e-06, + "loss": 1.4237, + "step": 6151 + }, + { + "epoch": 1.0613301129992236, + "grad_norm": 0.6171875, + "learning_rate": 9.100276400521898e-06, + "loss": 1.3756, + "step": 6152 + }, + { + "epoch": 1.0615026308979556, + "grad_norm": 0.60546875, + "learning_rate": 9.097565594186194e-06, + "loss": 1.4186, + "step": 6153 + }, + { + "epoch": 1.0616751487966876, + "grad_norm": 0.72265625, + "learning_rate": 9.094854854708464e-06, + "loss": 1.4798, + "step": 6154 + }, + { + "epoch": 1.0618476666954197, + "grad_norm": 0.609375, + "learning_rate": 9.092144182289546e-06, + "loss": 1.4217, + "step": 6155 + }, + { + "epoch": 1.0620201845941517, + "grad_norm": 0.59375, + "learning_rate": 9.089433577130256e-06, + "loss": 1.4153, + "step": 6156 + }, + { + "epoch": 1.0621927024928837, + "grad_norm": 1.125, + "learning_rate": 9.086723039431418e-06, + "loss": 1.3932, + "step": 6157 + }, + { + "epoch": 1.0623652203916156, + "grad_norm": 0.6640625, + "learning_rate": 9.084012569393842e-06, + "loss": 1.4848, + "step": 6158 + }, + { + "epoch": 1.0625377382903476, + "grad_norm": 0.6015625, + "learning_rate": 9.081302167218339e-06, + "loss": 1.4844, + "step": 6159 + }, + { + "epoch": 1.0627102561890795, + "grad_norm": 0.6328125, + "learning_rate": 9.078591833105712e-06, + "loss": 1.4191, + "step": 6160 + }, + { + "epoch": 1.0628827740878115, + "grad_norm": 0.578125, + "learning_rate": 9.075881567256759e-06, + "loss": 1.3469, + "step": 6161 + }, + { + "epoch": 1.0630552919865437, + "grad_norm": 0.7421875, + "learning_rate": 9.073171369872275e-06, + "loss": 1.4488, + "step": 6162 + }, + { + "epoch": 1.0632278098852757, + "grad_norm": 0.6015625, + "learning_rate": 9.070461241153044e-06, + "loss": 1.3692, + "step": 6163 + }, + { + "epoch": 1.0634003277840076, + "grad_norm": 0.55859375, + "learning_rate": 9.067751181299856e-06, + "loss": 1.3982, + "step": 6164 + }, + { + "epoch": 1.0635728456827396, + "grad_norm": 0.73046875, + "learning_rate": 9.065041190513483e-06, + "loss": 1.406, + "step": 6165 + }, + { + "epoch": 1.0637453635814715, + "grad_norm": 0.61328125, + "learning_rate": 9.062331268994704e-06, + "loss": 1.4365, + "step": 6166 + }, + { + "epoch": 1.0639178814802035, + "grad_norm": 0.640625, + "learning_rate": 9.059621416944277e-06, + "loss": 1.3919, + "step": 6167 + }, + { + "epoch": 1.0640903993789355, + "grad_norm": 0.6171875, + "learning_rate": 9.056911634562975e-06, + "loss": 1.2976, + "step": 6168 + }, + { + "epoch": 1.0642629172776676, + "grad_norm": 0.625, + "learning_rate": 9.054201922051552e-06, + "loss": 1.3825, + "step": 6169 + }, + { + "epoch": 1.0644354351763996, + "grad_norm": 0.53125, + "learning_rate": 9.051492279610763e-06, + "loss": 1.3458, + "step": 6170 + }, + { + "epoch": 1.0646079530751316, + "grad_norm": 0.6171875, + "learning_rate": 9.048782707441346e-06, + "loss": 1.3666, + "step": 6171 + }, + { + "epoch": 1.0647804709738635, + "grad_norm": 0.6015625, + "learning_rate": 9.046073205744053e-06, + "loss": 1.4677, + "step": 6172 + }, + { + "epoch": 1.0649529888725955, + "grad_norm": 0.6328125, + "learning_rate": 9.043363774719618e-06, + "loss": 1.4026, + "step": 6173 + }, + { + "epoch": 1.0651255067713274, + "grad_norm": 0.6171875, + "learning_rate": 9.040654414568772e-06, + "loss": 1.5799, + "step": 6174 + }, + { + "epoch": 1.0652980246700596, + "grad_norm": 0.5859375, + "learning_rate": 9.037945125492238e-06, + "loss": 1.4668, + "step": 6175 + }, + { + "epoch": 1.0654705425687916, + "grad_norm": 0.6484375, + "learning_rate": 9.035235907690739e-06, + "loss": 1.4445, + "step": 6176 + }, + { + "epoch": 1.0656430604675236, + "grad_norm": 0.6171875, + "learning_rate": 9.032526761364999e-06, + "loss": 1.4619, + "step": 6177 + }, + { + "epoch": 1.0658155783662555, + "grad_norm": 0.6484375, + "learning_rate": 9.029817686715717e-06, + "loss": 1.4089, + "step": 6178 + }, + { + "epoch": 1.0659880962649875, + "grad_norm": 0.62890625, + "learning_rate": 9.027108683943605e-06, + "loss": 1.4093, + "step": 6179 + }, + { + "epoch": 1.0661606141637194, + "grad_norm": 0.66796875, + "learning_rate": 9.024399753249358e-06, + "loss": 1.4857, + "step": 6180 + }, + { + "epoch": 1.0663331320624514, + "grad_norm": 0.578125, + "learning_rate": 9.021690894833676e-06, + "loss": 1.4078, + "step": 6181 + }, + { + "epoch": 1.0665056499611834, + "grad_norm": 0.6015625, + "learning_rate": 9.018982108897243e-06, + "loss": 1.4882, + "step": 6182 + }, + { + "epoch": 1.0666781678599155, + "grad_norm": 0.5703125, + "learning_rate": 9.016273395640748e-06, + "loss": 1.362, + "step": 6183 + }, + { + "epoch": 1.0668506857586475, + "grad_norm": 0.80859375, + "learning_rate": 9.013564755264862e-06, + "loss": 1.411, + "step": 6184 + }, + { + "epoch": 1.0670232036573795, + "grad_norm": 0.69140625, + "learning_rate": 9.010856187970267e-06, + "loss": 1.3623, + "step": 6185 + }, + { + "epoch": 1.0671957215561114, + "grad_norm": 0.6875, + "learning_rate": 9.008147693957624e-06, + "loss": 1.4222, + "step": 6186 + }, + { + "epoch": 1.0673682394548434, + "grad_norm": 0.578125, + "learning_rate": 9.005439273427597e-06, + "loss": 1.3998, + "step": 6187 + }, + { + "epoch": 1.0675407573535753, + "grad_norm": 0.59375, + "learning_rate": 9.00273092658084e-06, + "loss": 1.4059, + "step": 6188 + }, + { + "epoch": 1.0677132752523075, + "grad_norm": 0.6015625, + "learning_rate": 9.000022653618012e-06, + "loss": 1.3935, + "step": 6189 + }, + { + "epoch": 1.0678857931510395, + "grad_norm": 0.8046875, + "learning_rate": 8.997314454739752e-06, + "loss": 1.2628, + "step": 6190 + }, + { + "epoch": 1.0680583110497714, + "grad_norm": 0.64453125, + "learning_rate": 8.9946063301467e-06, + "loss": 1.4216, + "step": 6191 + }, + { + "epoch": 1.0682308289485034, + "grad_norm": 0.62890625, + "learning_rate": 8.991898280039498e-06, + "loss": 1.4487, + "step": 6192 + }, + { + "epoch": 1.0684033468472354, + "grad_norm": 0.61328125, + "learning_rate": 8.989190304618767e-06, + "loss": 1.4705, + "step": 6193 + }, + { + "epoch": 1.0685758647459673, + "grad_norm": 0.60546875, + "learning_rate": 8.986482404085137e-06, + "loss": 1.4954, + "step": 6194 + }, + { + "epoch": 1.0687483826446993, + "grad_norm": 1.3515625, + "learning_rate": 8.983774578639219e-06, + "loss": 1.4177, + "step": 6195 + }, + { + "epoch": 1.0689209005434315, + "grad_norm": 0.55859375, + "learning_rate": 8.981066828481635e-06, + "loss": 1.4546, + "step": 6196 + }, + { + "epoch": 1.0690934184421634, + "grad_norm": 0.58984375, + "learning_rate": 8.978359153812982e-06, + "loss": 1.4029, + "step": 6197 + }, + { + "epoch": 1.0692659363408954, + "grad_norm": 0.6328125, + "learning_rate": 8.975651554833869e-06, + "loss": 1.3723, + "step": 6198 + }, + { + "epoch": 1.0694384542396274, + "grad_norm": 0.625, + "learning_rate": 8.972944031744886e-06, + "loss": 1.4171, + "step": 6199 + }, + { + "epoch": 1.0696109721383593, + "grad_norm": 0.578125, + "learning_rate": 8.970236584746631e-06, + "loss": 1.4983, + "step": 6200 + }, + { + "epoch": 1.0696109721383593, + "eval_loss": 1.4095999002456665, + "eval_runtime": 11.2941, + "eval_samples_per_second": 90.667, + "eval_steps_per_second": 22.667, + "step": 6200 + }, + { + "epoch": 1.0697834900370913, + "grad_norm": 0.609375, + "learning_rate": 8.967529214039682e-06, + "loss": 1.3528, + "step": 6201 + }, + { + "epoch": 1.0699560079358232, + "grad_norm": 0.734375, + "learning_rate": 8.964821919824623e-06, + "loss": 1.4585, + "step": 6202 + }, + { + "epoch": 1.0701285258345554, + "grad_norm": 6.25, + "learning_rate": 8.962114702302018e-06, + "loss": 1.4366, + "step": 6203 + }, + { + "epoch": 1.0703010437332874, + "grad_norm": 0.640625, + "learning_rate": 8.959407561672447e-06, + "loss": 1.4571, + "step": 6204 + }, + { + "epoch": 1.0704735616320193, + "grad_norm": 0.546875, + "learning_rate": 8.956700498136461e-06, + "loss": 1.4125, + "step": 6205 + }, + { + "epoch": 1.0706460795307513, + "grad_norm": 0.609375, + "learning_rate": 8.953993511894626e-06, + "loss": 1.4351, + "step": 6206 + }, + { + "epoch": 1.0708185974294833, + "grad_norm": 0.58984375, + "learning_rate": 8.951286603147481e-06, + "loss": 1.4297, + "step": 6207 + }, + { + "epoch": 1.0709911153282152, + "grad_norm": 1.765625, + "learning_rate": 8.948579772095578e-06, + "loss": 1.4696, + "step": 6208 + }, + { + "epoch": 1.0711636332269472, + "grad_norm": 0.625, + "learning_rate": 8.94587301893946e-06, + "loss": 1.4402, + "step": 6209 + }, + { + "epoch": 1.0713361511256794, + "grad_norm": 0.63671875, + "learning_rate": 8.943166343879652e-06, + "loss": 1.4923, + "step": 6210 + }, + { + "epoch": 1.0715086690244113, + "grad_norm": 0.57421875, + "learning_rate": 8.940459747116688e-06, + "loss": 1.403, + "step": 6211 + }, + { + "epoch": 1.0716811869231433, + "grad_norm": 0.6875, + "learning_rate": 8.93775322885108e-06, + "loss": 1.4063, + "step": 6212 + }, + { + "epoch": 1.0718537048218753, + "grad_norm": 0.70703125, + "learning_rate": 8.935046789283356e-06, + "loss": 1.4808, + "step": 6213 + }, + { + "epoch": 1.0720262227206072, + "grad_norm": 0.609375, + "learning_rate": 8.932340428614016e-06, + "loss": 1.4331, + "step": 6214 + }, + { + "epoch": 1.0721987406193392, + "grad_norm": 0.63671875, + "learning_rate": 8.929634147043573e-06, + "loss": 1.4485, + "step": 6215 + }, + { + "epoch": 1.0723712585180714, + "grad_norm": 0.6328125, + "learning_rate": 8.926927944772514e-06, + "loss": 1.4356, + "step": 6216 + }, + { + "epoch": 1.0725437764168033, + "grad_norm": 0.62890625, + "learning_rate": 8.924221822001342e-06, + "loss": 1.4756, + "step": 6217 + }, + { + "epoch": 1.0727162943155353, + "grad_norm": 0.7109375, + "learning_rate": 8.921515778930538e-06, + "loss": 1.4703, + "step": 6218 + }, + { + "epoch": 1.0728888122142672, + "grad_norm": 0.6640625, + "learning_rate": 8.918809815760585e-06, + "loss": 1.3884, + "step": 6219 + }, + { + "epoch": 1.0730613301129992, + "grad_norm": 0.578125, + "learning_rate": 8.916103932691953e-06, + "loss": 1.4818, + "step": 6220 + }, + { + "epoch": 1.0732338480117312, + "grad_norm": 0.65625, + "learning_rate": 8.913398129925118e-06, + "loss": 1.5478, + "step": 6221 + }, + { + "epoch": 1.0734063659104631, + "grad_norm": 0.7109375, + "learning_rate": 8.910692407660538e-06, + "loss": 1.3961, + "step": 6222 + }, + { + "epoch": 1.0735788838091953, + "grad_norm": 0.7421875, + "learning_rate": 8.907986766098666e-06, + "loss": 1.5076, + "step": 6223 + }, + { + "epoch": 1.0737514017079273, + "grad_norm": 0.69140625, + "learning_rate": 8.905281205439965e-06, + "loss": 1.4828, + "step": 6224 + }, + { + "epoch": 1.0739239196066592, + "grad_norm": 0.5859375, + "learning_rate": 8.902575725884867e-06, + "loss": 1.41, + "step": 6225 + }, + { + "epoch": 1.0740964375053912, + "grad_norm": 0.734375, + "learning_rate": 8.89987032763382e-06, + "loss": 1.431, + "step": 6226 + }, + { + "epoch": 1.0742689554041231, + "grad_norm": 0.7578125, + "learning_rate": 8.89716501088725e-06, + "loss": 1.3323, + "step": 6227 + }, + { + "epoch": 1.074441473302855, + "grad_norm": 0.6875, + "learning_rate": 8.89445977584559e-06, + "loss": 1.4044, + "step": 6228 + }, + { + "epoch": 1.074613991201587, + "grad_norm": 0.58984375, + "learning_rate": 8.891754622709254e-06, + "loss": 1.4322, + "step": 6229 + }, + { + "epoch": 1.0747865091003193, + "grad_norm": 0.59375, + "learning_rate": 8.889049551678664e-06, + "loss": 1.4238, + "step": 6230 + }, + { + "epoch": 1.0749590269990512, + "grad_norm": 0.60546875, + "learning_rate": 8.886344562954221e-06, + "loss": 1.3572, + "step": 6231 + }, + { + "epoch": 1.0751315448977832, + "grad_norm": 0.734375, + "learning_rate": 8.883639656736334e-06, + "loss": 1.4339, + "step": 6232 + }, + { + "epoch": 1.0753040627965151, + "grad_norm": 0.6171875, + "learning_rate": 8.880934833225395e-06, + "loss": 1.4185, + "step": 6233 + }, + { + "epoch": 1.075476580695247, + "grad_norm": 0.625, + "learning_rate": 8.878230092621799e-06, + "loss": 1.4326, + "step": 6234 + }, + { + "epoch": 1.075649098593979, + "grad_norm": 0.64453125, + "learning_rate": 8.87552543512592e-06, + "loss": 1.4075, + "step": 6235 + }, + { + "epoch": 1.075821616492711, + "grad_norm": 0.60546875, + "learning_rate": 8.872820860938149e-06, + "loss": 1.315, + "step": 6236 + }, + { + "epoch": 1.0759941343914432, + "grad_norm": 0.66796875, + "learning_rate": 8.870116370258847e-06, + "loss": 1.3736, + "step": 6237 + }, + { + "epoch": 1.0761666522901752, + "grad_norm": 0.70703125, + "learning_rate": 8.86741196328839e-06, + "loss": 1.4035, + "step": 6238 + }, + { + "epoch": 1.0763391701889071, + "grad_norm": 0.6015625, + "learning_rate": 8.864707640227127e-06, + "loss": 1.3408, + "step": 6239 + }, + { + "epoch": 1.076511688087639, + "grad_norm": 0.640625, + "learning_rate": 8.862003401275414e-06, + "loss": 1.4941, + "step": 6240 + }, + { + "epoch": 1.076684205986371, + "grad_norm": 0.71484375, + "learning_rate": 8.859299246633604e-06, + "loss": 1.3956, + "step": 6241 + }, + { + "epoch": 1.076856723885103, + "grad_norm": 0.59765625, + "learning_rate": 8.85659517650203e-06, + "loss": 1.329, + "step": 6242 + }, + { + "epoch": 1.0770292417838352, + "grad_norm": 0.60546875, + "learning_rate": 8.853891191081035e-06, + "loss": 1.3659, + "step": 6243 + }, + { + "epoch": 1.0772017596825672, + "grad_norm": 0.60546875, + "learning_rate": 8.851187290570934e-06, + "loss": 1.4607, + "step": 6244 + }, + { + "epoch": 1.0773742775812991, + "grad_norm": 0.6015625, + "learning_rate": 8.848483475172063e-06, + "loss": 1.4711, + "step": 6245 + }, + { + "epoch": 1.077546795480031, + "grad_norm": 0.578125, + "learning_rate": 8.845779745084728e-06, + "loss": 1.4921, + "step": 6246 + }, + { + "epoch": 1.077719313378763, + "grad_norm": 0.56640625, + "learning_rate": 8.843076100509244e-06, + "loss": 1.4246, + "step": 6247 + }, + { + "epoch": 1.077891831277495, + "grad_norm": 0.609375, + "learning_rate": 8.840372541645907e-06, + "loss": 1.4466, + "step": 6248 + }, + { + "epoch": 1.078064349176227, + "grad_norm": 0.69140625, + "learning_rate": 8.837669068695023e-06, + "loss": 1.4841, + "step": 6249 + }, + { + "epoch": 1.078236867074959, + "grad_norm": 0.62890625, + "learning_rate": 8.834965681856873e-06, + "loss": 1.4025, + "step": 6250 + }, + { + "epoch": 1.078409384973691, + "grad_norm": 0.62109375, + "learning_rate": 8.832262381331747e-06, + "loss": 1.4657, + "step": 6251 + }, + { + "epoch": 1.078581902872423, + "grad_norm": 0.66796875, + "learning_rate": 8.829559167319917e-06, + "loss": 1.3731, + "step": 6252 + }, + { + "epoch": 1.078754420771155, + "grad_norm": 0.5859375, + "learning_rate": 8.826856040021661e-06, + "loss": 1.5165, + "step": 6253 + }, + { + "epoch": 1.078926938669887, + "grad_norm": 0.578125, + "learning_rate": 8.824152999637237e-06, + "loss": 1.3957, + "step": 6254 + }, + { + "epoch": 1.079099456568619, + "grad_norm": 0.6328125, + "learning_rate": 8.821450046366909e-06, + "loss": 1.4961, + "step": 6255 + }, + { + "epoch": 1.079271974467351, + "grad_norm": 0.57421875, + "learning_rate": 8.81874718041092e-06, + "loss": 1.2706, + "step": 6256 + }, + { + "epoch": 1.079444492366083, + "grad_norm": 0.6328125, + "learning_rate": 8.816044401969524e-06, + "loss": 1.5226, + "step": 6257 + }, + { + "epoch": 1.079617010264815, + "grad_norm": 0.578125, + "learning_rate": 8.813341711242959e-06, + "loss": 1.4211, + "step": 6258 + }, + { + "epoch": 1.079789528163547, + "grad_norm": 0.703125, + "learning_rate": 8.810639108431449e-06, + "loss": 1.4257, + "step": 6259 + }, + { + "epoch": 1.079962046062279, + "grad_norm": 0.65625, + "learning_rate": 8.80793659373523e-06, + "loss": 1.5166, + "step": 6260 + }, + { + "epoch": 1.080134563961011, + "grad_norm": 0.625, + "learning_rate": 8.805234167354515e-06, + "loss": 1.4059, + "step": 6261 + }, + { + "epoch": 1.080307081859743, + "grad_norm": 0.62890625, + "learning_rate": 8.80253182948952e-06, + "loss": 1.4157, + "step": 6262 + }, + { + "epoch": 1.0804795997584749, + "grad_norm": 0.68359375, + "learning_rate": 8.799829580340444e-06, + "loss": 1.4555, + "step": 6263 + }, + { + "epoch": 1.080652117657207, + "grad_norm": 0.65234375, + "learning_rate": 8.797127420107496e-06, + "loss": 1.4794, + "step": 6264 + }, + { + "epoch": 1.080824635555939, + "grad_norm": 0.5859375, + "learning_rate": 8.794425348990861e-06, + "loss": 1.4365, + "step": 6265 + }, + { + "epoch": 1.080997153454671, + "grad_norm": 0.578125, + "learning_rate": 8.791723367190736e-06, + "loss": 1.4629, + "step": 6266 + }, + { + "epoch": 1.081169671353403, + "grad_norm": 0.578125, + "learning_rate": 8.789021474907286e-06, + "loss": 1.4438, + "step": 6267 + }, + { + "epoch": 1.0813421892521349, + "grad_norm": 0.5859375, + "learning_rate": 8.786319672340696e-06, + "loss": 1.4537, + "step": 6268 + }, + { + "epoch": 1.0815147071508668, + "grad_norm": 0.6328125, + "learning_rate": 8.783617959691126e-06, + "loss": 1.4335, + "step": 6269 + }, + { + "epoch": 1.0816872250495988, + "grad_norm": 0.59765625, + "learning_rate": 8.780916337158739e-06, + "loss": 1.4098, + "step": 6270 + }, + { + "epoch": 1.081859742948331, + "grad_norm": 0.57421875, + "learning_rate": 8.778214804943687e-06, + "loss": 1.4097, + "step": 6271 + }, + { + "epoch": 1.082032260847063, + "grad_norm": 0.59765625, + "learning_rate": 8.775513363246113e-06, + "loss": 1.4887, + "step": 6272 + }, + { + "epoch": 1.082204778745795, + "grad_norm": 0.59765625, + "learning_rate": 8.772812012266165e-06, + "loss": 1.3853, + "step": 6273 + }, + { + "epoch": 1.0823772966445269, + "grad_norm": 0.60546875, + "learning_rate": 8.770110752203968e-06, + "loss": 1.3535, + "step": 6274 + }, + { + "epoch": 1.0825498145432588, + "grad_norm": 0.61328125, + "learning_rate": 8.767409583259654e-06, + "loss": 1.4843, + "step": 6275 + }, + { + "epoch": 1.0827223324419908, + "grad_norm": 1.078125, + "learning_rate": 8.764708505633334e-06, + "loss": 1.4779, + "step": 6276 + }, + { + "epoch": 1.0828948503407227, + "grad_norm": 0.60546875, + "learning_rate": 8.762007519525132e-06, + "loss": 1.306, + "step": 6277 + }, + { + "epoch": 1.083067368239455, + "grad_norm": 0.61328125, + "learning_rate": 8.759306625135147e-06, + "loss": 1.4475, + "step": 6278 + }, + { + "epoch": 1.083239886138187, + "grad_norm": 0.6796875, + "learning_rate": 8.75660582266348e-06, + "loss": 1.4575, + "step": 6279 + }, + { + "epoch": 1.0834124040369189, + "grad_norm": 0.58203125, + "learning_rate": 8.753905112310217e-06, + "loss": 1.3896, + "step": 6280 + }, + { + "epoch": 1.0835849219356508, + "grad_norm": 0.64453125, + "learning_rate": 8.751204494275457e-06, + "loss": 1.3266, + "step": 6281 + }, + { + "epoch": 1.0837574398343828, + "grad_norm": 0.6171875, + "learning_rate": 8.748503968759267e-06, + "loss": 1.4157, + "step": 6282 + }, + { + "epoch": 1.0839299577331147, + "grad_norm": 0.53125, + "learning_rate": 8.745803535961725e-06, + "loss": 1.3011, + "step": 6283 + }, + { + "epoch": 1.084102475631847, + "grad_norm": 0.59765625, + "learning_rate": 8.74310319608289e-06, + "loss": 1.3948, + "step": 6284 + }, + { + "epoch": 1.0842749935305789, + "grad_norm": 0.6171875, + "learning_rate": 8.740402949322827e-06, + "loss": 1.3987, + "step": 6285 + }, + { + "epoch": 1.0844475114293108, + "grad_norm": 0.6015625, + "learning_rate": 8.737702795881581e-06, + "loss": 1.4962, + "step": 6286 + }, + { + "epoch": 1.0846200293280428, + "grad_norm": 0.578125, + "learning_rate": 8.735002735959203e-06, + "loss": 1.4318, + "step": 6287 + }, + { + "epoch": 1.0847925472267748, + "grad_norm": 0.609375, + "learning_rate": 8.732302769755722e-06, + "loss": 1.4156, + "step": 6288 + }, + { + "epoch": 1.0849650651255067, + "grad_norm": 0.55859375, + "learning_rate": 8.729602897471175e-06, + "loss": 1.3992, + "step": 6289 + }, + { + "epoch": 1.0851375830242387, + "grad_norm": 0.58984375, + "learning_rate": 8.726903119305583e-06, + "loss": 1.3411, + "step": 6290 + }, + { + "epoch": 1.0853101009229706, + "grad_norm": 0.78515625, + "learning_rate": 8.72420343545896e-06, + "loss": 1.4526, + "step": 6291 + }, + { + "epoch": 1.0854826188217028, + "grad_norm": 0.58203125, + "learning_rate": 8.721503846131322e-06, + "loss": 1.4865, + "step": 6292 + }, + { + "epoch": 1.0856551367204348, + "grad_norm": 0.63671875, + "learning_rate": 8.718804351522666e-06, + "loss": 1.3999, + "step": 6293 + }, + { + "epoch": 1.0858276546191667, + "grad_norm": 0.609375, + "learning_rate": 8.716104951832992e-06, + "loss": 1.3276, + "step": 6294 + }, + { + "epoch": 1.0860001725178987, + "grad_norm": 0.58203125, + "learning_rate": 8.713405647262279e-06, + "loss": 1.3087, + "step": 6295 + }, + { + "epoch": 1.0861726904166307, + "grad_norm": 0.6796875, + "learning_rate": 8.71070643801052e-06, + "loss": 1.3814, + "step": 6296 + }, + { + "epoch": 1.0863452083153626, + "grad_norm": 0.60546875, + "learning_rate": 8.708007324277678e-06, + "loss": 1.4415, + "step": 6297 + }, + { + "epoch": 1.0865177262140948, + "grad_norm": 0.7578125, + "learning_rate": 8.705308306263732e-06, + "loss": 1.2513, + "step": 6298 + }, + { + "epoch": 1.0866902441128268, + "grad_norm": 0.59765625, + "learning_rate": 8.702609384168634e-06, + "loss": 1.4319, + "step": 6299 + }, + { + "epoch": 1.0868627620115587, + "grad_norm": 0.671875, + "learning_rate": 8.69991055819234e-06, + "loss": 1.4037, + "step": 6300 + }, + { + "epoch": 1.0868627620115587, + "eval_loss": 1.4092991352081299, + "eval_runtime": 10.9216, + "eval_samples_per_second": 93.759, + "eval_steps_per_second": 23.44, + "step": 6300 + }, + { + "epoch": 1.0870352799102907, + "grad_norm": 0.7421875, + "learning_rate": 8.697211828534793e-06, + "loss": 1.4708, + "step": 6301 + }, + { + "epoch": 1.0872077978090227, + "grad_norm": 0.6015625, + "learning_rate": 8.694513195395937e-06, + "loss": 1.2764, + "step": 6302 + }, + { + "epoch": 1.0873803157077546, + "grad_norm": 0.58984375, + "learning_rate": 8.691814658975699e-06, + "loss": 1.4155, + "step": 6303 + }, + { + "epoch": 1.0875528336064866, + "grad_norm": 0.6015625, + "learning_rate": 8.689116219474004e-06, + "loss": 1.3576, + "step": 6304 + }, + { + "epoch": 1.0877253515052188, + "grad_norm": 0.640625, + "learning_rate": 8.686417877090772e-06, + "loss": 1.4314, + "step": 6305 + }, + { + "epoch": 1.0878978694039507, + "grad_norm": 0.68359375, + "learning_rate": 8.68371963202591e-06, + "loss": 1.4714, + "step": 6306 + }, + { + "epoch": 1.0880703873026827, + "grad_norm": 0.61328125, + "learning_rate": 8.681021484479327e-06, + "loss": 1.465, + "step": 6307 + }, + { + "epoch": 1.0882429052014146, + "grad_norm": 0.71875, + "learning_rate": 8.678323434650906e-06, + "loss": 1.4521, + "step": 6308 + }, + { + "epoch": 1.0884154231001466, + "grad_norm": 0.609375, + "learning_rate": 8.675625482740549e-06, + "loss": 1.5038, + "step": 6309 + }, + { + "epoch": 1.0885879409988786, + "grad_norm": 0.69921875, + "learning_rate": 8.672927628948128e-06, + "loss": 1.4076, + "step": 6310 + }, + { + "epoch": 1.0887604588976105, + "grad_norm": 0.609375, + "learning_rate": 8.670229873473524e-06, + "loss": 1.5556, + "step": 6311 + }, + { + "epoch": 1.0889329767963427, + "grad_norm": 0.6640625, + "learning_rate": 8.667532216516594e-06, + "loss": 1.4564, + "step": 6312 + }, + { + "epoch": 1.0891054946950747, + "grad_norm": 0.60546875, + "learning_rate": 8.664834658277208e-06, + "loss": 1.4006, + "step": 6313 + }, + { + "epoch": 1.0892780125938066, + "grad_norm": 0.66015625, + "learning_rate": 8.662137198955211e-06, + "loss": 1.4791, + "step": 6314 + }, + { + "epoch": 1.0894505304925386, + "grad_norm": 0.5625, + "learning_rate": 8.659439838750451e-06, + "loss": 1.4249, + "step": 6315 + }, + { + "epoch": 1.0896230483912706, + "grad_norm": 0.62109375, + "learning_rate": 8.656742577862761e-06, + "loss": 1.4844, + "step": 6316 + }, + { + "epoch": 1.0897955662900025, + "grad_norm": 0.65234375, + "learning_rate": 8.654045416491975e-06, + "loss": 1.3683, + "step": 6317 + }, + { + "epoch": 1.0899680841887345, + "grad_norm": 0.6171875, + "learning_rate": 8.651348354837917e-06, + "loss": 1.4811, + "step": 6318 + }, + { + "epoch": 1.0901406020874667, + "grad_norm": 0.609375, + "learning_rate": 8.6486513931004e-06, + "loss": 1.4244, + "step": 6319 + }, + { + "epoch": 1.0903131199861986, + "grad_norm": 0.578125, + "learning_rate": 8.645954531479226e-06, + "loss": 1.3503, + "step": 6320 + }, + { + "epoch": 1.0904856378849306, + "grad_norm": 0.6875, + "learning_rate": 8.643257770174202e-06, + "loss": 1.4309, + "step": 6321 + }, + { + "epoch": 1.0906581557836625, + "grad_norm": 0.58203125, + "learning_rate": 8.640561109385125e-06, + "loss": 1.4698, + "step": 6322 + }, + { + "epoch": 1.0908306736823945, + "grad_norm": 0.52734375, + "learning_rate": 8.637864549311768e-06, + "loss": 1.299, + "step": 6323 + }, + { + "epoch": 1.0910031915811265, + "grad_norm": 0.57421875, + "learning_rate": 8.635168090153922e-06, + "loss": 1.3882, + "step": 6324 + }, + { + "epoch": 1.0911757094798586, + "grad_norm": 0.640625, + "learning_rate": 8.632471732111349e-06, + "loss": 1.348, + "step": 6325 + }, + { + "epoch": 1.0913482273785906, + "grad_norm": 0.5625, + "learning_rate": 8.629775475383816e-06, + "loss": 1.4612, + "step": 6326 + }, + { + "epoch": 1.0915207452773226, + "grad_norm": 0.578125, + "learning_rate": 8.627079320171076e-06, + "loss": 1.3347, + "step": 6327 + }, + { + "epoch": 1.0916932631760545, + "grad_norm": 0.6796875, + "learning_rate": 8.624383266672882e-06, + "loss": 1.432, + "step": 6328 + }, + { + "epoch": 1.0918657810747865, + "grad_norm": 0.625, + "learning_rate": 8.621687315088965e-06, + "loss": 1.4275, + "step": 6329 + }, + { + "epoch": 1.0920382989735185, + "grad_norm": 0.61328125, + "learning_rate": 8.618991465619068e-06, + "loss": 1.4922, + "step": 6330 + }, + { + "epoch": 1.0922108168722504, + "grad_norm": 0.58984375, + "learning_rate": 8.616295718462913e-06, + "loss": 1.3674, + "step": 6331 + }, + { + "epoch": 1.0923833347709824, + "grad_norm": 0.69140625, + "learning_rate": 8.613600073820216e-06, + "loss": 1.5326, + "step": 6332 + }, + { + "epoch": 1.0925558526697146, + "grad_norm": 0.60546875, + "learning_rate": 8.610904531890685e-06, + "loss": 1.432, + "step": 6333 + }, + { + "epoch": 1.0927283705684465, + "grad_norm": 0.6875, + "learning_rate": 8.60820909287403e-06, + "loss": 1.434, + "step": 6334 + }, + { + "epoch": 1.0929008884671785, + "grad_norm": 0.6328125, + "learning_rate": 8.60551375696994e-06, + "loss": 1.4066, + "step": 6335 + }, + { + "epoch": 1.0930734063659104, + "grad_norm": 0.62890625, + "learning_rate": 8.602818524378098e-06, + "loss": 1.4648, + "step": 6336 + }, + { + "epoch": 1.0932459242646424, + "grad_norm": 0.70703125, + "learning_rate": 8.600123395298198e-06, + "loss": 1.3308, + "step": 6337 + }, + { + "epoch": 1.0934184421633744, + "grad_norm": 0.59765625, + "learning_rate": 8.597428369929902e-06, + "loss": 1.36, + "step": 6338 + }, + { + "epoch": 1.0935909600621065, + "grad_norm": 0.59375, + "learning_rate": 8.594733448472876e-06, + "loss": 1.4629, + "step": 6339 + }, + { + "epoch": 1.0937634779608385, + "grad_norm": 0.609375, + "learning_rate": 8.592038631126774e-06, + "loss": 1.4091, + "step": 6340 + }, + { + "epoch": 1.0939359958595705, + "grad_norm": 0.6640625, + "learning_rate": 8.589343918091251e-06, + "loss": 1.3806, + "step": 6341 + }, + { + "epoch": 1.0941085137583024, + "grad_norm": 0.58203125, + "learning_rate": 8.586649309565942e-06, + "loss": 1.4089, + "step": 6342 + }, + { + "epoch": 1.0942810316570344, + "grad_norm": 0.578125, + "learning_rate": 8.583954805750488e-06, + "loss": 1.5163, + "step": 6343 + }, + { + "epoch": 1.0944535495557663, + "grad_norm": 0.59765625, + "learning_rate": 8.581260406844503e-06, + "loss": 1.3662, + "step": 6344 + }, + { + "epoch": 1.0946260674544983, + "grad_norm": 0.63671875, + "learning_rate": 8.578566113047616e-06, + "loss": 1.4788, + "step": 6345 + }, + { + "epoch": 1.0947985853532305, + "grad_norm": 0.60546875, + "learning_rate": 8.575871924559431e-06, + "loss": 1.4255, + "step": 6346 + }, + { + "epoch": 1.0949711032519625, + "grad_norm": 0.640625, + "learning_rate": 8.573177841579556e-06, + "loss": 1.3274, + "step": 6347 + }, + { + "epoch": 1.0951436211506944, + "grad_norm": 0.64453125, + "learning_rate": 8.570483864307575e-06, + "loss": 1.3774, + "step": 6348 + }, + { + "epoch": 1.0953161390494264, + "grad_norm": 0.796875, + "learning_rate": 8.567789992943088e-06, + "loss": 1.5031, + "step": 6349 + }, + { + "epoch": 1.0954886569481583, + "grad_norm": 0.66015625, + "learning_rate": 8.565096227685663e-06, + "loss": 1.5199, + "step": 6350 + }, + { + "epoch": 1.0956611748468903, + "grad_norm": 0.65625, + "learning_rate": 8.562402568734879e-06, + "loss": 1.4346, + "step": 6351 + }, + { + "epoch": 1.0958336927456223, + "grad_norm": 0.703125, + "learning_rate": 8.559709016290288e-06, + "loss": 1.3992, + "step": 6352 + }, + { + "epoch": 1.0960062106443544, + "grad_norm": 0.5703125, + "learning_rate": 8.557015570551455e-06, + "loss": 1.3908, + "step": 6353 + }, + { + "epoch": 1.0961787285430864, + "grad_norm": 0.66015625, + "learning_rate": 8.55432223171793e-06, + "loss": 1.3654, + "step": 6354 + }, + { + "epoch": 1.0963512464418184, + "grad_norm": 0.625, + "learning_rate": 8.551628999989242e-06, + "loss": 1.4114, + "step": 6355 + }, + { + "epoch": 1.0965237643405503, + "grad_norm": 0.59375, + "learning_rate": 8.548935875564931e-06, + "loss": 1.4118, + "step": 6356 + }, + { + "epoch": 1.0966962822392823, + "grad_norm": 0.57421875, + "learning_rate": 8.546242858644513e-06, + "loss": 1.4611, + "step": 6357 + }, + { + "epoch": 1.0968688001380142, + "grad_norm": 0.61328125, + "learning_rate": 8.543549949427512e-06, + "loss": 1.4758, + "step": 6358 + }, + { + "epoch": 1.0970413180367462, + "grad_norm": 0.6171875, + "learning_rate": 8.540857148113429e-06, + "loss": 1.3938, + "step": 6359 + }, + { + "epoch": 1.0972138359354784, + "grad_norm": 0.65234375, + "learning_rate": 8.538164454901766e-06, + "loss": 1.4961, + "step": 6360 + }, + { + "epoch": 1.0973863538342103, + "grad_norm": 0.6171875, + "learning_rate": 8.535471869992011e-06, + "loss": 1.4775, + "step": 6361 + }, + { + "epoch": 1.0975588717329423, + "grad_norm": 0.5625, + "learning_rate": 8.532779393583656e-06, + "loss": 1.4067, + "step": 6362 + }, + { + "epoch": 1.0977313896316743, + "grad_norm": 0.58984375, + "learning_rate": 8.530087025876168e-06, + "loss": 1.3677, + "step": 6363 + }, + { + "epoch": 1.0979039075304062, + "grad_norm": 0.546875, + "learning_rate": 8.52739476706902e-06, + "loss": 1.2977, + "step": 6364 + }, + { + "epoch": 1.0980764254291382, + "grad_norm": 0.58203125, + "learning_rate": 8.524702617361665e-06, + "loss": 1.4171, + "step": 6365 + }, + { + "epoch": 1.0982489433278704, + "grad_norm": 0.578125, + "learning_rate": 8.522010576953561e-06, + "loss": 1.3564, + "step": 6366 + }, + { + "epoch": 1.0984214612266023, + "grad_norm": 0.7109375, + "learning_rate": 8.519318646044147e-06, + "loss": 1.4582, + "step": 6367 + }, + { + "epoch": 1.0985939791253343, + "grad_norm": 0.58984375, + "learning_rate": 8.516626824832858e-06, + "loss": 1.4448, + "step": 6368 + }, + { + "epoch": 1.0987664970240663, + "grad_norm": 0.61328125, + "learning_rate": 8.513935113519126e-06, + "loss": 1.3989, + "step": 6369 + }, + { + "epoch": 1.0989390149227982, + "grad_norm": 0.59375, + "learning_rate": 8.511243512302362e-06, + "loss": 1.4482, + "step": 6370 + }, + { + "epoch": 1.0991115328215302, + "grad_norm": 0.578125, + "learning_rate": 8.508552021381987e-06, + "loss": 1.345, + "step": 6371 + }, + { + "epoch": 1.0992840507202621, + "grad_norm": 0.6484375, + "learning_rate": 8.50586064095739e-06, + "loss": 1.3918, + "step": 6372 + }, + { + "epoch": 1.0994565686189943, + "grad_norm": 0.58203125, + "learning_rate": 8.503169371227978e-06, + "loss": 1.5311, + "step": 6373 + }, + { + "epoch": 1.0996290865177263, + "grad_norm": 0.60546875, + "learning_rate": 8.50047821239313e-06, + "loss": 1.4589, + "step": 6374 + }, + { + "epoch": 1.0998016044164582, + "grad_norm": 0.921875, + "learning_rate": 8.497787164652227e-06, + "loss": 1.3397, + "step": 6375 + }, + { + "epoch": 1.0999741223151902, + "grad_norm": 0.6015625, + "learning_rate": 8.495096228204632e-06, + "loss": 1.4847, + "step": 6376 + }, + { + "epoch": 1.1001466402139222, + "grad_norm": 0.60546875, + "learning_rate": 8.492405403249717e-06, + "loss": 1.4609, + "step": 6377 + }, + { + "epoch": 1.1003191581126541, + "grad_norm": 0.6484375, + "learning_rate": 8.489714689986826e-06, + "loss": 1.4853, + "step": 6378 + }, + { + "epoch": 1.100491676011386, + "grad_norm": 0.6015625, + "learning_rate": 8.48702408861531e-06, + "loss": 1.5215, + "step": 6379 + }, + { + "epoch": 1.1006641939101183, + "grad_norm": 0.5703125, + "learning_rate": 8.4843335993345e-06, + "loss": 1.3049, + "step": 6380 + }, + { + "epoch": 1.1008367118088502, + "grad_norm": 0.62109375, + "learning_rate": 8.48164322234373e-06, + "loss": 1.464, + "step": 6381 + }, + { + "epoch": 1.1010092297075822, + "grad_norm": 0.57421875, + "learning_rate": 8.478952957842317e-06, + "loss": 1.4325, + "step": 6382 + }, + { + "epoch": 1.1011817476063142, + "grad_norm": 0.6328125, + "learning_rate": 8.476262806029573e-06, + "loss": 1.3597, + "step": 6383 + }, + { + "epoch": 1.1013542655050461, + "grad_norm": 0.61328125, + "learning_rate": 8.473572767104799e-06, + "loss": 1.3886, + "step": 6384 + }, + { + "epoch": 1.101526783403778, + "grad_norm": 0.6328125, + "learning_rate": 8.47088284126729e-06, + "loss": 1.3507, + "step": 6385 + }, + { + "epoch": 1.10169930130251, + "grad_norm": 0.59765625, + "learning_rate": 8.46819302871634e-06, + "loss": 1.541, + "step": 6386 + }, + { + "epoch": 1.1018718192012422, + "grad_norm": 0.6171875, + "learning_rate": 8.465503329651219e-06, + "loss": 1.405, + "step": 6387 + }, + { + "epoch": 1.1020443370999742, + "grad_norm": 0.74609375, + "learning_rate": 8.462813744271202e-06, + "loss": 1.519, + "step": 6388 + }, + { + "epoch": 1.1022168549987061, + "grad_norm": 0.6796875, + "learning_rate": 8.460124272775542e-06, + "loss": 1.3921, + "step": 6389 + }, + { + "epoch": 1.102389372897438, + "grad_norm": 0.6484375, + "learning_rate": 8.457434915363501e-06, + "loss": 1.3868, + "step": 6390 + }, + { + "epoch": 1.10256189079617, + "grad_norm": 0.58984375, + "learning_rate": 8.454745672234321e-06, + "loss": 1.4336, + "step": 6391 + }, + { + "epoch": 1.102734408694902, + "grad_norm": 0.5703125, + "learning_rate": 8.452056543587236e-06, + "loss": 1.3515, + "step": 6392 + }, + { + "epoch": 1.1029069265936342, + "grad_norm": 0.640625, + "learning_rate": 8.44936752962147e-06, + "loss": 1.4774, + "step": 6393 + }, + { + "epoch": 1.1030794444923662, + "grad_norm": 0.671875, + "learning_rate": 8.446678630536252e-06, + "loss": 1.4492, + "step": 6394 + }, + { + "epoch": 1.1032519623910981, + "grad_norm": 0.5859375, + "learning_rate": 8.443989846530784e-06, + "loss": 1.4031, + "step": 6395 + }, + { + "epoch": 1.10342448028983, + "grad_norm": 0.55859375, + "learning_rate": 8.441301177804273e-06, + "loss": 1.3862, + "step": 6396 + }, + { + "epoch": 1.103596998188562, + "grad_norm": 0.65625, + "learning_rate": 8.438612624555905e-06, + "loss": 1.3648, + "step": 6397 + }, + { + "epoch": 1.103769516087294, + "grad_norm": 0.73046875, + "learning_rate": 8.435924186984875e-06, + "loss": 1.4533, + "step": 6398 + }, + { + "epoch": 1.103942033986026, + "grad_norm": 0.80859375, + "learning_rate": 8.433235865290351e-06, + "loss": 1.4442, + "step": 6399 + }, + { + "epoch": 1.104114551884758, + "grad_norm": 0.60546875, + "learning_rate": 8.430547659671503e-06, + "loss": 1.4276, + "step": 6400 + }, + { + "epoch": 1.104114551884758, + "eval_loss": 1.4093081951141357, + "eval_runtime": 10.9921, + "eval_samples_per_second": 93.158, + "eval_steps_per_second": 23.29, + "step": 6400 + }, + { + "epoch": 1.1042870697834901, + "grad_norm": 0.59765625, + "learning_rate": 8.427859570327494e-06, + "loss": 1.4678, + "step": 6401 + }, + { + "epoch": 1.104459587682222, + "grad_norm": 0.6171875, + "learning_rate": 8.425171597457469e-06, + "loss": 1.4255, + "step": 6402 + }, + { + "epoch": 1.104632105580954, + "grad_norm": 0.6328125, + "learning_rate": 8.422483741260575e-06, + "loss": 1.4794, + "step": 6403 + }, + { + "epoch": 1.104804623479686, + "grad_norm": 0.63671875, + "learning_rate": 8.419796001935935e-06, + "loss": 1.4145, + "step": 6404 + }, + { + "epoch": 1.104977141378418, + "grad_norm": 0.58984375, + "learning_rate": 8.417108379682688e-06, + "loss": 1.4402, + "step": 6405 + }, + { + "epoch": 1.10514965927715, + "grad_norm": 0.61328125, + "learning_rate": 8.41442087469994e-06, + "loss": 1.4904, + "step": 6406 + }, + { + "epoch": 1.105322177175882, + "grad_norm": 0.59765625, + "learning_rate": 8.411733487186802e-06, + "loss": 1.3821, + "step": 6407 + }, + { + "epoch": 1.105494695074614, + "grad_norm": 0.6484375, + "learning_rate": 8.409046217342367e-06, + "loss": 1.4523, + "step": 6408 + }, + { + "epoch": 1.105667212973346, + "grad_norm": 0.578125, + "learning_rate": 8.406359065365735e-06, + "loss": 1.3564, + "step": 6409 + }, + { + "epoch": 1.105839730872078, + "grad_norm": 0.5703125, + "learning_rate": 8.403672031455977e-06, + "loss": 1.456, + "step": 6410 + }, + { + "epoch": 1.10601224877081, + "grad_norm": 0.625, + "learning_rate": 8.40098511581217e-06, + "loss": 1.4374, + "step": 6411 + }, + { + "epoch": 1.106184766669542, + "grad_norm": 0.65234375, + "learning_rate": 8.398298318633376e-06, + "loss": 1.449, + "step": 6412 + }, + { + "epoch": 1.1063572845682739, + "grad_norm": 0.62109375, + "learning_rate": 8.395611640118653e-06, + "loss": 1.3961, + "step": 6413 + }, + { + "epoch": 1.106529802467006, + "grad_norm": 0.62890625, + "learning_rate": 8.39292508046704e-06, + "loss": 1.3924, + "step": 6414 + }, + { + "epoch": 1.106702320365738, + "grad_norm": 0.56640625, + "learning_rate": 8.390238639877584e-06, + "loss": 1.4491, + "step": 6415 + }, + { + "epoch": 1.10687483826447, + "grad_norm": 0.58984375, + "learning_rate": 8.387552318549304e-06, + "loss": 1.4087, + "step": 6416 + }, + { + "epoch": 1.107047356163202, + "grad_norm": 0.5703125, + "learning_rate": 8.384866116681221e-06, + "loss": 1.5279, + "step": 6417 + }, + { + "epoch": 1.107219874061934, + "grad_norm": 0.6328125, + "learning_rate": 8.382180034472353e-06, + "loss": 1.4544, + "step": 6418 + }, + { + "epoch": 1.1073923919606659, + "grad_norm": 0.66796875, + "learning_rate": 8.379494072121695e-06, + "loss": 1.5367, + "step": 6419 + }, + { + "epoch": 1.1075649098593978, + "grad_norm": 0.6328125, + "learning_rate": 8.376808229828242e-06, + "loss": 1.4524, + "step": 6420 + }, + { + "epoch": 1.10773742775813, + "grad_norm": 0.6015625, + "learning_rate": 8.374122507790975e-06, + "loss": 1.4472, + "step": 6421 + }, + { + "epoch": 1.107909945656862, + "grad_norm": 0.56640625, + "learning_rate": 8.371436906208876e-06, + "loss": 1.5322, + "step": 6422 + }, + { + "epoch": 1.108082463555594, + "grad_norm": 0.77734375, + "learning_rate": 8.368751425280904e-06, + "loss": 1.4576, + "step": 6423 + }, + { + "epoch": 1.1082549814543259, + "grad_norm": 0.59765625, + "learning_rate": 8.36606606520602e-06, + "loss": 1.4851, + "step": 6424 + }, + { + "epoch": 1.1084274993530578, + "grad_norm": 0.65625, + "learning_rate": 8.363380826183167e-06, + "loss": 1.3564, + "step": 6425 + }, + { + "epoch": 1.1086000172517898, + "grad_norm": 0.65625, + "learning_rate": 8.360695708411295e-06, + "loss": 1.428, + "step": 6426 + }, + { + "epoch": 1.1087725351505218, + "grad_norm": 0.63671875, + "learning_rate": 8.358010712089324e-06, + "loss": 1.4501, + "step": 6427 + }, + { + "epoch": 1.108945053049254, + "grad_norm": 0.6640625, + "learning_rate": 8.355325837416182e-06, + "loss": 1.363, + "step": 6428 + }, + { + "epoch": 1.109117570947986, + "grad_norm": 0.640625, + "learning_rate": 8.352641084590772e-06, + "loss": 1.4789, + "step": 6429 + }, + { + "epoch": 1.1092900888467179, + "grad_norm": 0.63671875, + "learning_rate": 8.349956453812009e-06, + "loss": 1.4532, + "step": 6430 + }, + { + "epoch": 1.1094626067454498, + "grad_norm": 0.60546875, + "learning_rate": 8.34727194527878e-06, + "loss": 1.412, + "step": 6431 + }, + { + "epoch": 1.1096351246441818, + "grad_norm": 0.58203125, + "learning_rate": 8.34458755918997e-06, + "loss": 1.401, + "step": 6432 + }, + { + "epoch": 1.1098076425429138, + "grad_norm": 0.625, + "learning_rate": 8.341903295744463e-06, + "loss": 1.4473, + "step": 6433 + }, + { + "epoch": 1.109980160441646, + "grad_norm": 0.54296875, + "learning_rate": 8.339219155141115e-06, + "loss": 1.3622, + "step": 6434 + }, + { + "epoch": 1.110152678340378, + "grad_norm": 0.6015625, + "learning_rate": 8.336535137578792e-06, + "loss": 1.4421, + "step": 6435 + }, + { + "epoch": 1.1103251962391099, + "grad_norm": 0.6796875, + "learning_rate": 8.333851243256337e-06, + "loss": 1.4733, + "step": 6436 + }, + { + "epoch": 1.1104977141378418, + "grad_norm": 0.640625, + "learning_rate": 8.331167472372596e-06, + "loss": 1.3693, + "step": 6437 + }, + { + "epoch": 1.1106702320365738, + "grad_norm": 0.58203125, + "learning_rate": 8.328483825126393e-06, + "loss": 1.4495, + "step": 6438 + }, + { + "epoch": 1.1108427499353057, + "grad_norm": 0.5703125, + "learning_rate": 8.325800301716555e-06, + "loss": 1.4169, + "step": 6439 + }, + { + "epoch": 1.1110152678340377, + "grad_norm": 0.6796875, + "learning_rate": 8.323116902341888e-06, + "loss": 1.4493, + "step": 6440 + }, + { + "epoch": 1.1111877857327697, + "grad_norm": 0.68359375, + "learning_rate": 8.320433627201202e-06, + "loss": 1.3748, + "step": 6441 + }, + { + "epoch": 1.1113603036315018, + "grad_norm": 0.60546875, + "learning_rate": 8.317750476493282e-06, + "loss": 1.5276, + "step": 6442 + }, + { + "epoch": 1.1115328215302338, + "grad_norm": 0.55859375, + "learning_rate": 8.315067450416926e-06, + "loss": 1.3542, + "step": 6443 + }, + { + "epoch": 1.1117053394289658, + "grad_norm": 0.61328125, + "learning_rate": 8.312384549170894e-06, + "loss": 1.426, + "step": 6444 + }, + { + "epoch": 1.1118778573276977, + "grad_norm": 0.62109375, + "learning_rate": 8.309701772953964e-06, + "loss": 1.4683, + "step": 6445 + }, + { + "epoch": 1.1120503752264297, + "grad_norm": 0.58984375, + "learning_rate": 8.307019121964885e-06, + "loss": 1.4291, + "step": 6446 + }, + { + "epoch": 1.1122228931251616, + "grad_norm": 0.58203125, + "learning_rate": 8.30433659640241e-06, + "loss": 1.4038, + "step": 6447 + }, + { + "epoch": 1.1123954110238938, + "grad_norm": 0.59375, + "learning_rate": 8.301654196465273e-06, + "loss": 1.422, + "step": 6448 + }, + { + "epoch": 1.1125679289226258, + "grad_norm": 0.59375, + "learning_rate": 8.298971922352203e-06, + "loss": 1.3519, + "step": 6449 + }, + { + "epoch": 1.1127404468213578, + "grad_norm": 0.59375, + "learning_rate": 8.296289774261926e-06, + "loss": 1.4286, + "step": 6450 + }, + { + "epoch": 1.1129129647200897, + "grad_norm": 0.5625, + "learning_rate": 8.293607752393145e-06, + "loss": 1.3807, + "step": 6451 + }, + { + "epoch": 1.1130854826188217, + "grad_norm": 0.59765625, + "learning_rate": 8.290925856944567e-06, + "loss": 1.3515, + "step": 6452 + }, + { + "epoch": 1.1132580005175536, + "grad_norm": 0.6171875, + "learning_rate": 8.288244088114876e-06, + "loss": 1.3681, + "step": 6453 + }, + { + "epoch": 1.1134305184162856, + "grad_norm": 0.578125, + "learning_rate": 8.285562446102761e-06, + "loss": 1.4408, + "step": 6454 + }, + { + "epoch": 1.1136030363150178, + "grad_norm": 0.65625, + "learning_rate": 8.282880931106893e-06, + "loss": 1.4276, + "step": 6455 + }, + { + "epoch": 1.1137755542137497, + "grad_norm": 0.65234375, + "learning_rate": 8.280199543325935e-06, + "loss": 1.467, + "step": 6456 + }, + { + "epoch": 1.1139480721124817, + "grad_norm": 0.59765625, + "learning_rate": 8.277518282958536e-06, + "loss": 1.4664, + "step": 6457 + }, + { + "epoch": 1.1141205900112137, + "grad_norm": 0.6484375, + "learning_rate": 8.274837150203352e-06, + "loss": 1.4037, + "step": 6458 + }, + { + "epoch": 1.1142931079099456, + "grad_norm": 0.6171875, + "learning_rate": 8.272156145259006e-06, + "loss": 1.4159, + "step": 6459 + }, + { + "epoch": 1.1144656258086776, + "grad_norm": 0.578125, + "learning_rate": 8.269475268324131e-06, + "loss": 1.3251, + "step": 6460 + }, + { + "epoch": 1.1146381437074095, + "grad_norm": 0.54296875, + "learning_rate": 8.266794519597339e-06, + "loss": 1.4099, + "step": 6461 + }, + { + "epoch": 1.1148106616061417, + "grad_norm": 0.61328125, + "learning_rate": 8.264113899277241e-06, + "loss": 1.3554, + "step": 6462 + }, + { + "epoch": 1.1149831795048737, + "grad_norm": 0.63671875, + "learning_rate": 8.26143340756243e-06, + "loss": 1.3989, + "step": 6463 + }, + { + "epoch": 1.1151556974036057, + "grad_norm": 0.65234375, + "learning_rate": 8.258753044651499e-06, + "loss": 1.4215, + "step": 6464 + }, + { + "epoch": 1.1153282153023376, + "grad_norm": 0.56640625, + "learning_rate": 8.256072810743015e-06, + "loss": 1.4646, + "step": 6465 + }, + { + "epoch": 1.1155007332010696, + "grad_norm": 0.58203125, + "learning_rate": 8.253392706035558e-06, + "loss": 1.4461, + "step": 6466 + }, + { + "epoch": 1.1156732510998015, + "grad_norm": 0.5546875, + "learning_rate": 8.250712730727685e-06, + "loss": 1.4213, + "step": 6467 + }, + { + "epoch": 1.1158457689985335, + "grad_norm": 0.58203125, + "learning_rate": 8.248032885017937e-06, + "loss": 1.3849, + "step": 6468 + }, + { + "epoch": 1.1160182868972657, + "grad_norm": 0.58984375, + "learning_rate": 8.245353169104865e-06, + "loss": 1.4312, + "step": 6469 + }, + { + "epoch": 1.1161908047959976, + "grad_norm": 0.5859375, + "learning_rate": 8.242673583186991e-06, + "loss": 1.3815, + "step": 6470 + }, + { + "epoch": 1.1163633226947296, + "grad_norm": 0.6015625, + "learning_rate": 8.239994127462842e-06, + "loss": 1.2654, + "step": 6471 + }, + { + "epoch": 1.1165358405934616, + "grad_norm": 0.62890625, + "learning_rate": 8.237314802130919e-06, + "loss": 1.4075, + "step": 6472 + }, + { + "epoch": 1.1167083584921935, + "grad_norm": 0.59375, + "learning_rate": 8.234635607389733e-06, + "loss": 1.3882, + "step": 6473 + }, + { + "epoch": 1.1168808763909255, + "grad_norm": 0.609375, + "learning_rate": 8.231956543437768e-06, + "loss": 1.4488, + "step": 6474 + }, + { + "epoch": 1.1170533942896577, + "grad_norm": 0.625, + "learning_rate": 8.229277610473516e-06, + "loss": 1.4515, + "step": 6475 + }, + { + "epoch": 1.1172259121883896, + "grad_norm": 0.57421875, + "learning_rate": 8.226598808695438e-06, + "loss": 1.4751, + "step": 6476 + }, + { + "epoch": 1.1173984300871216, + "grad_norm": 0.6171875, + "learning_rate": 8.223920138302006e-06, + "loss": 1.4667, + "step": 6477 + }, + { + "epoch": 1.1175709479858535, + "grad_norm": 0.55859375, + "learning_rate": 8.22124159949166e-06, + "loss": 1.4327, + "step": 6478 + }, + { + "epoch": 1.1177434658845855, + "grad_norm": 0.65234375, + "learning_rate": 8.218563192462857e-06, + "loss": 1.4853, + "step": 6479 + }, + { + "epoch": 1.1179159837833175, + "grad_norm": 0.58984375, + "learning_rate": 8.21588491741402e-06, + "loss": 1.432, + "step": 6480 + }, + { + "epoch": 1.1180885016820494, + "grad_norm": 0.59765625, + "learning_rate": 8.213206774543574e-06, + "loss": 1.5029, + "step": 6481 + }, + { + "epoch": 1.1182610195807814, + "grad_norm": 0.6171875, + "learning_rate": 8.21052876404994e-06, + "loss": 1.4315, + "step": 6482 + }, + { + "epoch": 1.1184335374795136, + "grad_norm": 0.62890625, + "learning_rate": 8.207850886131512e-06, + "loss": 1.3277, + "step": 6483 + }, + { + "epoch": 1.1186060553782455, + "grad_norm": 0.609375, + "learning_rate": 8.205173140986691e-06, + "loss": 1.4484, + "step": 6484 + }, + { + "epoch": 1.1187785732769775, + "grad_norm": 0.58984375, + "learning_rate": 8.202495528813856e-06, + "loss": 1.4537, + "step": 6485 + }, + { + "epoch": 1.1189510911757095, + "grad_norm": 0.6171875, + "learning_rate": 8.199818049811387e-06, + "loss": 1.4245, + "step": 6486 + }, + { + "epoch": 1.1191236090744414, + "grad_norm": 0.64453125, + "learning_rate": 8.197140704177642e-06, + "loss": 1.412, + "step": 6487 + }, + { + "epoch": 1.1192961269731734, + "grad_norm": 0.65625, + "learning_rate": 8.194463492110982e-06, + "loss": 1.4611, + "step": 6488 + }, + { + "epoch": 1.1194686448719056, + "grad_norm": 0.625, + "learning_rate": 8.191786413809742e-06, + "loss": 1.502, + "step": 6489 + }, + { + "epoch": 1.1196411627706375, + "grad_norm": 0.6171875, + "learning_rate": 8.189109469472267e-06, + "loss": 1.44, + "step": 6490 + }, + { + "epoch": 1.1198136806693695, + "grad_norm": 0.55078125, + "learning_rate": 8.186432659296876e-06, + "loss": 1.4693, + "step": 6491 + }, + { + "epoch": 1.1199861985681014, + "grad_norm": 0.57421875, + "learning_rate": 8.183755983481888e-06, + "loss": 1.3884, + "step": 6492 + }, + { + "epoch": 1.1201587164668334, + "grad_norm": 0.625, + "learning_rate": 8.1810794422256e-06, + "loss": 1.4728, + "step": 6493 + }, + { + "epoch": 1.1203312343655654, + "grad_norm": 0.734375, + "learning_rate": 8.178403035726316e-06, + "loss": 1.5149, + "step": 6494 + }, + { + "epoch": 1.1205037522642973, + "grad_norm": 0.578125, + "learning_rate": 8.175726764182315e-06, + "loss": 1.4063, + "step": 6495 + }, + { + "epoch": 1.1206762701630295, + "grad_norm": 0.65234375, + "learning_rate": 8.173050627791877e-06, + "loss": 1.4264, + "step": 6496 + }, + { + "epoch": 1.1208487880617615, + "grad_norm": 0.62109375, + "learning_rate": 8.170374626753259e-06, + "loss": 1.483, + "step": 6497 + }, + { + "epoch": 1.1210213059604934, + "grad_norm": 0.640625, + "learning_rate": 8.167698761264723e-06, + "loss": 1.4241, + "step": 6498 + }, + { + "epoch": 1.1211938238592254, + "grad_norm": 0.58984375, + "learning_rate": 8.165023031524513e-06, + "loss": 1.3938, + "step": 6499 + }, + { + "epoch": 1.1213663417579574, + "grad_norm": 0.578125, + "learning_rate": 8.162347437730859e-06, + "loss": 1.4848, + "step": 6500 + }, + { + "epoch": 1.1213663417579574, + "eval_loss": 1.4090793132781982, + "eval_runtime": 10.9381, + "eval_samples_per_second": 93.618, + "eval_steps_per_second": 23.405, + "step": 6500 + }, + { + "epoch": 1.1215388596566893, + "grad_norm": 0.6015625, + "learning_rate": 8.159671980081994e-06, + "loss": 1.3506, + "step": 6501 + }, + { + "epoch": 1.1217113775554213, + "grad_norm": 0.62109375, + "learning_rate": 8.156996658776121e-06, + "loss": 1.3836, + "step": 6502 + }, + { + "epoch": 1.1218838954541535, + "grad_norm": 0.6171875, + "learning_rate": 8.154321474011457e-06, + "loss": 1.368, + "step": 6503 + }, + { + "epoch": 1.1220564133528854, + "grad_norm": 0.7109375, + "learning_rate": 8.151646425986187e-06, + "loss": 1.3849, + "step": 6504 + }, + { + "epoch": 1.1222289312516174, + "grad_norm": 0.62109375, + "learning_rate": 8.148971514898503e-06, + "loss": 1.4842, + "step": 6505 + }, + { + "epoch": 1.1224014491503493, + "grad_norm": 0.5625, + "learning_rate": 8.14629674094657e-06, + "loss": 1.5188, + "step": 6506 + }, + { + "epoch": 1.1225739670490813, + "grad_norm": 0.56640625, + "learning_rate": 8.14362210432856e-06, + "loss": 1.3948, + "step": 6507 + }, + { + "epoch": 1.1227464849478133, + "grad_norm": 0.61328125, + "learning_rate": 8.140947605242622e-06, + "loss": 1.3766, + "step": 6508 + }, + { + "epoch": 1.1229190028465452, + "grad_norm": 0.61328125, + "learning_rate": 8.138273243886902e-06, + "loss": 1.4794, + "step": 6509 + }, + { + "epoch": 1.1230915207452774, + "grad_norm": 0.578125, + "learning_rate": 8.135599020459531e-06, + "loss": 1.4689, + "step": 6510 + }, + { + "epoch": 1.1232640386440094, + "grad_norm": 0.5859375, + "learning_rate": 8.132924935158638e-06, + "loss": 1.4573, + "step": 6511 + }, + { + "epoch": 1.1234365565427413, + "grad_norm": 0.5625, + "learning_rate": 8.130250988182328e-06, + "loss": 1.4931, + "step": 6512 + }, + { + "epoch": 1.1236090744414733, + "grad_norm": 0.58984375, + "learning_rate": 8.127577179728708e-06, + "loss": 1.3708, + "step": 6513 + }, + { + "epoch": 1.1237815923402052, + "grad_norm": 0.63671875, + "learning_rate": 8.124903509995872e-06, + "loss": 1.4501, + "step": 6514 + }, + { + "epoch": 1.1239541102389372, + "grad_norm": 0.61328125, + "learning_rate": 8.122229979181899e-06, + "loss": 1.3516, + "step": 6515 + }, + { + "epoch": 1.1241266281376694, + "grad_norm": 0.61328125, + "learning_rate": 8.119556587484863e-06, + "loss": 1.4109, + "step": 6516 + }, + { + "epoch": 1.1242991460364014, + "grad_norm": 0.59375, + "learning_rate": 8.116883335102821e-06, + "loss": 1.4759, + "step": 6517 + }, + { + "epoch": 1.1244716639351333, + "grad_norm": 0.62109375, + "learning_rate": 8.114210222233832e-06, + "loss": 1.3568, + "step": 6518 + }, + { + "epoch": 1.1246441818338653, + "grad_norm": 0.6328125, + "learning_rate": 8.11153724907593e-06, + "loss": 1.3548, + "step": 6519 + }, + { + "epoch": 1.1248166997325972, + "grad_norm": 0.62109375, + "learning_rate": 8.108864415827152e-06, + "loss": 1.4087, + "step": 6520 + }, + { + "epoch": 1.1249892176313292, + "grad_norm": 0.55859375, + "learning_rate": 8.10619172268551e-06, + "loss": 1.4023, + "step": 6521 + }, + { + "epoch": 1.1251617355300612, + "grad_norm": 0.6328125, + "learning_rate": 8.10351916984902e-06, + "loss": 1.4742, + "step": 6522 + }, + { + "epoch": 1.1253342534287931, + "grad_norm": 0.58984375, + "learning_rate": 8.10084675751568e-06, + "loss": 1.4095, + "step": 6523 + }, + { + "epoch": 1.1255067713275253, + "grad_norm": 0.5625, + "learning_rate": 8.09817448588348e-06, + "loss": 1.4618, + "step": 6524 + }, + { + "epoch": 1.1256792892262573, + "grad_norm": 0.5703125, + "learning_rate": 8.095502355150392e-06, + "loss": 1.3808, + "step": 6525 + }, + { + "epoch": 1.1258518071249892, + "grad_norm": 0.7890625, + "learning_rate": 8.092830365514395e-06, + "loss": 1.4675, + "step": 6526 + }, + { + "epoch": 1.1260243250237212, + "grad_norm": 0.62890625, + "learning_rate": 8.090158517173438e-06, + "loss": 1.4233, + "step": 6527 + }, + { + "epoch": 1.1261968429224531, + "grad_norm": 0.6328125, + "learning_rate": 8.087486810325475e-06, + "loss": 1.5255, + "step": 6528 + }, + { + "epoch": 1.126369360821185, + "grad_norm": 0.66015625, + "learning_rate": 8.084815245168434e-06, + "loss": 1.4081, + "step": 6529 + }, + { + "epoch": 1.1265418787199173, + "grad_norm": 0.6015625, + "learning_rate": 8.082143821900246e-06, + "loss": 1.4038, + "step": 6530 + }, + { + "epoch": 1.1267143966186492, + "grad_norm": 0.57421875, + "learning_rate": 8.079472540718833e-06, + "loss": 1.4907, + "step": 6531 + }, + { + "epoch": 1.1268869145173812, + "grad_norm": 0.55078125, + "learning_rate": 8.076801401822088e-06, + "loss": 1.413, + "step": 6532 + }, + { + "epoch": 1.1270594324161132, + "grad_norm": 0.56640625, + "learning_rate": 8.074130405407915e-06, + "loss": 1.4993, + "step": 6533 + }, + { + "epoch": 1.1272319503148451, + "grad_norm": 0.5859375, + "learning_rate": 8.071459551674193e-06, + "loss": 1.4363, + "step": 6534 + }, + { + "epoch": 1.127404468213577, + "grad_norm": 0.5703125, + "learning_rate": 8.0687888408188e-06, + "loss": 1.4667, + "step": 6535 + }, + { + "epoch": 1.127576986112309, + "grad_norm": 0.6015625, + "learning_rate": 8.066118273039597e-06, + "loss": 1.5419, + "step": 6536 + }, + { + "epoch": 1.1277495040110412, + "grad_norm": 0.58203125, + "learning_rate": 8.063447848534435e-06, + "loss": 1.4496, + "step": 6537 + }, + { + "epoch": 1.1279220219097732, + "grad_norm": 0.61328125, + "learning_rate": 8.060777567501154e-06, + "loss": 1.5354, + "step": 6538 + }, + { + "epoch": 1.1280945398085052, + "grad_norm": 0.65234375, + "learning_rate": 8.05810743013759e-06, + "loss": 1.5538, + "step": 6539 + }, + { + "epoch": 1.1282670577072371, + "grad_norm": 0.5625, + "learning_rate": 8.05543743664156e-06, + "loss": 1.4138, + "step": 6540 + }, + { + "epoch": 1.128439575605969, + "grad_norm": 0.53125, + "learning_rate": 8.052767587210878e-06, + "loss": 1.3768, + "step": 6541 + }, + { + "epoch": 1.128612093504701, + "grad_norm": 0.6328125, + "learning_rate": 8.050097882043334e-06, + "loss": 1.494, + "step": 6542 + }, + { + "epoch": 1.1287846114034332, + "grad_norm": 0.5703125, + "learning_rate": 8.047428321336728e-06, + "loss": 1.3528, + "step": 6543 + }, + { + "epoch": 1.1289571293021652, + "grad_norm": 0.63671875, + "learning_rate": 8.04475890528883e-06, + "loss": 1.4149, + "step": 6544 + }, + { + "epoch": 1.1291296472008971, + "grad_norm": 0.60546875, + "learning_rate": 8.042089634097406e-06, + "loss": 1.5056, + "step": 6545 + }, + { + "epoch": 1.129302165099629, + "grad_norm": 0.59765625, + "learning_rate": 8.03942050796022e-06, + "loss": 1.4004, + "step": 6546 + }, + { + "epoch": 1.129474682998361, + "grad_norm": 0.60546875, + "learning_rate": 8.036751527075011e-06, + "loss": 1.476, + "step": 6547 + }, + { + "epoch": 1.129647200897093, + "grad_norm": 0.5859375, + "learning_rate": 8.034082691639519e-06, + "loss": 1.4687, + "step": 6548 + }, + { + "epoch": 1.129819718795825, + "grad_norm": 0.6328125, + "learning_rate": 8.031414001851459e-06, + "loss": 1.5199, + "step": 6549 + }, + { + "epoch": 1.129992236694557, + "grad_norm": 0.5625, + "learning_rate": 8.028745457908555e-06, + "loss": 1.3649, + "step": 6550 + }, + { + "epoch": 1.1301647545932891, + "grad_norm": 0.609375, + "learning_rate": 8.026077060008503e-06, + "loss": 1.4537, + "step": 6551 + }, + { + "epoch": 1.130337272492021, + "grad_norm": 0.5546875, + "learning_rate": 8.023408808348999e-06, + "loss": 1.4067, + "step": 6552 + }, + { + "epoch": 1.130509790390753, + "grad_norm": 0.58984375, + "learning_rate": 8.020740703127715e-06, + "loss": 1.4795, + "step": 6553 + }, + { + "epoch": 1.130682308289485, + "grad_norm": 0.5703125, + "learning_rate": 8.01807274454233e-06, + "loss": 1.472, + "step": 6554 + }, + { + "epoch": 1.130854826188217, + "grad_norm": 0.640625, + "learning_rate": 8.015404932790499e-06, + "loss": 1.4342, + "step": 6555 + }, + { + "epoch": 1.131027344086949, + "grad_norm": 0.58203125, + "learning_rate": 8.012737268069873e-06, + "loss": 1.3848, + "step": 6556 + }, + { + "epoch": 1.1311998619856811, + "grad_norm": 0.59375, + "learning_rate": 8.010069750578082e-06, + "loss": 1.3666, + "step": 6557 + }, + { + "epoch": 1.131372379884413, + "grad_norm": 0.5703125, + "learning_rate": 8.007402380512763e-06, + "loss": 1.3574, + "step": 6558 + }, + { + "epoch": 1.131544897783145, + "grad_norm": 0.60546875, + "learning_rate": 8.004735158071524e-06, + "loss": 1.5037, + "step": 6559 + }, + { + "epoch": 1.131717415681877, + "grad_norm": 0.6171875, + "learning_rate": 8.002068083451973e-06, + "loss": 1.4382, + "step": 6560 + }, + { + "epoch": 1.131889933580609, + "grad_norm": 0.5703125, + "learning_rate": 7.999401156851697e-06, + "loss": 1.452, + "step": 6561 + }, + { + "epoch": 1.132062451479341, + "grad_norm": 0.60546875, + "learning_rate": 7.996734378468284e-06, + "loss": 1.4871, + "step": 6562 + }, + { + "epoch": 1.1322349693780729, + "grad_norm": 0.60546875, + "learning_rate": 7.99406774849931e-06, + "loss": 1.4092, + "step": 6563 + }, + { + "epoch": 1.1324074872768048, + "grad_norm": 0.57421875, + "learning_rate": 7.991401267142329e-06, + "loss": 1.3844, + "step": 6564 + }, + { + "epoch": 1.132580005175537, + "grad_norm": 0.578125, + "learning_rate": 7.988734934594893e-06, + "loss": 1.4034, + "step": 6565 + }, + { + "epoch": 1.132752523074269, + "grad_norm": 0.58203125, + "learning_rate": 7.986068751054537e-06, + "loss": 1.4734, + "step": 6566 + }, + { + "epoch": 1.132925040973001, + "grad_norm": 0.6171875, + "learning_rate": 7.983402716718796e-06, + "loss": 1.3407, + "step": 6567 + }, + { + "epoch": 1.133097558871733, + "grad_norm": 0.58203125, + "learning_rate": 7.980736831785178e-06, + "loss": 1.5068, + "step": 6568 + }, + { + "epoch": 1.1332700767704649, + "grad_norm": 0.60546875, + "learning_rate": 7.978071096451198e-06, + "loss": 1.4126, + "step": 6569 + }, + { + "epoch": 1.1334425946691968, + "grad_norm": 0.5859375, + "learning_rate": 7.975405510914338e-06, + "loss": 1.3754, + "step": 6570 + }, + { + "epoch": 1.133615112567929, + "grad_norm": 0.5625, + "learning_rate": 7.972740075372094e-06, + "loss": 1.4839, + "step": 6571 + }, + { + "epoch": 1.133787630466661, + "grad_norm": 0.5859375, + "learning_rate": 7.970074790021928e-06, + "loss": 1.379, + "step": 6572 + }, + { + "epoch": 1.133960148365393, + "grad_norm": 0.59375, + "learning_rate": 7.967409655061308e-06, + "loss": 1.4018, + "step": 6573 + }, + { + "epoch": 1.134132666264125, + "grad_norm": 0.640625, + "learning_rate": 7.964744670687676e-06, + "loss": 1.3949, + "step": 6574 + }, + { + "epoch": 1.1343051841628569, + "grad_norm": 0.578125, + "learning_rate": 7.962079837098481e-06, + "loss": 1.4509, + "step": 6575 + }, + { + "epoch": 1.1344777020615888, + "grad_norm": 0.55859375, + "learning_rate": 7.959415154491142e-06, + "loss": 1.3816, + "step": 6576 + }, + { + "epoch": 1.1346502199603208, + "grad_norm": 0.5546875, + "learning_rate": 7.956750623063077e-06, + "loss": 1.4659, + "step": 6577 + }, + { + "epoch": 1.134822737859053, + "grad_norm": 0.5546875, + "learning_rate": 7.954086243011698e-06, + "loss": 1.3871, + "step": 6578 + }, + { + "epoch": 1.134995255757785, + "grad_norm": 0.609375, + "learning_rate": 7.95142201453439e-06, + "loss": 1.354, + "step": 6579 + }, + { + "epoch": 1.1351677736565169, + "grad_norm": 0.57421875, + "learning_rate": 7.948757937828542e-06, + "loss": 1.405, + "step": 6580 + }, + { + "epoch": 1.1353402915552488, + "grad_norm": 0.609375, + "learning_rate": 7.946094013091518e-06, + "loss": 1.4215, + "step": 6581 + }, + { + "epoch": 1.1355128094539808, + "grad_norm": 0.59375, + "learning_rate": 7.943430240520689e-06, + "loss": 1.3913, + "step": 6582 + }, + { + "epoch": 1.1356853273527128, + "grad_norm": 0.578125, + "learning_rate": 7.940766620313394e-06, + "loss": 1.394, + "step": 6583 + }, + { + "epoch": 1.135857845251445, + "grad_norm": 0.58984375, + "learning_rate": 7.938103152666976e-06, + "loss": 1.374, + "step": 6584 + }, + { + "epoch": 1.136030363150177, + "grad_norm": 0.64453125, + "learning_rate": 7.935439837778757e-06, + "loss": 1.3971, + "step": 6585 + }, + { + "epoch": 1.1362028810489089, + "grad_norm": 0.66796875, + "learning_rate": 7.932776675846058e-06, + "loss": 1.4629, + "step": 6586 + }, + { + "epoch": 1.1363753989476408, + "grad_norm": 0.59375, + "learning_rate": 7.930113667066177e-06, + "loss": 1.4402, + "step": 6587 + }, + { + "epoch": 1.1365479168463728, + "grad_norm": 0.62890625, + "learning_rate": 7.927450811636413e-06, + "loss": 1.3988, + "step": 6588 + }, + { + "epoch": 1.1367204347451048, + "grad_norm": 0.6015625, + "learning_rate": 7.924788109754036e-06, + "loss": 1.355, + "step": 6589 + }, + { + "epoch": 1.1368929526438367, + "grad_norm": 0.62109375, + "learning_rate": 7.922125561616329e-06, + "loss": 1.397, + "step": 6590 + }, + { + "epoch": 1.1370654705425687, + "grad_norm": 0.66796875, + "learning_rate": 7.919463167420538e-06, + "loss": 1.5036, + "step": 6591 + }, + { + "epoch": 1.1372379884413009, + "grad_norm": 0.64453125, + "learning_rate": 7.91680092736392e-06, + "loss": 1.4029, + "step": 6592 + }, + { + "epoch": 1.1374105063400328, + "grad_norm": 0.57421875, + "learning_rate": 7.914138841643702e-06, + "loss": 1.4439, + "step": 6593 + }, + { + "epoch": 1.1375830242387648, + "grad_norm": 0.6328125, + "learning_rate": 7.91147691045711e-06, + "loss": 1.4961, + "step": 6594 + }, + { + "epoch": 1.1377555421374967, + "grad_norm": 0.60546875, + "learning_rate": 7.908815134001363e-06, + "loss": 1.3272, + "step": 6595 + }, + { + "epoch": 1.1379280600362287, + "grad_norm": 0.62890625, + "learning_rate": 7.906153512473656e-06, + "loss": 1.4737, + "step": 6596 + }, + { + "epoch": 1.1381005779349607, + "grad_norm": 0.55859375, + "learning_rate": 7.903492046071182e-06, + "loss": 1.3569, + "step": 6597 + }, + { + "epoch": 1.1382730958336928, + "grad_norm": 0.609375, + "learning_rate": 7.900830734991111e-06, + "loss": 1.4397, + "step": 6598 + }, + { + "epoch": 1.1384456137324248, + "grad_norm": 0.69921875, + "learning_rate": 7.89816957943062e-06, + "loss": 1.3377, + "step": 6599 + }, + { + "epoch": 1.1386181316311568, + "grad_norm": 0.5859375, + "learning_rate": 7.895508579586857e-06, + "loss": 1.5259, + "step": 6600 + }, + { + "epoch": 1.1386181316311568, + "eval_loss": 1.408766746520996, + "eval_runtime": 10.7693, + "eval_samples_per_second": 95.085, + "eval_steps_per_second": 23.771, + "step": 6600 + }, + { + "epoch": 1.1387906495298887, + "grad_norm": 0.6328125, + "learning_rate": 7.892847735656972e-06, + "loss": 1.528, + "step": 6601 + }, + { + "epoch": 1.1389631674286207, + "grad_norm": 0.58984375, + "learning_rate": 7.890187047838087e-06, + "loss": 1.4519, + "step": 6602 + }, + { + "epoch": 1.1391356853273527, + "grad_norm": 0.609375, + "learning_rate": 7.887526516327334e-06, + "loss": 1.4995, + "step": 6603 + }, + { + "epoch": 1.1393082032260846, + "grad_norm": 0.66015625, + "learning_rate": 7.884866141321811e-06, + "loss": 1.4422, + "step": 6604 + }, + { + "epoch": 1.1394807211248166, + "grad_norm": 0.54296875, + "learning_rate": 7.882205923018624e-06, + "loss": 1.4153, + "step": 6605 + }, + { + "epoch": 1.1396532390235488, + "grad_norm": 0.6171875, + "learning_rate": 7.879545861614851e-06, + "loss": 1.4228, + "step": 6606 + }, + { + "epoch": 1.1398257569222807, + "grad_norm": 0.5390625, + "learning_rate": 7.876885957307573e-06, + "loss": 1.3649, + "step": 6607 + }, + { + "epoch": 1.1399982748210127, + "grad_norm": 0.63671875, + "learning_rate": 7.874226210293847e-06, + "loss": 1.4664, + "step": 6608 + }, + { + "epoch": 1.1401707927197446, + "grad_norm": 0.66796875, + "learning_rate": 7.871566620770726e-06, + "loss": 1.4722, + "step": 6609 + }, + { + "epoch": 1.1403433106184766, + "grad_norm": 0.67578125, + "learning_rate": 7.86890718893525e-06, + "loss": 1.3472, + "step": 6610 + }, + { + "epoch": 1.1405158285172088, + "grad_norm": 0.8515625, + "learning_rate": 7.866247914984444e-06, + "loss": 1.3521, + "step": 6611 + }, + { + "epoch": 1.1406883464159407, + "grad_norm": 0.82421875, + "learning_rate": 7.863588799115327e-06, + "loss": 1.3674, + "step": 6612 + }, + { + "epoch": 1.1408608643146727, + "grad_norm": 0.62890625, + "learning_rate": 7.860929841524898e-06, + "loss": 1.3509, + "step": 6613 + }, + { + "epoch": 1.1410333822134047, + "grad_norm": 0.63671875, + "learning_rate": 7.858271042410153e-06, + "loss": 1.4391, + "step": 6614 + }, + { + "epoch": 1.1412059001121366, + "grad_norm": 0.8125, + "learning_rate": 7.855612401968072e-06, + "loss": 1.5101, + "step": 6615 + }, + { + "epoch": 1.1413784180108686, + "grad_norm": 0.62109375, + "learning_rate": 7.852953920395623e-06, + "loss": 1.4194, + "step": 6616 + }, + { + "epoch": 1.1415509359096006, + "grad_norm": 0.56640625, + "learning_rate": 7.85029559788976e-06, + "loss": 1.458, + "step": 6617 + }, + { + "epoch": 1.1417234538083325, + "grad_norm": 0.67578125, + "learning_rate": 7.847637434647436e-06, + "loss": 1.4617, + "step": 6618 + }, + { + "epoch": 1.1418959717070647, + "grad_norm": 0.59765625, + "learning_rate": 7.844979430865575e-06, + "loss": 1.3958, + "step": 6619 + }, + { + "epoch": 1.1420684896057967, + "grad_norm": 0.7109375, + "learning_rate": 7.842321586741107e-06, + "loss": 1.516, + "step": 6620 + }, + { + "epoch": 1.1422410075045286, + "grad_norm": 0.86328125, + "learning_rate": 7.839663902470933e-06, + "loss": 1.4452, + "step": 6621 + }, + { + "epoch": 1.1424135254032606, + "grad_norm": 4.40625, + "learning_rate": 7.837006378251959e-06, + "loss": 1.3733, + "step": 6622 + }, + { + "epoch": 1.1425860433019925, + "grad_norm": 0.60546875, + "learning_rate": 7.834349014281065e-06, + "loss": 1.4408, + "step": 6623 + }, + { + "epoch": 1.1427585612007245, + "grad_norm": 0.6640625, + "learning_rate": 7.83169181075513e-06, + "loss": 1.4705, + "step": 6624 + }, + { + "epoch": 1.1429310790994567, + "grad_norm": 0.62109375, + "learning_rate": 7.82903476787101e-06, + "loss": 1.3875, + "step": 6625 + }, + { + "epoch": 1.1431035969981886, + "grad_norm": 0.59765625, + "learning_rate": 7.826377885825561e-06, + "loss": 1.5407, + "step": 6626 + }, + { + "epoch": 1.1432761148969206, + "grad_norm": 0.6328125, + "learning_rate": 7.823721164815624e-06, + "loss": 1.4636, + "step": 6627 + }, + { + "epoch": 1.1434486327956526, + "grad_norm": 0.640625, + "learning_rate": 7.821064605038016e-06, + "loss": 1.4006, + "step": 6628 + }, + { + "epoch": 1.1436211506943845, + "grad_norm": 0.6171875, + "learning_rate": 7.818408206689561e-06, + "loss": 1.4363, + "step": 6629 + }, + { + "epoch": 1.1437936685931165, + "grad_norm": 0.62109375, + "learning_rate": 7.815751969967052e-06, + "loss": 1.3846, + "step": 6630 + }, + { + "epoch": 1.1439661864918484, + "grad_norm": 0.57421875, + "learning_rate": 7.813095895067289e-06, + "loss": 1.4288, + "step": 6631 + }, + { + "epoch": 1.1441387043905804, + "grad_norm": 0.63671875, + "learning_rate": 7.810439982187045e-06, + "loss": 1.3967, + "step": 6632 + }, + { + "epoch": 1.1443112222893126, + "grad_norm": 0.66796875, + "learning_rate": 7.80778423152309e-06, + "loss": 1.3882, + "step": 6633 + }, + { + "epoch": 1.1444837401880446, + "grad_norm": 0.58203125, + "learning_rate": 7.805128643272171e-06, + "loss": 1.4812, + "step": 6634 + }, + { + "epoch": 1.1446562580867765, + "grad_norm": 0.6953125, + "learning_rate": 7.802473217631043e-06, + "loss": 1.4707, + "step": 6635 + }, + { + "epoch": 1.1448287759855085, + "grad_norm": 0.58984375, + "learning_rate": 7.799817954796427e-06, + "loss": 1.4223, + "step": 6636 + }, + { + "epoch": 1.1450012938842404, + "grad_norm": 0.67578125, + "learning_rate": 7.797162854965046e-06, + "loss": 1.401, + "step": 6637 + }, + { + "epoch": 1.1451738117829724, + "grad_norm": 0.6796875, + "learning_rate": 7.7945079183336e-06, + "loss": 1.3815, + "step": 6638 + }, + { + "epoch": 1.1453463296817046, + "grad_norm": 0.58203125, + "learning_rate": 7.791853145098792e-06, + "loss": 1.4514, + "step": 6639 + }, + { + "epoch": 1.1455188475804365, + "grad_norm": 0.6484375, + "learning_rate": 7.7891985354573e-06, + "loss": 1.4598, + "step": 6640 + }, + { + "epoch": 1.1456913654791685, + "grad_norm": 0.62109375, + "learning_rate": 7.78654408960579e-06, + "loss": 1.3204, + "step": 6641 + }, + { + "epoch": 1.1458638833779005, + "grad_norm": 0.6171875, + "learning_rate": 7.78388980774093e-06, + "loss": 1.5014, + "step": 6642 + }, + { + "epoch": 1.1460364012766324, + "grad_norm": 0.578125, + "learning_rate": 7.781235690059356e-06, + "loss": 1.3385, + "step": 6643 + }, + { + "epoch": 1.1462089191753644, + "grad_norm": 0.59765625, + "learning_rate": 7.77858173675771e-06, + "loss": 1.4659, + "step": 6644 + }, + { + "epoch": 1.1463814370740963, + "grad_norm": 0.5859375, + "learning_rate": 7.775927948032602e-06, + "loss": 1.3328, + "step": 6645 + }, + { + "epoch": 1.1465539549728285, + "grad_norm": 2.5625, + "learning_rate": 7.773274324080655e-06, + "loss": 1.4962, + "step": 6646 + }, + { + "epoch": 1.1467264728715605, + "grad_norm": 0.66796875, + "learning_rate": 7.770620865098455e-06, + "loss": 1.3894, + "step": 6647 + }, + { + "epoch": 1.1468989907702924, + "grad_norm": 0.62109375, + "learning_rate": 7.767967571282595e-06, + "loss": 1.4642, + "step": 6648 + }, + { + "epoch": 1.1470715086690244, + "grad_norm": 0.66796875, + "learning_rate": 7.76531444282964e-06, + "loss": 1.4378, + "step": 6649 + }, + { + "epoch": 1.1472440265677564, + "grad_norm": 0.76171875, + "learning_rate": 7.762661479936157e-06, + "loss": 1.4204, + "step": 6650 + }, + { + "epoch": 1.1474165444664883, + "grad_norm": 0.5703125, + "learning_rate": 7.760008682798687e-06, + "loss": 1.5035, + "step": 6651 + }, + { + "epoch": 1.1475890623652205, + "grad_norm": 0.671875, + "learning_rate": 7.757356051613774e-06, + "loss": 1.4768, + "step": 6652 + }, + { + "epoch": 1.1477615802639525, + "grad_norm": 0.640625, + "learning_rate": 7.754703586577935e-06, + "loss": 1.4607, + "step": 6653 + }, + { + "epoch": 1.1479340981626844, + "grad_norm": 0.5546875, + "learning_rate": 7.752051287887685e-06, + "loss": 1.3546, + "step": 6654 + }, + { + "epoch": 1.1481066160614164, + "grad_norm": 0.5703125, + "learning_rate": 7.74939915573952e-06, + "loss": 1.3577, + "step": 6655 + }, + { + "epoch": 1.1482791339601484, + "grad_norm": 0.59375, + "learning_rate": 7.74674719032993e-06, + "loss": 1.402, + "step": 6656 + }, + { + "epoch": 1.1484516518588803, + "grad_norm": 0.5703125, + "learning_rate": 7.744095391855386e-06, + "loss": 1.4069, + "step": 6657 + }, + { + "epoch": 1.1486241697576123, + "grad_norm": 0.55859375, + "learning_rate": 7.741443760512348e-06, + "loss": 1.4762, + "step": 6658 + }, + { + "epoch": 1.1487966876563442, + "grad_norm": 0.6171875, + "learning_rate": 7.738792296497272e-06, + "loss": 1.416, + "step": 6659 + }, + { + "epoch": 1.1489692055550764, + "grad_norm": 0.62890625, + "learning_rate": 7.736141000006589e-06, + "loss": 1.4218, + "step": 6660 + }, + { + "epoch": 1.1491417234538084, + "grad_norm": 0.734375, + "learning_rate": 7.73348987123673e-06, + "loss": 1.4855, + "step": 6661 + }, + { + "epoch": 1.1493142413525403, + "grad_norm": 0.62890625, + "learning_rate": 7.730838910384098e-06, + "loss": 1.4101, + "step": 6662 + }, + { + "epoch": 1.1494867592512723, + "grad_norm": 0.70703125, + "learning_rate": 7.728188117645103e-06, + "loss": 1.3949, + "step": 6663 + }, + { + "epoch": 1.1496592771500043, + "grad_norm": 0.72265625, + "learning_rate": 7.725537493216125e-06, + "loss": 1.3943, + "step": 6664 + }, + { + "epoch": 1.1498317950487362, + "grad_norm": 0.5859375, + "learning_rate": 7.72288703729354e-06, + "loss": 1.3493, + "step": 6665 + }, + { + "epoch": 1.1500043129474684, + "grad_norm": 1.0546875, + "learning_rate": 7.72023675007371e-06, + "loss": 1.3391, + "step": 6666 + }, + { + "epoch": 1.1501768308462004, + "grad_norm": 0.609375, + "learning_rate": 7.71758663175299e-06, + "loss": 1.3756, + "step": 6667 + }, + { + "epoch": 1.1503493487449323, + "grad_norm": 0.625, + "learning_rate": 7.714936682527712e-06, + "loss": 1.4111, + "step": 6668 + }, + { + "epoch": 1.1505218666436643, + "grad_norm": 0.58203125, + "learning_rate": 7.712286902594205e-06, + "loss": 1.3056, + "step": 6669 + }, + { + "epoch": 1.1506943845423963, + "grad_norm": 0.5859375, + "learning_rate": 7.709637292148771e-06, + "loss": 1.4415, + "step": 6670 + }, + { + "epoch": 1.1508669024411282, + "grad_norm": 0.6171875, + "learning_rate": 7.706987851387724e-06, + "loss": 1.4628, + "step": 6671 + }, + { + "epoch": 1.1510394203398602, + "grad_norm": 0.66015625, + "learning_rate": 7.704338580507341e-06, + "loss": 1.4112, + "step": 6672 + }, + { + "epoch": 1.1512119382385921, + "grad_norm": 0.6171875, + "learning_rate": 7.701689479703899e-06, + "loss": 1.422, + "step": 6673 + }, + { + "epoch": 1.1513844561373243, + "grad_norm": 0.75390625, + "learning_rate": 7.699040549173664e-06, + "loss": 1.4545, + "step": 6674 + }, + { + "epoch": 1.1515569740360563, + "grad_norm": 0.5625, + "learning_rate": 7.69639178911288e-06, + "loss": 1.3825, + "step": 6675 + }, + { + "epoch": 1.1517294919347882, + "grad_norm": 0.6875, + "learning_rate": 7.693743199717789e-06, + "loss": 1.4206, + "step": 6676 + }, + { + "epoch": 1.1519020098335202, + "grad_norm": 0.5859375, + "learning_rate": 7.691094781184608e-06, + "loss": 1.4968, + "step": 6677 + }, + { + "epoch": 1.1520745277322522, + "grad_norm": 0.59375, + "learning_rate": 7.688446533709556e-06, + "loss": 1.4569, + "step": 6678 + }, + { + "epoch": 1.1522470456309841, + "grad_norm": 0.60546875, + "learning_rate": 7.685798457488824e-06, + "loss": 1.435, + "step": 6679 + }, + { + "epoch": 1.1524195635297163, + "grad_norm": 0.60546875, + "learning_rate": 7.683150552718608e-06, + "loss": 1.4661, + "step": 6680 + }, + { + "epoch": 1.1525920814284483, + "grad_norm": 0.60546875, + "learning_rate": 7.680502819595067e-06, + "loss": 1.3847, + "step": 6681 + }, + { + "epoch": 1.1527645993271802, + "grad_norm": 0.62890625, + "learning_rate": 7.677855258314378e-06, + "loss": 1.5205, + "step": 6682 + }, + { + "epoch": 1.1529371172259122, + "grad_norm": 0.63671875, + "learning_rate": 7.675207869072675e-06, + "loss": 1.4652, + "step": 6683 + }, + { + "epoch": 1.1531096351246442, + "grad_norm": 0.5546875, + "learning_rate": 7.672560652066104e-06, + "loss": 1.3736, + "step": 6684 + }, + { + "epoch": 1.153282153023376, + "grad_norm": 0.578125, + "learning_rate": 7.66991360749078e-06, + "loss": 1.4111, + "step": 6685 + }, + { + "epoch": 1.153454670922108, + "grad_norm": 0.60546875, + "learning_rate": 7.667266735542816e-06, + "loss": 1.4514, + "step": 6686 + }, + { + "epoch": 1.1536271888208403, + "grad_norm": 0.6875, + "learning_rate": 7.664620036418304e-06, + "loss": 1.3545, + "step": 6687 + }, + { + "epoch": 1.1537997067195722, + "grad_norm": 0.65234375, + "learning_rate": 7.661973510313336e-06, + "loss": 1.2812, + "step": 6688 + }, + { + "epoch": 1.1539722246183042, + "grad_norm": 0.8359375, + "learning_rate": 7.659327157423977e-06, + "loss": 1.4696, + "step": 6689 + }, + { + "epoch": 1.1541447425170361, + "grad_norm": 0.62109375, + "learning_rate": 7.656680977946286e-06, + "loss": 1.4847, + "step": 6690 + }, + { + "epoch": 1.154317260415768, + "grad_norm": 0.66796875, + "learning_rate": 7.654034972076314e-06, + "loss": 1.3304, + "step": 6691 + }, + { + "epoch": 1.1544897783145, + "grad_norm": 0.59375, + "learning_rate": 7.651389140010087e-06, + "loss": 1.3784, + "step": 6692 + }, + { + "epoch": 1.1546622962132322, + "grad_norm": 0.78125, + "learning_rate": 7.648743481943628e-06, + "loss": 1.4342, + "step": 6693 + }, + { + "epoch": 1.1548348141119642, + "grad_norm": 0.609375, + "learning_rate": 7.646097998072941e-06, + "loss": 1.4824, + "step": 6694 + }, + { + "epoch": 1.1550073320106962, + "grad_norm": 0.61328125, + "learning_rate": 7.643452688594026e-06, + "loss": 1.3911, + "step": 6695 + }, + { + "epoch": 1.1551798499094281, + "grad_norm": 0.578125, + "learning_rate": 7.640807553702858e-06, + "loss": 1.3827, + "step": 6696 + }, + { + "epoch": 1.15535236780816, + "grad_norm": 0.71484375, + "learning_rate": 7.63816259359541e-06, + "loss": 1.3959, + "step": 6697 + }, + { + "epoch": 1.155524885706892, + "grad_norm": 0.58984375, + "learning_rate": 7.63551780846763e-06, + "loss": 1.4016, + "step": 6698 + }, + { + "epoch": 1.155697403605624, + "grad_norm": 0.62890625, + "learning_rate": 7.632873198515468e-06, + "loss": 1.4195, + "step": 6699 + }, + { + "epoch": 1.155869921504356, + "grad_norm": 0.5625, + "learning_rate": 7.630228763934848e-06, + "loss": 1.396, + "step": 6700 + }, + { + "epoch": 1.155869921504356, + "eval_loss": 1.408650517463684, + "eval_runtime": 10.8332, + "eval_samples_per_second": 94.525, + "eval_steps_per_second": 23.631, + "step": 6700 + }, + { + "epoch": 1.1560424394030882, + "grad_norm": 0.63671875, + "learning_rate": 7.6275845049216914e-06, + "loss": 1.4485, + "step": 6701 + }, + { + "epoch": 1.1562149573018201, + "grad_norm": 0.5859375, + "learning_rate": 7.624940421671893e-06, + "loss": 1.4597, + "step": 6702 + }, + { + "epoch": 1.156387475200552, + "grad_norm": 0.72265625, + "learning_rate": 7.622296514381353e-06, + "loss": 1.4152, + "step": 6703 + }, + { + "epoch": 1.156559993099284, + "grad_norm": 0.5859375, + "learning_rate": 7.619652783245941e-06, + "loss": 1.4197, + "step": 6704 + }, + { + "epoch": 1.156732510998016, + "grad_norm": 0.66796875, + "learning_rate": 7.617009228461527e-06, + "loss": 1.4941, + "step": 6705 + }, + { + "epoch": 1.156905028896748, + "grad_norm": 0.60546875, + "learning_rate": 7.6143658502239546e-06, + "loss": 1.4696, + "step": 6706 + }, + { + "epoch": 1.1570775467954801, + "grad_norm": 0.640625, + "learning_rate": 7.611722648729065e-06, + "loss": 1.3503, + "step": 6707 + }, + { + "epoch": 1.157250064694212, + "grad_norm": 0.59765625, + "learning_rate": 7.609079624172692e-06, + "loss": 1.4071, + "step": 6708 + }, + { + "epoch": 1.157422582592944, + "grad_norm": 0.609375, + "learning_rate": 7.606436776750632e-06, + "loss": 1.3535, + "step": 6709 + }, + { + "epoch": 1.157595100491676, + "grad_norm": 0.6171875, + "learning_rate": 7.603794106658696e-06, + "loss": 1.515, + "step": 6710 + }, + { + "epoch": 1.157767618390408, + "grad_norm": 0.59375, + "learning_rate": 7.601151614092661e-06, + "loss": 1.4618, + "step": 6711 + }, + { + "epoch": 1.15794013628914, + "grad_norm": 0.6015625, + "learning_rate": 7.598509299248307e-06, + "loss": 1.4592, + "step": 6712 + }, + { + "epoch": 1.158112654187872, + "grad_norm": 0.5703125, + "learning_rate": 7.595867162321388e-06, + "loss": 1.4808, + "step": 6713 + }, + { + "epoch": 1.1582851720866039, + "grad_norm": 0.8125, + "learning_rate": 7.593225203507652e-06, + "loss": 1.3933, + "step": 6714 + }, + { + "epoch": 1.158457689985336, + "grad_norm": 0.6171875, + "learning_rate": 7.590583423002828e-06, + "loss": 1.5316, + "step": 6715 + }, + { + "epoch": 1.158630207884068, + "grad_norm": 0.6640625, + "learning_rate": 7.5879418210026425e-06, + "loss": 1.4468, + "step": 6716 + }, + { + "epoch": 1.1588027257828, + "grad_norm": 0.6875, + "learning_rate": 7.585300397702795e-06, + "loss": 1.4647, + "step": 6717 + }, + { + "epoch": 1.158975243681532, + "grad_norm": 0.6015625, + "learning_rate": 7.5826591532989855e-06, + "loss": 1.4042, + "step": 6718 + }, + { + "epoch": 1.159147761580264, + "grad_norm": 0.5625, + "learning_rate": 7.580018087986886e-06, + "loss": 1.4184, + "step": 6719 + }, + { + "epoch": 1.1593202794789959, + "grad_norm": 0.60546875, + "learning_rate": 7.57737720196217e-06, + "loss": 1.5034, + "step": 6720 + }, + { + "epoch": 1.159492797377728, + "grad_norm": 0.7109375, + "learning_rate": 7.574736495420487e-06, + "loss": 1.4559, + "step": 6721 + }, + { + "epoch": 1.15966531527646, + "grad_norm": 0.66015625, + "learning_rate": 7.572095968557476e-06, + "loss": 1.4681, + "step": 6722 + }, + { + "epoch": 1.159837833175192, + "grad_norm": 0.74609375, + "learning_rate": 7.56945562156877e-06, + "loss": 1.4418, + "step": 6723 + }, + { + "epoch": 1.160010351073924, + "grad_norm": 0.625, + "learning_rate": 7.566815454649976e-06, + "loss": 1.3457, + "step": 6724 + }, + { + "epoch": 1.1601828689726559, + "grad_norm": 0.61328125, + "learning_rate": 7.5641754679967e-06, + "loss": 1.3257, + "step": 6725 + }, + { + "epoch": 1.1603553868713878, + "grad_norm": 0.68359375, + "learning_rate": 7.561535661804519e-06, + "loss": 1.4105, + "step": 6726 + }, + { + "epoch": 1.1605279047701198, + "grad_norm": 0.75390625, + "learning_rate": 7.558896036269017e-06, + "loss": 1.4474, + "step": 6727 + }, + { + "epoch": 1.160700422668852, + "grad_norm": 0.6328125, + "learning_rate": 7.556256591585747e-06, + "loss": 1.4649, + "step": 6728 + }, + { + "epoch": 1.160872940567584, + "grad_norm": 0.625, + "learning_rate": 7.5536173279502615e-06, + "loss": 1.423, + "step": 6729 + }, + { + "epoch": 1.161045458466316, + "grad_norm": 0.5625, + "learning_rate": 7.550978245558084e-06, + "loss": 1.4107, + "step": 6730 + }, + { + "epoch": 1.1612179763650479, + "grad_norm": 0.59765625, + "learning_rate": 7.548339344604745e-06, + "loss": 1.4978, + "step": 6731 + }, + { + "epoch": 1.1613904942637798, + "grad_norm": 0.66015625, + "learning_rate": 7.5457006252857445e-06, + "loss": 1.5156, + "step": 6732 + }, + { + "epoch": 1.1615630121625118, + "grad_norm": 0.6328125, + "learning_rate": 7.543062087796579e-06, + "loss": 1.3659, + "step": 6733 + }, + { + "epoch": 1.161735530061244, + "grad_norm": 0.6015625, + "learning_rate": 7.540423732332721e-06, + "loss": 1.4551, + "step": 6734 + }, + { + "epoch": 1.161908047959976, + "grad_norm": 0.59375, + "learning_rate": 7.537785559089646e-06, + "loss": 1.3231, + "step": 6735 + }, + { + "epoch": 1.162080565858708, + "grad_norm": 0.5859375, + "learning_rate": 7.535147568262799e-06, + "loss": 1.4627, + "step": 6736 + }, + { + "epoch": 1.1622530837574399, + "grad_norm": 0.62890625, + "learning_rate": 7.532509760047621e-06, + "loss": 1.4135, + "step": 6737 + }, + { + "epoch": 1.1624256016561718, + "grad_norm": 0.59765625, + "learning_rate": 7.529872134639537e-06, + "loss": 1.3578, + "step": 6738 + }, + { + "epoch": 1.1625981195549038, + "grad_norm": 0.65625, + "learning_rate": 7.527234692233957e-06, + "loss": 1.3597, + "step": 6739 + }, + { + "epoch": 1.1627706374536357, + "grad_norm": 0.58203125, + "learning_rate": 7.524597433026286e-06, + "loss": 1.3543, + "step": 6740 + }, + { + "epoch": 1.1629431553523677, + "grad_norm": 0.6328125, + "learning_rate": 7.521960357211904e-06, + "loss": 1.4293, + "step": 6741 + }, + { + "epoch": 1.1631156732510999, + "grad_norm": 0.62109375, + "learning_rate": 7.519323464986182e-06, + "loss": 1.4101, + "step": 6742 + }, + { + "epoch": 1.1632881911498318, + "grad_norm": 1.1640625, + "learning_rate": 7.516686756544475e-06, + "loss": 1.3559, + "step": 6743 + }, + { + "epoch": 1.1634607090485638, + "grad_norm": 0.66796875, + "learning_rate": 7.514050232082133e-06, + "loss": 1.5265, + "step": 6744 + }, + { + "epoch": 1.1636332269472958, + "grad_norm": 0.63671875, + "learning_rate": 7.511413891794482e-06, + "loss": 1.4018, + "step": 6745 + }, + { + "epoch": 1.1638057448460277, + "grad_norm": 0.65234375, + "learning_rate": 7.508777735876839e-06, + "loss": 1.4592, + "step": 6746 + }, + { + "epoch": 1.1639782627447597, + "grad_norm": 0.61328125, + "learning_rate": 7.5061417645245034e-06, + "loss": 1.3904, + "step": 6747 + }, + { + "epoch": 1.1641507806434919, + "grad_norm": 0.5703125, + "learning_rate": 7.503505977932775e-06, + "loss": 1.4482, + "step": 6748 + }, + { + "epoch": 1.1643232985422238, + "grad_norm": 0.640625, + "learning_rate": 7.500870376296918e-06, + "loss": 1.4214, + "step": 6749 + }, + { + "epoch": 1.1644958164409558, + "grad_norm": 0.62890625, + "learning_rate": 7.4982349598122e-06, + "loss": 1.3156, + "step": 6750 + }, + { + "epoch": 1.1646683343396877, + "grad_norm": 0.6484375, + "learning_rate": 7.495599728673867e-06, + "loss": 1.3671, + "step": 6751 + }, + { + "epoch": 1.1648408522384197, + "grad_norm": 0.6328125, + "learning_rate": 7.492964683077156e-06, + "loss": 1.4576, + "step": 6752 + }, + { + "epoch": 1.1650133701371517, + "grad_norm": 0.58203125, + "learning_rate": 7.490329823217286e-06, + "loss": 1.321, + "step": 6753 + }, + { + "epoch": 1.1651858880358836, + "grad_norm": 0.5859375, + "learning_rate": 7.48769514928946e-06, + "loss": 1.3814, + "step": 6754 + }, + { + "epoch": 1.1653584059346156, + "grad_norm": 0.62109375, + "learning_rate": 7.485060661488879e-06, + "loss": 1.4575, + "step": 6755 + }, + { + "epoch": 1.1655309238333478, + "grad_norm": 0.65625, + "learning_rate": 7.482426360010717e-06, + "loss": 1.442, + "step": 6756 + }, + { + "epoch": 1.1657034417320797, + "grad_norm": 0.58203125, + "learning_rate": 7.479792245050142e-06, + "loss": 1.4058, + "step": 6757 + }, + { + "epoch": 1.1658759596308117, + "grad_norm": 0.6171875, + "learning_rate": 7.477158316802302e-06, + "loss": 1.4523, + "step": 6758 + }, + { + "epoch": 1.1660484775295437, + "grad_norm": 0.66015625, + "learning_rate": 7.474524575462341e-06, + "loss": 1.4831, + "step": 6759 + }, + { + "epoch": 1.1662209954282756, + "grad_norm": 0.671875, + "learning_rate": 7.471891021225376e-06, + "loss": 1.5114, + "step": 6760 + }, + { + "epoch": 1.1663935133270076, + "grad_norm": 0.62890625, + "learning_rate": 7.469257654286524e-06, + "loss": 1.4559, + "step": 6761 + }, + { + "epoch": 1.1665660312257398, + "grad_norm": 0.53515625, + "learning_rate": 7.466624474840872e-06, + "loss": 1.3233, + "step": 6762 + }, + { + "epoch": 1.1667385491244717, + "grad_norm": 0.578125, + "learning_rate": 7.463991483083512e-06, + "loss": 1.432, + "step": 6763 + }, + { + "epoch": 1.1669110670232037, + "grad_norm": 0.8203125, + "learning_rate": 7.461358679209509e-06, + "loss": 1.3795, + "step": 6764 + }, + { + "epoch": 1.1670835849219356, + "grad_norm": 0.609375, + "learning_rate": 7.458726063413918e-06, + "loss": 1.2887, + "step": 6765 + }, + { + "epoch": 1.1672561028206676, + "grad_norm": 0.55859375, + "learning_rate": 7.4560936358917745e-06, + "loss": 1.4637, + "step": 6766 + }, + { + "epoch": 1.1674286207193996, + "grad_norm": 0.5703125, + "learning_rate": 7.4534613968381146e-06, + "loss": 1.4384, + "step": 6767 + }, + { + "epoch": 1.1676011386181315, + "grad_norm": 0.58984375, + "learning_rate": 7.450829346447941e-06, + "loss": 1.4334, + "step": 6768 + }, + { + "epoch": 1.1677736565168637, + "grad_norm": 0.5703125, + "learning_rate": 7.448197484916264e-06, + "loss": 1.4752, + "step": 6769 + }, + { + "epoch": 1.1679461744155957, + "grad_norm": 0.61328125, + "learning_rate": 7.4455658124380545e-06, + "loss": 1.3971, + "step": 6770 + }, + { + "epoch": 1.1681186923143276, + "grad_norm": 0.67578125, + "learning_rate": 7.442934329208291e-06, + "loss": 1.457, + "step": 6771 + }, + { + "epoch": 1.1682912102130596, + "grad_norm": 0.5625, + "learning_rate": 7.440303035421934e-06, + "loss": 1.3178, + "step": 6772 + }, + { + "epoch": 1.1684637281117916, + "grad_norm": 0.59375, + "learning_rate": 7.43767193127392e-06, + "loss": 1.4777, + "step": 6773 + }, + { + "epoch": 1.1686362460105235, + "grad_norm": 0.60546875, + "learning_rate": 7.4350410169591815e-06, + "loss": 1.3621, + "step": 6774 + }, + { + "epoch": 1.1688087639092557, + "grad_norm": 0.59765625, + "learning_rate": 7.432410292672627e-06, + "loss": 1.3563, + "step": 6775 + }, + { + "epoch": 1.1689812818079877, + "grad_norm": 0.65625, + "learning_rate": 7.429779758609165e-06, + "loss": 1.3639, + "step": 6776 + }, + { + "epoch": 1.1691537997067196, + "grad_norm": 0.6171875, + "learning_rate": 7.427149414963676e-06, + "loss": 1.4728, + "step": 6777 + }, + { + "epoch": 1.1693263176054516, + "grad_norm": 0.5625, + "learning_rate": 7.424519261931036e-06, + "loss": 1.426, + "step": 6778 + }, + { + "epoch": 1.1694988355041835, + "grad_norm": 0.76171875, + "learning_rate": 7.421889299706098e-06, + "loss": 1.4206, + "step": 6779 + }, + { + "epoch": 1.1696713534029155, + "grad_norm": 0.6484375, + "learning_rate": 7.419259528483713e-06, + "loss": 1.4275, + "step": 6780 + }, + { + "epoch": 1.1698438713016475, + "grad_norm": 0.6015625, + "learning_rate": 7.416629948458705e-06, + "loss": 1.5017, + "step": 6781 + }, + { + "epoch": 1.1700163892003794, + "grad_norm": 0.74609375, + "learning_rate": 7.414000559825893e-06, + "loss": 1.3737, + "step": 6782 + }, + { + "epoch": 1.1701889070991116, + "grad_norm": 0.60546875, + "learning_rate": 7.411371362780076e-06, + "loss": 1.4951, + "step": 6783 + }, + { + "epoch": 1.1703614249978436, + "grad_norm": 0.66796875, + "learning_rate": 7.408742357516046e-06, + "loss": 1.4681, + "step": 6784 + }, + { + "epoch": 1.1705339428965755, + "grad_norm": 0.58984375, + "learning_rate": 7.406113544228571e-06, + "loss": 1.4076, + "step": 6785 + }, + { + "epoch": 1.1707064607953075, + "grad_norm": 0.6640625, + "learning_rate": 7.40348492311241e-06, + "loss": 1.5682, + "step": 6786 + }, + { + "epoch": 1.1708789786940395, + "grad_norm": 0.640625, + "learning_rate": 7.400856494362314e-06, + "loss": 1.4475, + "step": 6787 + }, + { + "epoch": 1.1710514965927714, + "grad_norm": 0.65234375, + "learning_rate": 7.398228258173006e-06, + "loss": 1.3377, + "step": 6788 + }, + { + "epoch": 1.1712240144915036, + "grad_norm": 0.56640625, + "learning_rate": 7.395600214739209e-06, + "loss": 1.4687, + "step": 6789 + }, + { + "epoch": 1.1713965323902356, + "grad_norm": 0.59765625, + "learning_rate": 7.392972364255615e-06, + "loss": 1.4127, + "step": 6790 + }, + { + "epoch": 1.1715690502889675, + "grad_norm": 0.578125, + "learning_rate": 7.390344706916923e-06, + "loss": 1.4401, + "step": 6791 + }, + { + "epoch": 1.1717415681876995, + "grad_norm": 0.62109375, + "learning_rate": 7.3877172429178e-06, + "loss": 1.4409, + "step": 6792 + }, + { + "epoch": 1.1719140860864314, + "grad_norm": 0.59375, + "learning_rate": 7.385089972452907e-06, + "loss": 1.3877, + "step": 6793 + }, + { + "epoch": 1.1720866039851634, + "grad_norm": 0.63671875, + "learning_rate": 7.382462895716882e-06, + "loss": 1.3832, + "step": 6794 + }, + { + "epoch": 1.1722591218838954, + "grad_norm": 0.62109375, + "learning_rate": 7.379836012904367e-06, + "loss": 1.3738, + "step": 6795 + }, + { + "epoch": 1.1724316397826275, + "grad_norm": 0.59765625, + "learning_rate": 7.377209324209968e-06, + "loss": 1.4374, + "step": 6796 + }, + { + "epoch": 1.1726041576813595, + "grad_norm": 0.58984375, + "learning_rate": 7.374582829828294e-06, + "loss": 1.3218, + "step": 6797 + }, + { + "epoch": 1.1727766755800915, + "grad_norm": 0.6171875, + "learning_rate": 7.3719565299539235e-06, + "loss": 1.457, + "step": 6798 + }, + { + "epoch": 1.1729491934788234, + "grad_norm": 0.62109375, + "learning_rate": 7.369330424781438e-06, + "loss": 1.4775, + "step": 6799 + }, + { + "epoch": 1.1731217113775554, + "grad_norm": 0.546875, + "learning_rate": 7.3667045145053875e-06, + "loss": 1.3732, + "step": 6800 + }, + { + "epoch": 1.1731217113775554, + "eval_loss": 1.4084804058074951, + "eval_runtime": 10.8179, + "eval_samples_per_second": 94.658, + "eval_steps_per_second": 23.665, + "step": 6800 + }, + { + "epoch": 1.1732942292762873, + "grad_norm": 0.61328125, + "learning_rate": 7.364078799320324e-06, + "loss": 1.3311, + "step": 6801 + }, + { + "epoch": 1.1734667471750195, + "grad_norm": 0.56640625, + "learning_rate": 7.3614532794207714e-06, + "loss": 1.3944, + "step": 6802 + }, + { + "epoch": 1.1736392650737515, + "grad_norm": 0.6796875, + "learning_rate": 7.358827955001244e-06, + "loss": 1.3655, + "step": 6803 + }, + { + "epoch": 1.1738117829724835, + "grad_norm": 0.5859375, + "learning_rate": 7.35620282625625e-06, + "loss": 1.3973, + "step": 6804 + }, + { + "epoch": 1.1739843008712154, + "grad_norm": 0.60546875, + "learning_rate": 7.353577893380266e-06, + "loss": 1.4846, + "step": 6805 + }, + { + "epoch": 1.1741568187699474, + "grad_norm": 0.95703125, + "learning_rate": 7.350953156567771e-06, + "loss": 1.422, + "step": 6806 + }, + { + "epoch": 1.1743293366686793, + "grad_norm": 0.640625, + "learning_rate": 7.348328616013213e-06, + "loss": 1.3805, + "step": 6807 + }, + { + "epoch": 1.1745018545674113, + "grad_norm": 0.59375, + "learning_rate": 7.345704271911043e-06, + "loss": 1.427, + "step": 6808 + }, + { + "epoch": 1.1746743724661433, + "grad_norm": 0.6015625, + "learning_rate": 7.343080124455684e-06, + "loss": 1.4001, + "step": 6809 + }, + { + "epoch": 1.1748468903648754, + "grad_norm": 0.6328125, + "learning_rate": 7.340456173841552e-06, + "loss": 1.468, + "step": 6810 + }, + { + "epoch": 1.1750194082636074, + "grad_norm": 0.5859375, + "learning_rate": 7.337832420263042e-06, + "loss": 1.4489, + "step": 6811 + }, + { + "epoch": 1.1751919261623394, + "grad_norm": 0.62109375, + "learning_rate": 7.3352088639145425e-06, + "loss": 1.4517, + "step": 6812 + }, + { + "epoch": 1.1753644440610713, + "grad_norm": 0.6640625, + "learning_rate": 7.332585504990419e-06, + "loss": 1.3657, + "step": 6813 + }, + { + "epoch": 1.1755369619598033, + "grad_norm": 0.59375, + "learning_rate": 7.329962343685031e-06, + "loss": 1.336, + "step": 6814 + }, + { + "epoch": 1.1757094798585352, + "grad_norm": 0.57421875, + "learning_rate": 7.327339380192712e-06, + "loss": 1.3534, + "step": 6815 + }, + { + "epoch": 1.1758819977572674, + "grad_norm": 0.61328125, + "learning_rate": 7.324716614707794e-06, + "loss": 1.5315, + "step": 6816 + }, + { + "epoch": 1.1760545156559994, + "grad_norm": 0.69921875, + "learning_rate": 7.322094047424584e-06, + "loss": 1.4136, + "step": 6817 + }, + { + "epoch": 1.1762270335547313, + "grad_norm": 0.5703125, + "learning_rate": 7.319471678537376e-06, + "loss": 1.4698, + "step": 6818 + }, + { + "epoch": 1.1763995514534633, + "grad_norm": 0.59765625, + "learning_rate": 7.31684950824046e-06, + "loss": 1.3532, + "step": 6819 + }, + { + "epoch": 1.1765720693521953, + "grad_norm": 0.6796875, + "learning_rate": 7.314227536728096e-06, + "loss": 1.4208, + "step": 6820 + }, + { + "epoch": 1.1767445872509272, + "grad_norm": 0.57421875, + "learning_rate": 7.311605764194538e-06, + "loss": 1.4014, + "step": 6821 + }, + { + "epoch": 1.1769171051496592, + "grad_norm": 0.546875, + "learning_rate": 7.308984190834019e-06, + "loss": 1.4367, + "step": 6822 + }, + { + "epoch": 1.1770896230483912, + "grad_norm": 0.62890625, + "learning_rate": 7.306362816840771e-06, + "loss": 1.4118, + "step": 6823 + }, + { + "epoch": 1.1772621409471233, + "grad_norm": 0.57421875, + "learning_rate": 7.3037416424089925e-06, + "loss": 1.4064, + "step": 6824 + }, + { + "epoch": 1.1774346588458553, + "grad_norm": 0.59375, + "learning_rate": 7.301120667732884e-06, + "loss": 1.4058, + "step": 6825 + }, + { + "epoch": 1.1776071767445873, + "grad_norm": 0.5859375, + "learning_rate": 7.2984998930066145e-06, + "loss": 1.5118, + "step": 6826 + }, + { + "epoch": 1.1777796946433192, + "grad_norm": 0.6171875, + "learning_rate": 7.295879318424356e-06, + "loss": 1.559, + "step": 6827 + }, + { + "epoch": 1.1779522125420512, + "grad_norm": 0.62109375, + "learning_rate": 7.2932589441802506e-06, + "loss": 1.4733, + "step": 6828 + }, + { + "epoch": 1.1781247304407831, + "grad_norm": 0.6328125, + "learning_rate": 7.290638770468439e-06, + "loss": 1.412, + "step": 6829 + }, + { + "epoch": 1.1782972483395153, + "grad_norm": 0.56640625, + "learning_rate": 7.288018797483034e-06, + "loss": 1.3695, + "step": 6830 + }, + { + "epoch": 1.1784697662382473, + "grad_norm": 0.98828125, + "learning_rate": 7.285399025418144e-06, + "loss": 1.4087, + "step": 6831 + }, + { + "epoch": 1.1786422841369792, + "grad_norm": 0.57421875, + "learning_rate": 7.282779454467851e-06, + "loss": 1.4296, + "step": 6832 + }, + { + "epoch": 1.1788148020357112, + "grad_norm": 0.640625, + "learning_rate": 7.280160084826239e-06, + "loss": 1.4193, + "step": 6833 + }, + { + "epoch": 1.1789873199344432, + "grad_norm": 0.65625, + "learning_rate": 7.2775409166873604e-06, + "loss": 1.4459, + "step": 6834 + }, + { + "epoch": 1.1791598378331751, + "grad_norm": 0.609375, + "learning_rate": 7.274921950245258e-06, + "loss": 1.4489, + "step": 6835 + }, + { + "epoch": 1.179332355731907, + "grad_norm": 0.578125, + "learning_rate": 7.27230318569397e-06, + "loss": 1.3256, + "step": 6836 + }, + { + "epoch": 1.1795048736306393, + "grad_norm": 0.53515625, + "learning_rate": 7.269684623227502e-06, + "loss": 1.3541, + "step": 6837 + }, + { + "epoch": 1.1796773915293712, + "grad_norm": 0.63671875, + "learning_rate": 7.267066263039862e-06, + "loss": 1.4544, + "step": 6838 + }, + { + "epoch": 1.1798499094281032, + "grad_norm": 0.55859375, + "learning_rate": 7.2644481053250215e-06, + "loss": 1.3666, + "step": 6839 + }, + { + "epoch": 1.1800224273268352, + "grad_norm": 0.625, + "learning_rate": 7.261830150276964e-06, + "loss": 1.3835, + "step": 6840 + }, + { + "epoch": 1.1801949452255671, + "grad_norm": 0.5703125, + "learning_rate": 7.259212398089636e-06, + "loss": 1.4254, + "step": 6841 + }, + { + "epoch": 1.180367463124299, + "grad_norm": 0.8359375, + "learning_rate": 7.25659484895698e-06, + "loss": 1.4535, + "step": 6842 + }, + { + "epoch": 1.1805399810230313, + "grad_norm": 0.5859375, + "learning_rate": 7.253977503072916e-06, + "loss": 1.3889, + "step": 6843 + }, + { + "epoch": 1.1807124989217632, + "grad_norm": 0.60546875, + "learning_rate": 7.251360360631359e-06, + "loss": 1.4511, + "step": 6844 + }, + { + "epoch": 1.1808850168204952, + "grad_norm": 0.66015625, + "learning_rate": 7.2487434218262e-06, + "loss": 1.4949, + "step": 6845 + }, + { + "epoch": 1.1810575347192271, + "grad_norm": 0.6171875, + "learning_rate": 7.24612668685132e-06, + "loss": 1.4106, + "step": 6846 + }, + { + "epoch": 1.181230052617959, + "grad_norm": 0.59375, + "learning_rate": 7.24351015590058e-06, + "loss": 1.4674, + "step": 6847 + }, + { + "epoch": 1.181402570516691, + "grad_norm": 0.58984375, + "learning_rate": 7.240893829167834e-06, + "loss": 1.4853, + "step": 6848 + }, + { + "epoch": 1.181575088415423, + "grad_norm": 0.59765625, + "learning_rate": 7.23827770684691e-06, + "loss": 1.4398, + "step": 6849 + }, + { + "epoch": 1.181747606314155, + "grad_norm": 0.6015625, + "learning_rate": 7.2356617891316275e-06, + "loss": 1.4522, + "step": 6850 + }, + { + "epoch": 1.1819201242128872, + "grad_norm": 0.66015625, + "learning_rate": 7.233046076215798e-06, + "loss": 1.5239, + "step": 6851 + }, + { + "epoch": 1.1820926421116191, + "grad_norm": 0.59375, + "learning_rate": 7.230430568293199e-06, + "loss": 1.5447, + "step": 6852 + }, + { + "epoch": 1.182265160010351, + "grad_norm": 0.6484375, + "learning_rate": 7.227815265557611e-06, + "loss": 1.4014, + "step": 6853 + }, + { + "epoch": 1.182437677909083, + "grad_norm": 0.65234375, + "learning_rate": 7.225200168202789e-06, + "loss": 1.4451, + "step": 6854 + }, + { + "epoch": 1.182610195807815, + "grad_norm": 0.58203125, + "learning_rate": 7.222585276422477e-06, + "loss": 1.4846, + "step": 6855 + }, + { + "epoch": 1.182782713706547, + "grad_norm": 0.59765625, + "learning_rate": 7.219970590410399e-06, + "loss": 1.4114, + "step": 6856 + }, + { + "epoch": 1.1829552316052792, + "grad_norm": 0.59765625, + "learning_rate": 7.217356110360275e-06, + "loss": 1.4652, + "step": 6857 + }, + { + "epoch": 1.1831277495040111, + "grad_norm": 0.6484375, + "learning_rate": 7.214741836465793e-06, + "loss": 1.4706, + "step": 6858 + }, + { + "epoch": 1.183300267402743, + "grad_norm": 0.62109375, + "learning_rate": 7.21212776892064e-06, + "loss": 1.4584, + "step": 6859 + }, + { + "epoch": 1.183472785301475, + "grad_norm": 0.59375, + "learning_rate": 7.209513907918479e-06, + "loss": 1.4025, + "step": 6860 + }, + { + "epoch": 1.183645303200207, + "grad_norm": 0.61328125, + "learning_rate": 7.206900253652969e-06, + "loss": 1.3999, + "step": 6861 + }, + { + "epoch": 1.183817821098939, + "grad_norm": 0.59765625, + "learning_rate": 7.204286806317736e-06, + "loss": 1.3659, + "step": 6862 + }, + { + "epoch": 1.183990338997671, + "grad_norm": 0.63671875, + "learning_rate": 7.2016735661064086e-06, + "loss": 1.4903, + "step": 6863 + }, + { + "epoch": 1.1841628568964029, + "grad_norm": 0.65234375, + "learning_rate": 7.1990605332125825e-06, + "loss": 1.4975, + "step": 6864 + }, + { + "epoch": 1.184335374795135, + "grad_norm": 0.6171875, + "learning_rate": 7.1964477078298574e-06, + "loss": 1.3395, + "step": 6865 + }, + { + "epoch": 1.184507892693867, + "grad_norm": 0.640625, + "learning_rate": 7.193835090151803e-06, + "loss": 1.3884, + "step": 6866 + }, + { + "epoch": 1.184680410592599, + "grad_norm": 0.6015625, + "learning_rate": 7.191222680371975e-06, + "loss": 1.3982, + "step": 6867 + }, + { + "epoch": 1.184852928491331, + "grad_norm": 0.58984375, + "learning_rate": 7.188610478683926e-06, + "loss": 1.3685, + "step": 6868 + }, + { + "epoch": 1.185025446390063, + "grad_norm": 0.5625, + "learning_rate": 7.1859984852811775e-06, + "loss": 1.4035, + "step": 6869 + }, + { + "epoch": 1.1851979642887949, + "grad_norm": 0.65234375, + "learning_rate": 7.183386700357245e-06, + "loss": 1.3266, + "step": 6870 + }, + { + "epoch": 1.185370482187527, + "grad_norm": 0.69140625, + "learning_rate": 7.1807751241056215e-06, + "loss": 1.4277, + "step": 6871 + }, + { + "epoch": 1.185543000086259, + "grad_norm": 0.57421875, + "learning_rate": 7.178163756719795e-06, + "loss": 1.3906, + "step": 6872 + }, + { + "epoch": 1.185715517984991, + "grad_norm": 0.6171875, + "learning_rate": 7.175552598393227e-06, + "loss": 1.4967, + "step": 6873 + }, + { + "epoch": 1.185888035883723, + "grad_norm": 0.65234375, + "learning_rate": 7.172941649319374e-06, + "loss": 1.3458, + "step": 6874 + }, + { + "epoch": 1.186060553782455, + "grad_norm": 0.59765625, + "learning_rate": 7.1703309096916625e-06, + "loss": 1.4214, + "step": 6875 + }, + { + "epoch": 1.1862330716811869, + "grad_norm": 0.55078125, + "learning_rate": 7.167720379703522e-06, + "loss": 1.3223, + "step": 6876 + }, + { + "epoch": 1.1864055895799188, + "grad_norm": 0.6015625, + "learning_rate": 7.165110059548353e-06, + "loss": 1.4065, + "step": 6877 + }, + { + "epoch": 1.186578107478651, + "grad_norm": 0.7109375, + "learning_rate": 7.162499949419543e-06, + "loss": 1.4116, + "step": 6878 + }, + { + "epoch": 1.186750625377383, + "grad_norm": 0.59375, + "learning_rate": 7.159890049510463e-06, + "loss": 1.431, + "step": 6879 + }, + { + "epoch": 1.186923143276115, + "grad_norm": 0.64453125, + "learning_rate": 7.157280360014478e-06, + "loss": 1.3818, + "step": 6880 + }, + { + "epoch": 1.1870956611748469, + "grad_norm": 0.5859375, + "learning_rate": 7.154670881124925e-06, + "loss": 1.4078, + "step": 6881 + }, + { + "epoch": 1.1872681790735788, + "grad_norm": 0.59375, + "learning_rate": 7.152061613035128e-06, + "loss": 1.3175, + "step": 6882 + }, + { + "epoch": 1.1874406969723108, + "grad_norm": 0.56640625, + "learning_rate": 7.149452555938407e-06, + "loss": 1.4258, + "step": 6883 + }, + { + "epoch": 1.187613214871043, + "grad_norm": 0.55859375, + "learning_rate": 7.146843710028049e-06, + "loss": 1.3981, + "step": 6884 + }, + { + "epoch": 1.187785732769775, + "grad_norm": 0.609375, + "learning_rate": 7.144235075497339e-06, + "loss": 1.4337, + "step": 6885 + }, + { + "epoch": 1.187958250668507, + "grad_norm": 0.56640625, + "learning_rate": 7.141626652539533e-06, + "loss": 1.4078, + "step": 6886 + }, + { + "epoch": 1.1881307685672389, + "grad_norm": 0.6015625, + "learning_rate": 7.139018441347889e-06, + "loss": 1.3799, + "step": 6887 + }, + { + "epoch": 1.1883032864659708, + "grad_norm": 0.7109375, + "learning_rate": 7.136410442115631e-06, + "loss": 1.4632, + "step": 6888 + }, + { + "epoch": 1.1884758043647028, + "grad_norm": 0.62890625, + "learning_rate": 7.133802655035984e-06, + "loss": 1.4898, + "step": 6889 + }, + { + "epoch": 1.1886483222634348, + "grad_norm": 0.5859375, + "learning_rate": 7.131195080302144e-06, + "loss": 1.4195, + "step": 6890 + }, + { + "epoch": 1.1888208401621667, + "grad_norm": 0.5625, + "learning_rate": 7.128587718107298e-06, + "loss": 1.4984, + "step": 6891 + }, + { + "epoch": 1.188993358060899, + "grad_norm": 0.63671875, + "learning_rate": 7.125980568644612e-06, + "loss": 1.4518, + "step": 6892 + }, + { + "epoch": 1.1891658759596309, + "grad_norm": 0.6328125, + "learning_rate": 7.123373632107247e-06, + "loss": 1.4977, + "step": 6893 + }, + { + "epoch": 1.1893383938583628, + "grad_norm": 0.6328125, + "learning_rate": 7.1207669086883366e-06, + "loss": 1.3991, + "step": 6894 + }, + { + "epoch": 1.1895109117570948, + "grad_norm": 0.53125, + "learning_rate": 7.118160398581004e-06, + "loss": 1.3576, + "step": 6895 + }, + { + "epoch": 1.1896834296558267, + "grad_norm": 0.60546875, + "learning_rate": 7.115554101978354e-06, + "loss": 1.4036, + "step": 6896 + }, + { + "epoch": 1.1898559475545587, + "grad_norm": 0.6953125, + "learning_rate": 7.112948019073481e-06, + "loss": 1.3978, + "step": 6897 + }, + { + "epoch": 1.1900284654532909, + "grad_norm": 0.63671875, + "learning_rate": 7.110342150059457e-06, + "loss": 1.4511, + "step": 6898 + }, + { + "epoch": 1.1902009833520228, + "grad_norm": 0.65625, + "learning_rate": 7.107736495129338e-06, + "loss": 1.4181, + "step": 6899 + }, + { + "epoch": 1.1903735012507548, + "grad_norm": 0.5625, + "learning_rate": 7.1051310544761775e-06, + "loss": 1.3462, + "step": 6900 + }, + { + "epoch": 1.1903735012507548, + "eval_loss": 1.408332109451294, + "eval_runtime": 10.8573, + "eval_samples_per_second": 94.315, + "eval_steps_per_second": 23.579, + "step": 6900 + }, + { + "epoch": 1.1905460191494868, + "grad_norm": 0.62109375, + "learning_rate": 7.102525828292993e-06, + "loss": 1.5048, + "step": 6901 + }, + { + "epoch": 1.1907185370482187, + "grad_norm": 0.62109375, + "learning_rate": 7.099920816772803e-06, + "loss": 1.4587, + "step": 6902 + }, + { + "epoch": 1.1908910549469507, + "grad_norm": 0.59765625, + "learning_rate": 7.097316020108594e-06, + "loss": 1.4227, + "step": 6903 + }, + { + "epoch": 1.1910635728456826, + "grad_norm": 0.5703125, + "learning_rate": 7.094711438493355e-06, + "loss": 1.349, + "step": 6904 + }, + { + "epoch": 1.1912360907444146, + "grad_norm": 0.5703125, + "learning_rate": 7.0921070721200445e-06, + "loss": 1.4222, + "step": 6905 + }, + { + "epoch": 1.1914086086431468, + "grad_norm": 0.89453125, + "learning_rate": 7.089502921181613e-06, + "loss": 1.3994, + "step": 6906 + }, + { + "epoch": 1.1915811265418788, + "grad_norm": 0.63671875, + "learning_rate": 7.086898985870987e-06, + "loss": 1.4528, + "step": 6907 + }, + { + "epoch": 1.1917536444406107, + "grad_norm": 0.5703125, + "learning_rate": 7.084295266381089e-06, + "loss": 1.4027, + "step": 6908 + }, + { + "epoch": 1.1919261623393427, + "grad_norm": 0.55859375, + "learning_rate": 7.081691762904814e-06, + "loss": 1.4115, + "step": 6909 + }, + { + "epoch": 1.1920986802380746, + "grad_norm": 0.5703125, + "learning_rate": 7.079088475635051e-06, + "loss": 1.4856, + "step": 6910 + }, + { + "epoch": 1.1922711981368066, + "grad_norm": 0.62109375, + "learning_rate": 7.076485404764659e-06, + "loss": 1.4102, + "step": 6911 + }, + { + "epoch": 1.1924437160355388, + "grad_norm": 0.6171875, + "learning_rate": 7.073882550486501e-06, + "loss": 1.3498, + "step": 6912 + }, + { + "epoch": 1.1926162339342707, + "grad_norm": 0.60546875, + "learning_rate": 7.071279912993403e-06, + "loss": 1.4475, + "step": 6913 + }, + { + "epoch": 1.1927887518330027, + "grad_norm": 0.578125, + "learning_rate": 7.068677492478191e-06, + "loss": 1.4666, + "step": 6914 + }, + { + "epoch": 1.1929612697317347, + "grad_norm": 0.57421875, + "learning_rate": 7.066075289133662e-06, + "loss": 1.4479, + "step": 6915 + }, + { + "epoch": 1.1931337876304666, + "grad_norm": 0.5859375, + "learning_rate": 7.063473303152608e-06, + "loss": 1.3826, + "step": 6916 + }, + { + "epoch": 1.1933063055291986, + "grad_norm": 0.76171875, + "learning_rate": 7.0608715347278045e-06, + "loss": 1.4623, + "step": 6917 + }, + { + "epoch": 1.1934788234279305, + "grad_norm": 0.6328125, + "learning_rate": 7.0582699840519996e-06, + "loss": 1.3564, + "step": 6918 + }, + { + "epoch": 1.1936513413266627, + "grad_norm": 0.640625, + "learning_rate": 7.055668651317937e-06, + "loss": 1.3768, + "step": 6919 + }, + { + "epoch": 1.1938238592253947, + "grad_norm": 0.6171875, + "learning_rate": 7.053067536718334e-06, + "loss": 1.2849, + "step": 6920 + }, + { + "epoch": 1.1939963771241267, + "grad_norm": 0.5859375, + "learning_rate": 7.050466640445906e-06, + "loss": 1.3967, + "step": 6921 + }, + { + "epoch": 1.1941688950228586, + "grad_norm": 0.55078125, + "learning_rate": 7.0478659626933345e-06, + "loss": 1.3141, + "step": 6922 + }, + { + "epoch": 1.1943414129215906, + "grad_norm": 0.56640625, + "learning_rate": 7.045265503653302e-06, + "loss": 1.4541, + "step": 6923 + }, + { + "epoch": 1.1945139308203225, + "grad_norm": 0.62890625, + "learning_rate": 7.042665263518458e-06, + "loss": 1.481, + "step": 6924 + }, + { + "epoch": 1.1946864487190547, + "grad_norm": 0.63671875, + "learning_rate": 7.040065242481455e-06, + "loss": 1.3615, + "step": 6925 + }, + { + "epoch": 1.1948589666177867, + "grad_norm": 0.59765625, + "learning_rate": 7.0374654407349095e-06, + "loss": 1.3896, + "step": 6926 + }, + { + "epoch": 1.1950314845165186, + "grad_norm": 0.62890625, + "learning_rate": 7.034865858471438e-06, + "loss": 1.4479, + "step": 6927 + }, + { + "epoch": 1.1952040024152506, + "grad_norm": 0.5859375, + "learning_rate": 7.032266495883627e-06, + "loss": 1.473, + "step": 6928 + }, + { + "epoch": 1.1953765203139826, + "grad_norm": 0.578125, + "learning_rate": 7.029667353164061e-06, + "loss": 1.4585, + "step": 6929 + }, + { + "epoch": 1.1955490382127145, + "grad_norm": 0.63671875, + "learning_rate": 7.027068430505295e-06, + "loss": 1.3895, + "step": 6930 + }, + { + "epoch": 1.1957215561114465, + "grad_norm": 0.60546875, + "learning_rate": 7.024469728099873e-06, + "loss": 1.4092, + "step": 6931 + }, + { + "epoch": 1.1958940740101784, + "grad_norm": 0.60546875, + "learning_rate": 7.02187124614033e-06, + "loss": 1.4369, + "step": 6932 + }, + { + "epoch": 1.1960665919089106, + "grad_norm": 0.63671875, + "learning_rate": 7.019272984819171e-06, + "loss": 1.4359, + "step": 6933 + }, + { + "epoch": 1.1962391098076426, + "grad_norm": 0.5625, + "learning_rate": 7.016674944328896e-06, + "loss": 1.3549, + "step": 6934 + }, + { + "epoch": 1.1964116277063745, + "grad_norm": 0.625, + "learning_rate": 7.014077124861978e-06, + "loss": 1.447, + "step": 6935 + }, + { + "epoch": 1.1965841456051065, + "grad_norm": 0.55078125, + "learning_rate": 7.011479526610887e-06, + "loss": 1.4539, + "step": 6936 + }, + { + "epoch": 1.1967566635038385, + "grad_norm": 0.84375, + "learning_rate": 7.0088821497680635e-06, + "loss": 1.348, + "step": 6937 + }, + { + "epoch": 1.1969291814025704, + "grad_norm": 0.5546875, + "learning_rate": 7.006284994525943e-06, + "loss": 1.3846, + "step": 6938 + }, + { + "epoch": 1.1971016993013026, + "grad_norm": 0.5625, + "learning_rate": 7.003688061076929e-06, + "loss": 1.404, + "step": 6939 + }, + { + "epoch": 1.1972742172000346, + "grad_norm": 0.67578125, + "learning_rate": 7.001091349613433e-06, + "loss": 1.4452, + "step": 6940 + }, + { + "epoch": 1.1974467350987665, + "grad_norm": 0.59765625, + "learning_rate": 6.9984948603278225e-06, + "loss": 1.4511, + "step": 6941 + }, + { + "epoch": 1.1976192529974985, + "grad_norm": 0.6953125, + "learning_rate": 6.995898593412471e-06, + "loss": 1.4435, + "step": 6942 + }, + { + "epoch": 1.1977917708962305, + "grad_norm": 0.59375, + "learning_rate": 6.993302549059717e-06, + "loss": 1.425, + "step": 6943 + }, + { + "epoch": 1.1979642887949624, + "grad_norm": 0.671875, + "learning_rate": 6.9907067274619025e-06, + "loss": 1.427, + "step": 6944 + }, + { + "epoch": 1.1981368066936944, + "grad_norm": 0.70703125, + "learning_rate": 6.988111128811334e-06, + "loss": 1.4501, + "step": 6945 + }, + { + "epoch": 1.1983093245924266, + "grad_norm": 0.8046875, + "learning_rate": 6.985515753300314e-06, + "loss": 1.4704, + "step": 6946 + }, + { + "epoch": 1.1984818424911585, + "grad_norm": 0.625, + "learning_rate": 6.982920601121117e-06, + "loss": 1.3816, + "step": 6947 + }, + { + "epoch": 1.1986543603898905, + "grad_norm": 0.61328125, + "learning_rate": 6.980325672466015e-06, + "loss": 1.4104, + "step": 6948 + }, + { + "epoch": 1.1988268782886224, + "grad_norm": 0.59375, + "learning_rate": 6.977730967527259e-06, + "loss": 1.4798, + "step": 6949 + }, + { + "epoch": 1.1989993961873544, + "grad_norm": 0.62890625, + "learning_rate": 6.975136486497074e-06, + "loss": 1.4175, + "step": 6950 + }, + { + "epoch": 1.1991719140860864, + "grad_norm": 0.69140625, + "learning_rate": 6.972542229567682e-06, + "loss": 1.3979, + "step": 6951 + }, + { + "epoch": 1.1993444319848185, + "grad_norm": 0.625, + "learning_rate": 6.969948196931272e-06, + "loss": 1.3222, + "step": 6952 + }, + { + "epoch": 1.1995169498835505, + "grad_norm": 0.59375, + "learning_rate": 6.967354388780037e-06, + "loss": 1.3931, + "step": 6953 + }, + { + "epoch": 1.1996894677822825, + "grad_norm": 0.57421875, + "learning_rate": 6.964760805306137e-06, + "loss": 1.4464, + "step": 6954 + }, + { + "epoch": 1.1998619856810144, + "grad_norm": 0.609375, + "learning_rate": 6.962167446701722e-06, + "loss": 1.4952, + "step": 6955 + }, + { + "epoch": 1.2000345035797464, + "grad_norm": 0.65625, + "learning_rate": 6.95957431315892e-06, + "loss": 1.4631, + "step": 6956 + }, + { + "epoch": 1.2002070214784784, + "grad_norm": 0.6484375, + "learning_rate": 6.956981404869855e-06, + "loss": 1.5089, + "step": 6957 + }, + { + "epoch": 1.2003795393772103, + "grad_norm": 0.62109375, + "learning_rate": 6.954388722026618e-06, + "loss": 1.4603, + "step": 6958 + }, + { + "epoch": 1.2005520572759423, + "grad_norm": 0.56640625, + "learning_rate": 6.951796264821298e-06, + "loss": 1.4433, + "step": 6959 + }, + { + "epoch": 1.2007245751746745, + "grad_norm": 0.59375, + "learning_rate": 6.949204033445951e-06, + "loss": 1.4816, + "step": 6960 + }, + { + "epoch": 1.2008970930734064, + "grad_norm": 0.58203125, + "learning_rate": 6.946612028092636e-06, + "loss": 1.4718, + "step": 6961 + }, + { + "epoch": 1.2010696109721384, + "grad_norm": 0.59765625, + "learning_rate": 6.944020248953379e-06, + "loss": 1.4309, + "step": 6962 + }, + { + "epoch": 1.2012421288708703, + "grad_norm": 0.6953125, + "learning_rate": 6.941428696220195e-06, + "loss": 1.5134, + "step": 6963 + }, + { + "epoch": 1.2014146467696023, + "grad_norm": 0.58984375, + "learning_rate": 6.938837370085087e-06, + "loss": 1.3666, + "step": 6964 + }, + { + "epoch": 1.2015871646683343, + "grad_norm": 0.6484375, + "learning_rate": 6.9362462707400325e-06, + "loss": 1.4257, + "step": 6965 + }, + { + "epoch": 1.2017596825670664, + "grad_norm": 0.73828125, + "learning_rate": 6.9336553983769995e-06, + "loss": 1.43, + "step": 6966 + }, + { + "epoch": 1.2019322004657984, + "grad_norm": 0.62109375, + "learning_rate": 6.931064753187929e-06, + "loss": 1.4498, + "step": 6967 + }, + { + "epoch": 1.2021047183645304, + "grad_norm": 0.6328125, + "learning_rate": 6.928474335364761e-06, + "loss": 1.3838, + "step": 6968 + }, + { + "epoch": 1.2022772362632623, + "grad_norm": 0.5859375, + "learning_rate": 6.925884145099405e-06, + "loss": 1.3933, + "step": 6969 + }, + { + "epoch": 1.2024497541619943, + "grad_norm": 0.59765625, + "learning_rate": 6.923294182583762e-06, + "loss": 1.4558, + "step": 6970 + }, + { + "epoch": 1.2026222720607262, + "grad_norm": 0.64453125, + "learning_rate": 6.920704448009705e-06, + "loss": 1.4645, + "step": 6971 + }, + { + "epoch": 1.2027947899594582, + "grad_norm": 0.5625, + "learning_rate": 6.918114941569108e-06, + "loss": 1.4875, + "step": 6972 + }, + { + "epoch": 1.2029673078581902, + "grad_norm": 0.58984375, + "learning_rate": 6.915525663453808e-06, + "loss": 1.4333, + "step": 6973 + }, + { + "epoch": 1.2031398257569224, + "grad_norm": 0.6171875, + "learning_rate": 6.912936613855643e-06, + "loss": 1.4254, + "step": 6974 + }, + { + "epoch": 1.2033123436556543, + "grad_norm": 0.7265625, + "learning_rate": 6.910347792966418e-06, + "loss": 1.4324, + "step": 6975 + }, + { + "epoch": 1.2034848615543863, + "grad_norm": 0.65625, + "learning_rate": 6.907759200977939e-06, + "loss": 1.5757, + "step": 6976 + }, + { + "epoch": 1.2036573794531182, + "grad_norm": 0.57421875, + "learning_rate": 6.905170838081976e-06, + "loss": 1.3948, + "step": 6977 + }, + { + "epoch": 1.2038298973518502, + "grad_norm": 0.640625, + "learning_rate": 6.902582704470298e-06, + "loss": 1.5119, + "step": 6978 + }, + { + "epoch": 1.2040024152505822, + "grad_norm": 0.5546875, + "learning_rate": 6.899994800334644e-06, + "loss": 1.4551, + "step": 6979 + }, + { + "epoch": 1.2041749331493143, + "grad_norm": 0.609375, + "learning_rate": 6.897407125866743e-06, + "loss": 1.364, + "step": 6980 + }, + { + "epoch": 1.2043474510480463, + "grad_norm": 0.57421875, + "learning_rate": 6.894819681258312e-06, + "loss": 1.2945, + "step": 6981 + }, + { + "epoch": 1.2045199689467783, + "grad_norm": 0.62109375, + "learning_rate": 6.89223246670104e-06, + "loss": 1.428, + "step": 6982 + }, + { + "epoch": 1.2046924868455102, + "grad_norm": 0.5703125, + "learning_rate": 6.889645482386607e-06, + "loss": 1.4469, + "step": 6983 + }, + { + "epoch": 1.2048650047442422, + "grad_norm": 0.6640625, + "learning_rate": 6.887058728506666e-06, + "loss": 1.3184, + "step": 6984 + }, + { + "epoch": 1.2050375226429741, + "grad_norm": 0.625, + "learning_rate": 6.8844722052528704e-06, + "loss": 1.4146, + "step": 6985 + }, + { + "epoch": 1.205210040541706, + "grad_norm": 0.6171875, + "learning_rate": 6.881885912816837e-06, + "loss": 1.4958, + "step": 6986 + }, + { + "epoch": 1.2053825584404383, + "grad_norm": 0.625, + "learning_rate": 6.87929985139018e-06, + "loss": 1.3745, + "step": 6987 + }, + { + "epoch": 1.2055550763391703, + "grad_norm": 0.5859375, + "learning_rate": 6.876714021164486e-06, + "loss": 1.3671, + "step": 6988 + }, + { + "epoch": 1.2057275942379022, + "grad_norm": 0.65625, + "learning_rate": 6.874128422331336e-06, + "loss": 1.4298, + "step": 6989 + }, + { + "epoch": 1.2059001121366342, + "grad_norm": 0.6328125, + "learning_rate": 6.871543055082283e-06, + "loss": 1.4688, + "step": 6990 + }, + { + "epoch": 1.2060726300353661, + "grad_norm": 0.66015625, + "learning_rate": 6.8689579196088694e-06, + "loss": 1.3904, + "step": 6991 + }, + { + "epoch": 1.206245147934098, + "grad_norm": 0.5859375, + "learning_rate": 6.8663730161026125e-06, + "loss": 1.4428, + "step": 6992 + }, + { + "epoch": 1.2064176658328303, + "grad_norm": 0.65625, + "learning_rate": 6.863788344755026e-06, + "loss": 1.4108, + "step": 6993 + }, + { + "epoch": 1.2065901837315622, + "grad_norm": 0.609375, + "learning_rate": 6.861203905757593e-06, + "loss": 1.3966, + "step": 6994 + }, + { + "epoch": 1.2067627016302942, + "grad_norm": 0.58984375, + "learning_rate": 6.858619699301785e-06, + "loss": 1.4039, + "step": 6995 + }, + { + "epoch": 1.2069352195290262, + "grad_norm": 0.6015625, + "learning_rate": 6.856035725579062e-06, + "loss": 1.5806, + "step": 6996 + }, + { + "epoch": 1.2071077374277581, + "grad_norm": 0.70703125, + "learning_rate": 6.853451984780854e-06, + "loss": 1.4333, + "step": 6997 + }, + { + "epoch": 1.20728025532649, + "grad_norm": 0.5546875, + "learning_rate": 6.850868477098587e-06, + "loss": 1.3247, + "step": 6998 + }, + { + "epoch": 1.207452773225222, + "grad_norm": 0.578125, + "learning_rate": 6.848285202723655e-06, + "loss": 1.4973, + "step": 6999 + }, + { + "epoch": 1.207625291123954, + "grad_norm": 0.5625, + "learning_rate": 6.8457021618474514e-06, + "loss": 1.4564, + "step": 7000 + }, + { + "epoch": 1.207625291123954, + "eval_loss": 1.4081740379333496, + "eval_runtime": 11.0377, + "eval_samples_per_second": 92.773, + "eval_steps_per_second": 23.193, + "step": 7000 + }, + { + "epoch": 1.2077978090226862, + "grad_norm": 0.63671875, + "learning_rate": 6.843119354661341e-06, + "loss": 1.4374, + "step": 7001 + }, + { + "epoch": 1.2079703269214181, + "grad_norm": 0.62890625, + "learning_rate": 6.840536781356674e-06, + "loss": 1.3482, + "step": 7002 + }, + { + "epoch": 1.20814284482015, + "grad_norm": 0.6796875, + "learning_rate": 6.837954442124779e-06, + "loss": 1.5448, + "step": 7003 + }, + { + "epoch": 1.208315362718882, + "grad_norm": 0.6875, + "learning_rate": 6.835372337156981e-06, + "loss": 1.5004, + "step": 7004 + }, + { + "epoch": 1.208487880617614, + "grad_norm": 0.78125, + "learning_rate": 6.83279046664457e-06, + "loss": 1.4206, + "step": 7005 + }, + { + "epoch": 1.208660398516346, + "grad_norm": 0.59375, + "learning_rate": 6.830208830778837e-06, + "loss": 1.4897, + "step": 7006 + }, + { + "epoch": 1.2088329164150782, + "grad_norm": 0.609375, + "learning_rate": 6.8276274297510336e-06, + "loss": 1.4595, + "step": 7007 + }, + { + "epoch": 1.2090054343138101, + "grad_norm": 0.55859375, + "learning_rate": 6.825046263752415e-06, + "loss": 1.4146, + "step": 7008 + }, + { + "epoch": 1.209177952212542, + "grad_norm": 0.609375, + "learning_rate": 6.822465332974204e-06, + "loss": 1.386, + "step": 7009 + }, + { + "epoch": 1.209350470111274, + "grad_norm": 0.5625, + "learning_rate": 6.819884637607619e-06, + "loss": 1.3649, + "step": 7010 + }, + { + "epoch": 1.209522988010006, + "grad_norm": 0.66015625, + "learning_rate": 6.817304177843848e-06, + "loss": 1.4369, + "step": 7011 + }, + { + "epoch": 1.209695505908738, + "grad_norm": 0.55078125, + "learning_rate": 6.8147239538740695e-06, + "loss": 1.3845, + "step": 7012 + }, + { + "epoch": 1.20986802380747, + "grad_norm": 0.60546875, + "learning_rate": 6.812143965889446e-06, + "loss": 1.4777, + "step": 7013 + }, + { + "epoch": 1.210040541706202, + "grad_norm": 0.66015625, + "learning_rate": 6.809564214081114e-06, + "loss": 1.5328, + "step": 7014 + }, + { + "epoch": 1.210213059604934, + "grad_norm": 0.59765625, + "learning_rate": 6.806984698640202e-06, + "loss": 1.4277, + "step": 7015 + }, + { + "epoch": 1.210385577503666, + "grad_norm": 0.578125, + "learning_rate": 6.8044054197578115e-06, + "loss": 1.4582, + "step": 7016 + }, + { + "epoch": 1.210558095402398, + "grad_norm": 0.59375, + "learning_rate": 6.801826377625036e-06, + "loss": 1.4857, + "step": 7017 + }, + { + "epoch": 1.21073061330113, + "grad_norm": 0.67578125, + "learning_rate": 6.799247572432945e-06, + "loss": 1.5168, + "step": 7018 + }, + { + "epoch": 1.210903131199862, + "grad_norm": 0.6171875, + "learning_rate": 6.796669004372596e-06, + "loss": 1.4525, + "step": 7019 + }, + { + "epoch": 1.2110756490985939, + "grad_norm": 0.609375, + "learning_rate": 6.794090673635017e-06, + "loss": 1.4734, + "step": 7020 + }, + { + "epoch": 1.211248166997326, + "grad_norm": 0.5859375, + "learning_rate": 6.791512580411237e-06, + "loss": 1.3496, + "step": 7021 + }, + { + "epoch": 1.211420684896058, + "grad_norm": 0.625, + "learning_rate": 6.788934724892251e-06, + "loss": 1.4423, + "step": 7022 + }, + { + "epoch": 1.21159320279479, + "grad_norm": 0.62890625, + "learning_rate": 6.786357107269045e-06, + "loss": 1.3485, + "step": 7023 + }, + { + "epoch": 1.211765720693522, + "grad_norm": 0.76171875, + "learning_rate": 6.78377972773258e-06, + "loss": 1.4124, + "step": 7024 + }, + { + "epoch": 1.211938238592254, + "grad_norm": 0.5859375, + "learning_rate": 6.781202586473814e-06, + "loss": 1.4283, + "step": 7025 + }, + { + "epoch": 1.2121107564909859, + "grad_norm": 0.56640625, + "learning_rate": 6.778625683683671e-06, + "loss": 1.4206, + "step": 7026 + }, + { + "epoch": 1.2122832743897178, + "grad_norm": 0.703125, + "learning_rate": 6.776049019553062e-06, + "loss": 1.6036, + "step": 7027 + }, + { + "epoch": 1.21245579228845, + "grad_norm": 0.62109375, + "learning_rate": 6.773472594272889e-06, + "loss": 1.4514, + "step": 7028 + }, + { + "epoch": 1.212628310187182, + "grad_norm": 0.71484375, + "learning_rate": 6.7708964080340265e-06, + "loss": 1.4119, + "step": 7029 + }, + { + "epoch": 1.212800828085914, + "grad_norm": 0.640625, + "learning_rate": 6.768320461027336e-06, + "loss": 1.4975, + "step": 7030 + }, + { + "epoch": 1.212973345984646, + "grad_norm": 0.5859375, + "learning_rate": 6.765744753443654e-06, + "loss": 1.3481, + "step": 7031 + }, + { + "epoch": 1.2131458638833779, + "grad_norm": 0.66796875, + "learning_rate": 6.763169285473813e-06, + "loss": 1.4414, + "step": 7032 + }, + { + "epoch": 1.2133183817821098, + "grad_norm": 0.57421875, + "learning_rate": 6.760594057308614e-06, + "loss": 1.3994, + "step": 7033 + }, + { + "epoch": 1.213490899680842, + "grad_norm": 0.80078125, + "learning_rate": 6.758019069138851e-06, + "loss": 1.3516, + "step": 7034 + }, + { + "epoch": 1.213663417579574, + "grad_norm": 0.60546875, + "learning_rate": 6.7554443211552864e-06, + "loss": 1.4604, + "step": 7035 + }, + { + "epoch": 1.213835935478306, + "grad_norm": 0.58984375, + "learning_rate": 6.752869813548684e-06, + "loss": 1.4742, + "step": 7036 + }, + { + "epoch": 1.2140084533770379, + "grad_norm": 0.625, + "learning_rate": 6.750295546509771e-06, + "loss": 1.4507, + "step": 7037 + }, + { + "epoch": 1.2141809712757698, + "grad_norm": 0.69140625, + "learning_rate": 6.747721520229273e-06, + "loss": 1.4602, + "step": 7038 + }, + { + "epoch": 1.2143534891745018, + "grad_norm": 0.65625, + "learning_rate": 6.7451477348978835e-06, + "loss": 1.3104, + "step": 7039 + }, + { + "epoch": 1.2145260070732338, + "grad_norm": 0.58203125, + "learning_rate": 6.74257419070629e-06, + "loss": 1.416, + "step": 7040 + }, + { + "epoch": 1.2146985249719657, + "grad_norm": 0.5859375, + "learning_rate": 6.740000887845149e-06, + "loss": 1.4529, + "step": 7041 + }, + { + "epoch": 1.214871042870698, + "grad_norm": 0.890625, + "learning_rate": 6.737427826505116e-06, + "loss": 1.3673, + "step": 7042 + }, + { + "epoch": 1.2150435607694299, + "grad_norm": 0.578125, + "learning_rate": 6.734855006876814e-06, + "loss": 1.4521, + "step": 7043 + }, + { + "epoch": 1.2152160786681618, + "grad_norm": 0.5703125, + "learning_rate": 6.732282429150852e-06, + "loss": 1.3779, + "step": 7044 + }, + { + "epoch": 1.2153885965668938, + "grad_norm": 0.5703125, + "learning_rate": 6.729710093517829e-06, + "loss": 1.4222, + "step": 7045 + }, + { + "epoch": 1.2155611144656258, + "grad_norm": 0.84765625, + "learning_rate": 6.727138000168314e-06, + "loss": 1.4788, + "step": 7046 + }, + { + "epoch": 1.2157336323643577, + "grad_norm": 0.69140625, + "learning_rate": 6.7245661492928674e-06, + "loss": 1.4366, + "step": 7047 + }, + { + "epoch": 1.21590615026309, + "grad_norm": 0.65234375, + "learning_rate": 6.721994541082022e-06, + "loss": 1.4501, + "step": 7048 + }, + { + "epoch": 1.2160786681618219, + "grad_norm": 0.7734375, + "learning_rate": 6.719423175726308e-06, + "loss": 1.3713, + "step": 7049 + }, + { + "epoch": 1.2162511860605538, + "grad_norm": 0.578125, + "learning_rate": 6.716852053416221e-06, + "loss": 1.47, + "step": 7050 + }, + { + "epoch": 1.2164237039592858, + "grad_norm": 0.59765625, + "learning_rate": 6.7142811743422495e-06, + "loss": 1.4462, + "step": 7051 + }, + { + "epoch": 1.2165962218580177, + "grad_norm": 0.6171875, + "learning_rate": 6.711710538694855e-06, + "loss": 1.4658, + "step": 7052 + }, + { + "epoch": 1.2167687397567497, + "grad_norm": 0.578125, + "learning_rate": 6.709140146664494e-06, + "loss": 1.4135, + "step": 7053 + }, + { + "epoch": 1.2169412576554817, + "grad_norm": 0.578125, + "learning_rate": 6.706569998441591e-06, + "loss": 1.4711, + "step": 7054 + }, + { + "epoch": 1.2171137755542136, + "grad_norm": 0.5859375, + "learning_rate": 6.704000094216563e-06, + "loss": 1.4631, + "step": 7055 + }, + { + "epoch": 1.2172862934529458, + "grad_norm": 0.625, + "learning_rate": 6.701430434179799e-06, + "loss": 1.452, + "step": 7056 + }, + { + "epoch": 1.2174588113516778, + "grad_norm": 0.58984375, + "learning_rate": 6.698861018521681e-06, + "loss": 1.5188, + "step": 7057 + }, + { + "epoch": 1.2176313292504097, + "grad_norm": 0.57421875, + "learning_rate": 6.696291847432565e-06, + "loss": 1.4292, + "step": 7058 + }, + { + "epoch": 1.2178038471491417, + "grad_norm": 0.62890625, + "learning_rate": 6.693722921102788e-06, + "loss": 1.4083, + "step": 7059 + }, + { + "epoch": 1.2179763650478737, + "grad_norm": 0.61328125, + "learning_rate": 6.691154239722681e-06, + "loss": 1.4582, + "step": 7060 + }, + { + "epoch": 1.2181488829466056, + "grad_norm": 0.5703125, + "learning_rate": 6.688585803482539e-06, + "loss": 1.53, + "step": 7061 + }, + { + "epoch": 1.2183214008453378, + "grad_norm": 0.640625, + "learning_rate": 6.686017612572655e-06, + "loss": 1.432, + "step": 7062 + }, + { + "epoch": 1.2184939187440698, + "grad_norm": 0.60546875, + "learning_rate": 6.683449667183288e-06, + "loss": 1.3916, + "step": 7063 + }, + { + "epoch": 1.2186664366428017, + "grad_norm": 0.609375, + "learning_rate": 6.680881967504698e-06, + "loss": 1.4147, + "step": 7064 + }, + { + "epoch": 1.2188389545415337, + "grad_norm": 0.62109375, + "learning_rate": 6.678314513727105e-06, + "loss": 1.4874, + "step": 7065 + }, + { + "epoch": 1.2190114724402656, + "grad_norm": 0.6015625, + "learning_rate": 6.675747306040732e-06, + "loss": 1.4433, + "step": 7066 + }, + { + "epoch": 1.2191839903389976, + "grad_norm": 0.64453125, + "learning_rate": 6.673180344635767e-06, + "loss": 1.4913, + "step": 7067 + }, + { + "epoch": 1.2193565082377296, + "grad_norm": 0.6015625, + "learning_rate": 6.670613629702391e-06, + "loss": 1.4379, + "step": 7068 + }, + { + "epoch": 1.2195290261364617, + "grad_norm": 0.6015625, + "learning_rate": 6.6680471614307576e-06, + "loss": 1.4167, + "step": 7069 + }, + { + "epoch": 1.2197015440351937, + "grad_norm": 0.63671875, + "learning_rate": 6.665480940011012e-06, + "loss": 1.5605, + "step": 7070 + }, + { + "epoch": 1.2198740619339257, + "grad_norm": 0.609375, + "learning_rate": 6.662914965633272e-06, + "loss": 1.4221, + "step": 7071 + }, + { + "epoch": 1.2200465798326576, + "grad_norm": 0.6328125, + "learning_rate": 6.660349238487644e-06, + "loss": 1.5069, + "step": 7072 + }, + { + "epoch": 1.2202190977313896, + "grad_norm": 0.578125, + "learning_rate": 6.657783758764208e-06, + "loss": 1.4822, + "step": 7073 + }, + { + "epoch": 1.2203916156301216, + "grad_norm": 0.66015625, + "learning_rate": 6.655218526653038e-06, + "loss": 1.3407, + "step": 7074 + }, + { + "epoch": 1.2205641335288537, + "grad_norm": 0.6796875, + "learning_rate": 6.6526535423441775e-06, + "loss": 1.4482, + "step": 7075 + }, + { + "epoch": 1.2207366514275857, + "grad_norm": 0.578125, + "learning_rate": 6.650088806027655e-06, + "loss": 1.3931, + "step": 7076 + }, + { + "epoch": 1.2209091693263177, + "grad_norm": 0.62109375, + "learning_rate": 6.647524317893489e-06, + "loss": 1.4404, + "step": 7077 + }, + { + "epoch": 1.2210816872250496, + "grad_norm": 0.57421875, + "learning_rate": 6.644960078131667e-06, + "loss": 1.3433, + "step": 7078 + }, + { + "epoch": 1.2212542051237816, + "grad_norm": 0.59765625, + "learning_rate": 6.642396086932168e-06, + "loss": 1.4951, + "step": 7079 + }, + { + "epoch": 1.2214267230225135, + "grad_norm": 0.56640625, + "learning_rate": 6.639832344484942e-06, + "loss": 1.4083, + "step": 7080 + }, + { + "epoch": 1.2215992409212455, + "grad_norm": 0.59765625, + "learning_rate": 6.637268850979934e-06, + "loss": 1.3391, + "step": 7081 + }, + { + "epoch": 1.2217717588199775, + "grad_norm": 0.609375, + "learning_rate": 6.6347056066070605e-06, + "loss": 1.5557, + "step": 7082 + }, + { + "epoch": 1.2219442767187096, + "grad_norm": 0.66015625, + "learning_rate": 6.632142611556225e-06, + "loss": 1.4448, + "step": 7083 + }, + { + "epoch": 1.2221167946174416, + "grad_norm": 0.671875, + "learning_rate": 6.629579866017303e-06, + "loss": 1.4612, + "step": 7084 + }, + { + "epoch": 1.2222893125161736, + "grad_norm": 0.65625, + "learning_rate": 6.6270173701801685e-06, + "loss": 1.4145, + "step": 7085 + }, + { + "epoch": 1.2224618304149055, + "grad_norm": 0.58984375, + "learning_rate": 6.62445512423466e-06, + "loss": 1.4511, + "step": 7086 + }, + { + "epoch": 1.2226343483136375, + "grad_norm": 0.6015625, + "learning_rate": 6.621893128370609e-06, + "loss": 1.3963, + "step": 7087 + }, + { + "epoch": 1.2228068662123694, + "grad_norm": 0.5859375, + "learning_rate": 6.61933138277782e-06, + "loss": 1.4633, + "step": 7088 + }, + { + "epoch": 1.2229793841111016, + "grad_norm": 0.609375, + "learning_rate": 6.616769887646088e-06, + "loss": 1.4498, + "step": 7089 + }, + { + "epoch": 1.2231519020098336, + "grad_norm": 0.58984375, + "learning_rate": 6.614208643165181e-06, + "loss": 1.3724, + "step": 7090 + }, + { + "epoch": 1.2233244199085656, + "grad_norm": 0.578125, + "learning_rate": 6.61164764952485e-06, + "loss": 1.4823, + "step": 7091 + }, + { + "epoch": 1.2234969378072975, + "grad_norm": 0.703125, + "learning_rate": 6.609086906914839e-06, + "loss": 1.3061, + "step": 7092 + }, + { + "epoch": 1.2236694557060295, + "grad_norm": 0.671875, + "learning_rate": 6.606526415524852e-06, + "loss": 1.4084, + "step": 7093 + }, + { + "epoch": 1.2238419736047614, + "grad_norm": 0.58203125, + "learning_rate": 6.603966175544595e-06, + "loss": 1.5003, + "step": 7094 + }, + { + "epoch": 1.2240144915034934, + "grad_norm": 0.60546875, + "learning_rate": 6.601406187163741e-06, + "loss": 1.4785, + "step": 7095 + }, + { + "epoch": 1.2241870094022254, + "grad_norm": 0.6015625, + "learning_rate": 6.598846450571956e-06, + "loss": 1.3297, + "step": 7096 + }, + { + "epoch": 1.2243595273009575, + "grad_norm": 0.578125, + "learning_rate": 6.596286965958872e-06, + "loss": 1.4479, + "step": 7097 + }, + { + "epoch": 1.2245320451996895, + "grad_norm": 0.60546875, + "learning_rate": 6.593727733514119e-06, + "loss": 1.4002, + "step": 7098 + }, + { + "epoch": 1.2247045630984215, + "grad_norm": 0.5703125, + "learning_rate": 6.5911687534272995e-06, + "loss": 1.4138, + "step": 7099 + }, + { + "epoch": 1.2248770809971534, + "grad_norm": 0.6015625, + "learning_rate": 6.588610025887999e-06, + "loss": 1.3873, + "step": 7100 + }, + { + "epoch": 1.2248770809971534, + "eval_loss": 1.408066987991333, + "eval_runtime": 10.9497, + "eval_samples_per_second": 93.518, + "eval_steps_per_second": 23.38, + "step": 7100 + }, + { + "epoch": 1.2250495988958854, + "grad_norm": 0.65625, + "learning_rate": 6.5860515510857795e-06, + "loss": 1.4459, + "step": 7101 + }, + { + "epoch": 1.2252221167946176, + "grad_norm": 0.8515625, + "learning_rate": 6.583493329210197e-06, + "loss": 1.3754, + "step": 7102 + }, + { + "epoch": 1.2253946346933495, + "grad_norm": 0.578125, + "learning_rate": 6.5809353604507735e-06, + "loss": 1.4302, + "step": 7103 + }, + { + "epoch": 1.2255671525920815, + "grad_norm": 0.59375, + "learning_rate": 6.578377644997022e-06, + "loss": 1.4781, + "step": 7104 + }, + { + "epoch": 1.2257396704908134, + "grad_norm": 0.58984375, + "learning_rate": 6.575820183038433e-06, + "loss": 1.4126, + "step": 7105 + }, + { + "epoch": 1.2259121883895454, + "grad_norm": 0.70703125, + "learning_rate": 6.573262974764483e-06, + "loss": 1.3968, + "step": 7106 + }, + { + "epoch": 1.2260847062882774, + "grad_norm": 0.61328125, + "learning_rate": 6.570706020364619e-06, + "loss": 1.4346, + "step": 7107 + }, + { + "epoch": 1.2262572241870093, + "grad_norm": 0.58984375, + "learning_rate": 6.568149320028281e-06, + "loss": 1.4194, + "step": 7108 + }, + { + "epoch": 1.2264297420857413, + "grad_norm": 0.6171875, + "learning_rate": 6.5655928739448874e-06, + "loss": 1.465, + "step": 7109 + }, + { + "epoch": 1.2266022599844735, + "grad_norm": 0.68359375, + "learning_rate": 6.56303668230383e-06, + "loss": 1.3937, + "step": 7110 + }, + { + "epoch": 1.2267747778832054, + "grad_norm": 0.59375, + "learning_rate": 6.560480745294493e-06, + "loss": 1.4778, + "step": 7111 + }, + { + "epoch": 1.2269472957819374, + "grad_norm": 0.59375, + "learning_rate": 6.557925063106229e-06, + "loss": 1.3817, + "step": 7112 + }, + { + "epoch": 1.2271198136806694, + "grad_norm": 0.6015625, + "learning_rate": 6.5553696359283905e-06, + "loss": 1.4118, + "step": 7113 + }, + { + "epoch": 1.2272923315794013, + "grad_norm": 0.625, + "learning_rate": 6.552814463950288e-06, + "loss": 1.3556, + "step": 7114 + }, + { + "epoch": 1.2274648494781333, + "grad_norm": 0.6171875, + "learning_rate": 6.5502595473612304e-06, + "loss": 1.5199, + "step": 7115 + }, + { + "epoch": 1.2276373673768655, + "grad_norm": 0.62890625, + "learning_rate": 6.5477048863504965e-06, + "loss": 1.5357, + "step": 7116 + }, + { + "epoch": 1.2278098852755974, + "grad_norm": 0.578125, + "learning_rate": 6.5451504811073604e-06, + "loss": 1.3962, + "step": 7117 + }, + { + "epoch": 1.2279824031743294, + "grad_norm": 0.7109375, + "learning_rate": 6.542596331821061e-06, + "loss": 1.4034, + "step": 7118 + }, + { + "epoch": 1.2281549210730613, + "grad_norm": 0.57421875, + "learning_rate": 6.540042438680832e-06, + "loss": 1.3123, + "step": 7119 + }, + { + "epoch": 1.2283274389717933, + "grad_norm": 0.56640625, + "learning_rate": 6.537488801875872e-06, + "loss": 1.3925, + "step": 7120 + }, + { + "epoch": 1.2284999568705253, + "grad_norm": 0.55078125, + "learning_rate": 6.534935421595381e-06, + "loss": 1.3863, + "step": 7121 + }, + { + "epoch": 1.2286724747692572, + "grad_norm": 0.5625, + "learning_rate": 6.5323822980285224e-06, + "loss": 1.4513, + "step": 7122 + }, + { + "epoch": 1.2288449926679892, + "grad_norm": 0.6796875, + "learning_rate": 6.5298294313644515e-06, + "loss": 1.4978, + "step": 7123 + }, + { + "epoch": 1.2290175105667214, + "grad_norm": 0.60546875, + "learning_rate": 6.527276821792297e-06, + "loss": 1.3822, + "step": 7124 + }, + { + "epoch": 1.2291900284654533, + "grad_norm": 0.67578125, + "learning_rate": 6.524724469501172e-06, + "loss": 1.363, + "step": 7125 + }, + { + "epoch": 1.2293625463641853, + "grad_norm": 0.6015625, + "learning_rate": 6.522172374680177e-06, + "loss": 1.4761, + "step": 7126 + }, + { + "epoch": 1.2295350642629173, + "grad_norm": 0.5546875, + "learning_rate": 6.5196205375183804e-06, + "loss": 1.419, + "step": 7127 + }, + { + "epoch": 1.2297075821616492, + "grad_norm": 0.58203125, + "learning_rate": 6.517068958204844e-06, + "loss": 1.3345, + "step": 7128 + }, + { + "epoch": 1.2298801000603812, + "grad_norm": 0.671875, + "learning_rate": 6.514517636928598e-06, + "loss": 1.4148, + "step": 7129 + }, + { + "epoch": 1.2300526179591134, + "grad_norm": 0.62890625, + "learning_rate": 6.511966573878667e-06, + "loss": 1.4199, + "step": 7130 + }, + { + "epoch": 1.2302251358578453, + "grad_norm": 0.5625, + "learning_rate": 6.509415769244044e-06, + "loss": 1.3635, + "step": 7131 + }, + { + "epoch": 1.2303976537565773, + "grad_norm": 0.64453125, + "learning_rate": 6.506865223213714e-06, + "loss": 1.3639, + "step": 7132 + }, + { + "epoch": 1.2305701716553092, + "grad_norm": 0.578125, + "learning_rate": 6.50431493597663e-06, + "loss": 1.359, + "step": 7133 + }, + { + "epoch": 1.2307426895540412, + "grad_norm": 0.59375, + "learning_rate": 6.5017649077217415e-06, + "loss": 1.4499, + "step": 7134 + }, + { + "epoch": 1.2309152074527732, + "grad_norm": 0.57421875, + "learning_rate": 6.499215138637964e-06, + "loss": 1.4122, + "step": 7135 + }, + { + "epoch": 1.2310877253515051, + "grad_norm": 0.58984375, + "learning_rate": 6.496665628914205e-06, + "loss": 1.5291, + "step": 7136 + }, + { + "epoch": 1.2312602432502373, + "grad_norm": 0.5625, + "learning_rate": 6.494116378739344e-06, + "loss": 1.3532, + "step": 7137 + }, + { + "epoch": 1.2314327611489693, + "grad_norm": 0.609375, + "learning_rate": 6.491567388302249e-06, + "loss": 1.4024, + "step": 7138 + }, + { + "epoch": 1.2316052790477012, + "grad_norm": 0.57421875, + "learning_rate": 6.489018657791763e-06, + "loss": 1.3638, + "step": 7139 + }, + { + "epoch": 1.2317777969464332, + "grad_norm": 3.328125, + "learning_rate": 6.48647018739671e-06, + "loss": 1.5342, + "step": 7140 + }, + { + "epoch": 1.2319503148451652, + "grad_norm": 0.62890625, + "learning_rate": 6.483921977305903e-06, + "loss": 1.4922, + "step": 7141 + }, + { + "epoch": 1.2321228327438971, + "grad_norm": 0.66015625, + "learning_rate": 6.481374027708123e-06, + "loss": 1.4724, + "step": 7142 + }, + { + "epoch": 1.2322953506426293, + "grad_norm": 0.62109375, + "learning_rate": 6.478826338792144e-06, + "loss": 1.4799, + "step": 7143 + }, + { + "epoch": 1.2324678685413613, + "grad_norm": 0.58984375, + "learning_rate": 6.476278910746705e-06, + "loss": 1.4143, + "step": 7144 + }, + { + "epoch": 1.2326403864400932, + "grad_norm": 0.65234375, + "learning_rate": 6.4737317437605475e-06, + "loss": 1.3686, + "step": 7145 + }, + { + "epoch": 1.2328129043388252, + "grad_norm": 0.62109375, + "learning_rate": 6.471184838022372e-06, + "loss": 1.406, + "step": 7146 + }, + { + "epoch": 1.2329854222375571, + "grad_norm": 0.70703125, + "learning_rate": 6.468638193720875e-06, + "loss": 1.4347, + "step": 7147 + }, + { + "epoch": 1.233157940136289, + "grad_norm": 0.57421875, + "learning_rate": 6.4660918110447215e-06, + "loss": 1.3784, + "step": 7148 + }, + { + "epoch": 1.233330458035021, + "grad_norm": 0.63671875, + "learning_rate": 6.463545690182573e-06, + "loss": 1.4489, + "step": 7149 + }, + { + "epoch": 1.233502975933753, + "grad_norm": 0.609375, + "learning_rate": 6.460999831323054e-06, + "loss": 1.3989, + "step": 7150 + }, + { + "epoch": 1.2336754938324852, + "grad_norm": 0.61328125, + "learning_rate": 6.458454234654781e-06, + "loss": 1.4118, + "step": 7151 + }, + { + "epoch": 1.2338480117312172, + "grad_norm": 0.62890625, + "learning_rate": 6.4559089003663434e-06, + "loss": 1.4362, + "step": 7152 + }, + { + "epoch": 1.2340205296299491, + "grad_norm": 0.609375, + "learning_rate": 6.453363828646323e-06, + "loss": 1.4336, + "step": 7153 + }, + { + "epoch": 1.234193047528681, + "grad_norm": 0.60546875, + "learning_rate": 6.450819019683267e-06, + "loss": 1.501, + "step": 7154 + }, + { + "epoch": 1.234365565427413, + "grad_norm": 0.55078125, + "learning_rate": 6.448274473665718e-06, + "loss": 1.3423, + "step": 7155 + }, + { + "epoch": 1.234538083326145, + "grad_norm": 0.58984375, + "learning_rate": 6.445730190782187e-06, + "loss": 1.4127, + "step": 7156 + }, + { + "epoch": 1.2347106012248772, + "grad_norm": 0.62890625, + "learning_rate": 6.443186171221167e-06, + "loss": 1.4934, + "step": 7157 + }, + { + "epoch": 1.2348831191236092, + "grad_norm": 0.5625, + "learning_rate": 6.4406424151711456e-06, + "loss": 1.4181, + "step": 7158 + }, + { + "epoch": 1.2350556370223411, + "grad_norm": 0.61328125, + "learning_rate": 6.438098922820573e-06, + "loss": 1.44, + "step": 7159 + }, + { + "epoch": 1.235228154921073, + "grad_norm": 0.61328125, + "learning_rate": 6.435555694357888e-06, + "loss": 1.3861, + "step": 7160 + }, + { + "epoch": 1.235400672819805, + "grad_norm": 0.65625, + "learning_rate": 6.433012729971506e-06, + "loss": 1.4417, + "step": 7161 + }, + { + "epoch": 1.235573190718537, + "grad_norm": 0.6171875, + "learning_rate": 6.430470029849832e-06, + "loss": 1.5808, + "step": 7162 + }, + { + "epoch": 1.235745708617269, + "grad_norm": 0.59375, + "learning_rate": 6.4279275941812395e-06, + "loss": 1.3912, + "step": 7163 + }, + { + "epoch": 1.235918226516001, + "grad_norm": 0.59375, + "learning_rate": 6.425385423154091e-06, + "loss": 1.3512, + "step": 7164 + }, + { + "epoch": 1.236090744414733, + "grad_norm": 0.6328125, + "learning_rate": 6.422843516956724e-06, + "loss": 1.3951, + "step": 7165 + }, + { + "epoch": 1.236263262313465, + "grad_norm": 0.5625, + "learning_rate": 6.420301875777464e-06, + "loss": 1.4184, + "step": 7166 + }, + { + "epoch": 1.236435780212197, + "grad_norm": 0.59765625, + "learning_rate": 6.417760499804604e-06, + "loss": 1.4245, + "step": 7167 + }, + { + "epoch": 1.236608298110929, + "grad_norm": 0.6171875, + "learning_rate": 6.415219389226432e-06, + "loss": 1.4479, + "step": 7168 + }, + { + "epoch": 1.236780816009661, + "grad_norm": 0.578125, + "learning_rate": 6.412678544231203e-06, + "loss": 1.4777, + "step": 7169 + }, + { + "epoch": 1.236953333908393, + "grad_norm": 0.58984375, + "learning_rate": 6.410137965007166e-06, + "loss": 1.3913, + "step": 7170 + }, + { + "epoch": 1.237125851807125, + "grad_norm": 0.578125, + "learning_rate": 6.4075976517425365e-06, + "loss": 1.4419, + "step": 7171 + }, + { + "epoch": 1.237298369705857, + "grad_norm": 0.58203125, + "learning_rate": 6.4050576046255176e-06, + "loss": 1.4678, + "step": 7172 + }, + { + "epoch": 1.237470887604589, + "grad_norm": 0.58203125, + "learning_rate": 6.402517823844299e-06, + "loss": 1.4013, + "step": 7173 + }, + { + "epoch": 1.237643405503321, + "grad_norm": 0.57421875, + "learning_rate": 6.399978309587034e-06, + "loss": 1.4265, + "step": 7174 + }, + { + "epoch": 1.237815923402053, + "grad_norm": 0.6171875, + "learning_rate": 6.397439062041873e-06, + "loss": 1.3889, + "step": 7175 + }, + { + "epoch": 1.237988441300785, + "grad_norm": 0.5859375, + "learning_rate": 6.394900081396931e-06, + "loss": 1.4773, + "step": 7176 + }, + { + "epoch": 1.2381609591995169, + "grad_norm": 0.5703125, + "learning_rate": 6.392361367840322e-06, + "loss": 1.4759, + "step": 7177 + }, + { + "epoch": 1.238333477098249, + "grad_norm": 0.58203125, + "learning_rate": 6.3898229215601215e-06, + "loss": 1.3627, + "step": 7178 + }, + { + "epoch": 1.238505994996981, + "grad_norm": 0.5625, + "learning_rate": 6.3872847427443985e-06, + "loss": 1.5092, + "step": 7179 + }, + { + "epoch": 1.238678512895713, + "grad_norm": 0.65234375, + "learning_rate": 6.384746831581191e-06, + "loss": 1.4332, + "step": 7180 + }, + { + "epoch": 1.238851030794445, + "grad_norm": 0.6015625, + "learning_rate": 6.382209188258533e-06, + "loss": 1.3706, + "step": 7181 + }, + { + "epoch": 1.2390235486931769, + "grad_norm": 0.5859375, + "learning_rate": 6.379671812964416e-06, + "loss": 1.4427, + "step": 7182 + }, + { + "epoch": 1.2391960665919088, + "grad_norm": 0.5859375, + "learning_rate": 6.3771347058868404e-06, + "loss": 1.4604, + "step": 7183 + }, + { + "epoch": 1.239368584490641, + "grad_norm": 0.59765625, + "learning_rate": 6.374597867213756e-06, + "loss": 1.4729, + "step": 7184 + }, + { + "epoch": 1.239541102389373, + "grad_norm": 0.59375, + "learning_rate": 6.372061297133119e-06, + "loss": 1.3559, + "step": 7185 + }, + { + "epoch": 1.239713620288105, + "grad_norm": 0.55859375, + "learning_rate": 6.369524995832844e-06, + "loss": 1.4233, + "step": 7186 + }, + { + "epoch": 1.239886138186837, + "grad_norm": 0.66796875, + "learning_rate": 6.366988963500846e-06, + "loss": 1.452, + "step": 7187 + }, + { + "epoch": 1.2400586560855689, + "grad_norm": 0.57421875, + "learning_rate": 6.364453200325005e-06, + "loss": 1.4131, + "step": 7188 + }, + { + "epoch": 1.2402311739843008, + "grad_norm": 0.5625, + "learning_rate": 6.361917706493184e-06, + "loss": 1.4352, + "step": 7189 + }, + { + "epoch": 1.2404036918830328, + "grad_norm": 0.58203125, + "learning_rate": 6.359382482193234e-06, + "loss": 1.4955, + "step": 7190 + }, + { + "epoch": 1.2405762097817647, + "grad_norm": 0.546875, + "learning_rate": 6.356847527612976e-06, + "loss": 1.3062, + "step": 7191 + }, + { + "epoch": 1.240748727680497, + "grad_norm": 0.57421875, + "learning_rate": 6.354312842940219e-06, + "loss": 1.4652, + "step": 7192 + }, + { + "epoch": 1.240921245579229, + "grad_norm": 0.58984375, + "learning_rate": 6.351778428362742e-06, + "loss": 1.4002, + "step": 7193 + }, + { + "epoch": 1.2410937634779609, + "grad_norm": 0.6015625, + "learning_rate": 6.349244284068318e-06, + "loss": 1.3301, + "step": 7194 + }, + { + "epoch": 1.2412662813766928, + "grad_norm": 0.59375, + "learning_rate": 6.346710410244685e-06, + "loss": 1.4002, + "step": 7195 + }, + { + "epoch": 1.2414387992754248, + "grad_norm": 0.62109375, + "learning_rate": 6.344176807079576e-06, + "loss": 1.3778, + "step": 7196 + }, + { + "epoch": 1.2416113171741567, + "grad_norm": 0.58984375, + "learning_rate": 6.341643474760686e-06, + "loss": 1.4565, + "step": 7197 + }, + { + "epoch": 1.241783835072889, + "grad_norm": 0.609375, + "learning_rate": 6.339110413475711e-06, + "loss": 1.3695, + "step": 7198 + }, + { + "epoch": 1.2419563529716209, + "grad_norm": 0.59765625, + "learning_rate": 6.336577623412308e-06, + "loss": 1.2846, + "step": 7199 + }, + { + "epoch": 1.2421288708703528, + "grad_norm": 0.5703125, + "learning_rate": 6.3340451047581275e-06, + "loss": 1.4077, + "step": 7200 + }, + { + "epoch": 1.2421288708703528, + "eval_loss": 1.4079400300979614, + "eval_runtime": 10.842, + "eval_samples_per_second": 94.447, + "eval_steps_per_second": 23.612, + "step": 7200 + }, + { + "epoch": 1.2423013887690848, + "grad_norm": 0.62890625, + "learning_rate": 6.3315128577007874e-06, + "loss": 1.3817, + "step": 7201 + }, + { + "epoch": 1.2424739066678168, + "grad_norm": 0.60546875, + "learning_rate": 6.3289808824279e-06, + "loss": 1.3751, + "step": 7202 + }, + { + "epoch": 1.2426464245665487, + "grad_norm": 0.55078125, + "learning_rate": 6.3264491791270455e-06, + "loss": 1.3145, + "step": 7203 + }, + { + "epoch": 1.2428189424652807, + "grad_norm": 0.578125, + "learning_rate": 6.323917747985786e-06, + "loss": 1.4272, + "step": 7204 + }, + { + "epoch": 1.2429914603640126, + "grad_norm": 0.59765625, + "learning_rate": 6.321386589191674e-06, + "loss": 1.3329, + "step": 7205 + }, + { + "epoch": 1.2431639782627448, + "grad_norm": 0.56640625, + "learning_rate": 6.318855702932225e-06, + "loss": 1.4386, + "step": 7206 + }, + { + "epoch": 1.2433364961614768, + "grad_norm": 0.58984375, + "learning_rate": 6.31632508939495e-06, + "loss": 1.5471, + "step": 7207 + }, + { + "epoch": 1.2435090140602088, + "grad_norm": 0.55859375, + "learning_rate": 6.313794748767324e-06, + "loss": 1.3682, + "step": 7208 + }, + { + "epoch": 1.2436815319589407, + "grad_norm": 0.6328125, + "learning_rate": 6.31126468123682e-06, + "loss": 1.3712, + "step": 7209 + }, + { + "epoch": 1.2438540498576727, + "grad_norm": 0.8203125, + "learning_rate": 6.308734886990875e-06, + "loss": 1.3938, + "step": 7210 + }, + { + "epoch": 1.2440265677564046, + "grad_norm": 0.61328125, + "learning_rate": 6.306205366216915e-06, + "loss": 1.4611, + "step": 7211 + }, + { + "epoch": 1.2441990856551368, + "grad_norm": 0.68359375, + "learning_rate": 6.3036761191023374e-06, + "loss": 1.4358, + "step": 7212 + }, + { + "epoch": 1.2443716035538688, + "grad_norm": 0.6171875, + "learning_rate": 6.301147145834534e-06, + "loss": 1.4512, + "step": 7213 + }, + { + "epoch": 1.2445441214526007, + "grad_norm": 0.68359375, + "learning_rate": 6.298618446600856e-06, + "loss": 1.4666, + "step": 7214 + }, + { + "epoch": 1.2447166393513327, + "grad_norm": 0.64453125, + "learning_rate": 6.2960900215886556e-06, + "loss": 1.3919, + "step": 7215 + }, + { + "epoch": 1.2448891572500647, + "grad_norm": 0.58984375, + "learning_rate": 6.293561870985248e-06, + "loss": 1.3503, + "step": 7216 + }, + { + "epoch": 1.2450616751487966, + "grad_norm": 0.64453125, + "learning_rate": 6.291033994977935e-06, + "loss": 1.4322, + "step": 7217 + }, + { + "epoch": 1.2452341930475286, + "grad_norm": 0.5703125, + "learning_rate": 6.288506393753997e-06, + "loss": 1.3567, + "step": 7218 + }, + { + "epoch": 1.2454067109462608, + "grad_norm": 0.5859375, + "learning_rate": 6.285979067500699e-06, + "loss": 1.4231, + "step": 7219 + }, + { + "epoch": 1.2455792288449927, + "grad_norm": 0.5546875, + "learning_rate": 6.283452016405276e-06, + "loss": 1.3525, + "step": 7220 + }, + { + "epoch": 1.2457517467437247, + "grad_norm": 0.5703125, + "learning_rate": 6.280925240654948e-06, + "loss": 1.4268, + "step": 7221 + }, + { + "epoch": 1.2459242646424566, + "grad_norm": 0.58984375, + "learning_rate": 6.27839874043692e-06, + "loss": 1.438, + "step": 7222 + }, + { + "epoch": 1.2460967825411886, + "grad_norm": 0.58984375, + "learning_rate": 6.275872515938365e-06, + "loss": 1.4832, + "step": 7223 + }, + { + "epoch": 1.2462693004399206, + "grad_norm": 0.58203125, + "learning_rate": 6.2733465673464456e-06, + "loss": 1.49, + "step": 7224 + }, + { + "epoch": 1.2464418183386528, + "grad_norm": 0.6015625, + "learning_rate": 6.270820894848293e-06, + "loss": 1.3996, + "step": 7225 + }, + { + "epoch": 1.2466143362373847, + "grad_norm": 1.046875, + "learning_rate": 6.268295498631034e-06, + "loss": 1.3661, + "step": 7226 + }, + { + "epoch": 1.2467868541361167, + "grad_norm": 0.578125, + "learning_rate": 6.265770378881759e-06, + "loss": 1.3727, + "step": 7227 + }, + { + "epoch": 1.2469593720348486, + "grad_norm": 0.68359375, + "learning_rate": 6.263245535787548e-06, + "loss": 1.4667, + "step": 7228 + }, + { + "epoch": 1.2471318899335806, + "grad_norm": 0.640625, + "learning_rate": 6.260720969535453e-06, + "loss": 1.439, + "step": 7229 + }, + { + "epoch": 1.2473044078323126, + "grad_norm": 0.57421875, + "learning_rate": 6.258196680312517e-06, + "loss": 1.341, + "step": 7230 + }, + { + "epoch": 1.2474769257310445, + "grad_norm": 0.69921875, + "learning_rate": 6.255672668305748e-06, + "loss": 1.4412, + "step": 7231 + }, + { + "epoch": 1.2476494436297765, + "grad_norm": 0.58203125, + "learning_rate": 6.253148933702147e-06, + "loss": 1.3463, + "step": 7232 + }, + { + "epoch": 1.2478219615285087, + "grad_norm": 0.58203125, + "learning_rate": 6.250625476688679e-06, + "loss": 1.3639, + "step": 7233 + }, + { + "epoch": 1.2479944794272406, + "grad_norm": 0.59375, + "learning_rate": 6.248102297452307e-06, + "loss": 1.4123, + "step": 7234 + }, + { + "epoch": 1.2481669973259726, + "grad_norm": 0.59375, + "learning_rate": 6.245579396179957e-06, + "loss": 1.5242, + "step": 7235 + }, + { + "epoch": 1.2483395152247045, + "grad_norm": 0.57421875, + "learning_rate": 6.243056773058542e-06, + "loss": 1.541, + "step": 7236 + }, + { + "epoch": 1.2485120331234365, + "grad_norm": 0.73828125, + "learning_rate": 6.240534428274961e-06, + "loss": 1.5029, + "step": 7237 + }, + { + "epoch": 1.2486845510221685, + "grad_norm": 0.61328125, + "learning_rate": 6.238012362016077e-06, + "loss": 1.4879, + "step": 7238 + }, + { + "epoch": 1.2488570689209006, + "grad_norm": 0.61328125, + "learning_rate": 6.235490574468745e-06, + "loss": 1.4711, + "step": 7239 + }, + { + "epoch": 1.2490295868196326, + "grad_norm": 0.609375, + "learning_rate": 6.232969065819791e-06, + "loss": 1.4517, + "step": 7240 + }, + { + "epoch": 1.2492021047183646, + "grad_norm": 0.62890625, + "learning_rate": 6.230447836256028e-06, + "loss": 1.4584, + "step": 7241 + }, + { + "epoch": 1.2493746226170965, + "grad_norm": 0.58203125, + "learning_rate": 6.2279268859642396e-06, + "loss": 1.4252, + "step": 7242 + }, + { + "epoch": 1.2495471405158285, + "grad_norm": 0.62890625, + "learning_rate": 6.2254062151312e-06, + "loss": 1.3844, + "step": 7243 + }, + { + "epoch": 1.2497196584145605, + "grad_norm": 0.58984375, + "learning_rate": 6.222885823943651e-06, + "loss": 1.4895, + "step": 7244 + }, + { + "epoch": 1.2498921763132924, + "grad_norm": 0.5859375, + "learning_rate": 6.220365712588322e-06, + "loss": 1.4183, + "step": 7245 + }, + { + "epoch": 1.2500646942120244, + "grad_norm": 0.59765625, + "learning_rate": 6.217845881251913e-06, + "loss": 1.3925, + "step": 7246 + }, + { + "epoch": 1.2502372121107566, + "grad_norm": 0.57421875, + "learning_rate": 6.215326330121119e-06, + "loss": 1.4546, + "step": 7247 + }, + { + "epoch": 1.2504097300094885, + "grad_norm": 0.56640625, + "learning_rate": 6.212807059382595e-06, + "loss": 1.3991, + "step": 7248 + }, + { + "epoch": 1.2505822479082205, + "grad_norm": 0.6171875, + "learning_rate": 6.210288069222989e-06, + "loss": 1.4733, + "step": 7249 + }, + { + "epoch": 1.2507547658069524, + "grad_norm": 0.8671875, + "learning_rate": 6.207769359828919e-06, + "loss": 1.4307, + "step": 7250 + }, + { + "epoch": 1.2509272837056844, + "grad_norm": 0.55078125, + "learning_rate": 6.205250931386992e-06, + "loss": 1.4177, + "step": 7251 + }, + { + "epoch": 1.2510998016044166, + "grad_norm": 0.6015625, + "learning_rate": 6.202732784083787e-06, + "loss": 1.3977, + "step": 7252 + }, + { + "epoch": 1.2512723195031485, + "grad_norm": 0.5625, + "learning_rate": 6.20021491810586e-06, + "loss": 1.4586, + "step": 7253 + }, + { + "epoch": 1.2514448374018805, + "grad_norm": 0.6015625, + "learning_rate": 6.197697333639759e-06, + "loss": 1.5194, + "step": 7254 + }, + { + "epoch": 1.2516173553006125, + "grad_norm": 0.79296875, + "learning_rate": 6.195180030871995e-06, + "loss": 1.3996, + "step": 7255 + }, + { + "epoch": 1.2517898731993444, + "grad_norm": 0.58984375, + "learning_rate": 6.19266300998907e-06, + "loss": 1.3985, + "step": 7256 + }, + { + "epoch": 1.2519623910980764, + "grad_norm": 0.6484375, + "learning_rate": 6.190146271177455e-06, + "loss": 1.3829, + "step": 7257 + }, + { + "epoch": 1.2521349089968083, + "grad_norm": 0.60546875, + "learning_rate": 6.187629814623613e-06, + "loss": 1.4638, + "step": 7258 + }, + { + "epoch": 1.2523074268955403, + "grad_norm": 0.64453125, + "learning_rate": 6.1851136405139735e-06, + "loss": 1.4106, + "step": 7259 + }, + { + "epoch": 1.2524799447942723, + "grad_norm": 0.56640625, + "learning_rate": 6.182597749034953e-06, + "loss": 1.3867, + "step": 7260 + }, + { + "epoch": 1.2526524626930045, + "grad_norm": 0.6875, + "learning_rate": 6.1800821403729405e-06, + "loss": 1.3823, + "step": 7261 + }, + { + "epoch": 1.2528249805917364, + "grad_norm": 0.63671875, + "learning_rate": 6.177566814714316e-06, + "loss": 1.4487, + "step": 7262 + }, + { + "epoch": 1.2529974984904684, + "grad_norm": 0.6328125, + "learning_rate": 6.175051772245421e-06, + "loss": 1.4645, + "step": 7263 + }, + { + "epoch": 1.2531700163892003, + "grad_norm": 0.6015625, + "learning_rate": 6.172537013152593e-06, + "loss": 1.4401, + "step": 7264 + }, + { + "epoch": 1.2533425342879323, + "grad_norm": 0.5859375, + "learning_rate": 6.170022537622135e-06, + "loss": 1.5324, + "step": 7265 + }, + { + "epoch": 1.2535150521866645, + "grad_norm": 0.62890625, + "learning_rate": 6.1675083458403405e-06, + "loss": 1.3808, + "step": 7266 + }, + { + "epoch": 1.2536875700853964, + "grad_norm": 0.5859375, + "learning_rate": 6.164994437993474e-06, + "loss": 1.4153, + "step": 7267 + }, + { + "epoch": 1.2538600879841284, + "grad_norm": 0.6328125, + "learning_rate": 6.162480814267779e-06, + "loss": 1.492, + "step": 7268 + }, + { + "epoch": 1.2540326058828604, + "grad_norm": 0.625, + "learning_rate": 6.1599674748494875e-06, + "loss": 1.3307, + "step": 7269 + }, + { + "epoch": 1.2542051237815923, + "grad_norm": 0.6015625, + "learning_rate": 6.1574544199247955e-06, + "loss": 1.4724, + "step": 7270 + }, + { + "epoch": 1.2543776416803243, + "grad_norm": 0.61328125, + "learning_rate": 6.154941649679894e-06, + "loss": 1.4853, + "step": 7271 + }, + { + "epoch": 1.2545501595790562, + "grad_norm": 0.58203125, + "learning_rate": 6.152429164300935e-06, + "loss": 1.3948, + "step": 7272 + }, + { + "epoch": 1.2547226774777882, + "grad_norm": 0.59765625, + "learning_rate": 6.149916963974068e-06, + "loss": 1.4597, + "step": 7273 + }, + { + "epoch": 1.2548951953765204, + "grad_norm": 0.5703125, + "learning_rate": 6.1474050488854055e-06, + "loss": 1.3844, + "step": 7274 + }, + { + "epoch": 1.2550677132752524, + "grad_norm": 0.57421875, + "learning_rate": 6.144893419221052e-06, + "loss": 1.4039, + "step": 7275 + }, + { + "epoch": 1.2552402311739843, + "grad_norm": 0.5703125, + "learning_rate": 6.142382075167082e-06, + "loss": 1.4968, + "step": 7276 + }, + { + "epoch": 1.2554127490727163, + "grad_norm": 0.58984375, + "learning_rate": 6.139871016909551e-06, + "loss": 1.5144, + "step": 7277 + }, + { + "epoch": 1.2555852669714482, + "grad_norm": 0.5703125, + "learning_rate": 6.1373602446344904e-06, + "loss": 1.404, + "step": 7278 + }, + { + "epoch": 1.2557577848701804, + "grad_norm": 0.5859375, + "learning_rate": 6.134849758527923e-06, + "loss": 1.4736, + "step": 7279 + }, + { + "epoch": 1.2559303027689124, + "grad_norm": 0.6015625, + "learning_rate": 6.1323395587758325e-06, + "loss": 1.3478, + "step": 7280 + }, + { + "epoch": 1.2561028206676443, + "grad_norm": 0.5390625, + "learning_rate": 6.129829645564197e-06, + "loss": 1.3936, + "step": 7281 + }, + { + "epoch": 1.2562753385663763, + "grad_norm": 0.62890625, + "learning_rate": 6.127320019078959e-06, + "loss": 1.4741, + "step": 7282 + }, + { + "epoch": 1.2564478564651083, + "grad_norm": 0.625, + "learning_rate": 6.124810679506056e-06, + "loss": 1.4305, + "step": 7283 + }, + { + "epoch": 1.2566203743638402, + "grad_norm": 0.578125, + "learning_rate": 6.122301627031388e-06, + "loss": 1.5184, + "step": 7284 + }, + { + "epoch": 1.2567928922625722, + "grad_norm": 0.625, + "learning_rate": 6.119792861840843e-06, + "loss": 1.4044, + "step": 7285 + }, + { + "epoch": 1.2569654101613041, + "grad_norm": 0.63671875, + "learning_rate": 6.117284384120292e-06, + "loss": 1.422, + "step": 7286 + }, + { + "epoch": 1.257137928060036, + "grad_norm": 0.6015625, + "learning_rate": 6.1147761940555714e-06, + "loss": 1.4325, + "step": 7287 + }, + { + "epoch": 1.2573104459587683, + "grad_norm": 0.57421875, + "learning_rate": 6.112268291832509e-06, + "loss": 1.387, + "step": 7288 + }, + { + "epoch": 1.2574829638575002, + "grad_norm": 0.6171875, + "learning_rate": 6.1097606776369e-06, + "loss": 1.4262, + "step": 7289 + }, + { + "epoch": 1.2576554817562322, + "grad_norm": 0.57421875, + "learning_rate": 6.107253351654529e-06, + "loss": 1.3348, + "step": 7290 + }, + { + "epoch": 1.2578279996549642, + "grad_norm": 0.62109375, + "learning_rate": 6.104746314071153e-06, + "loss": 1.3996, + "step": 7291 + }, + { + "epoch": 1.2580005175536961, + "grad_norm": 0.59375, + "learning_rate": 6.1022395650725095e-06, + "loss": 1.3755, + "step": 7292 + }, + { + "epoch": 1.2581730354524283, + "grad_norm": 0.65234375, + "learning_rate": 6.09973310484431e-06, + "loss": 1.4054, + "step": 7293 + }, + { + "epoch": 1.2583455533511603, + "grad_norm": 0.69921875, + "learning_rate": 6.0972269335722555e-06, + "loss": 1.4913, + "step": 7294 + }, + { + "epoch": 1.2585180712498922, + "grad_norm": 0.62109375, + "learning_rate": 6.094721051442013e-06, + "loss": 1.5199, + "step": 7295 + }, + { + "epoch": 1.2586905891486242, + "grad_norm": 0.6640625, + "learning_rate": 6.092215458639239e-06, + "loss": 1.4474, + "step": 7296 + }, + { + "epoch": 1.2588631070473562, + "grad_norm": 0.65234375, + "learning_rate": 6.089710155349558e-06, + "loss": 1.3976, + "step": 7297 + }, + { + "epoch": 1.2590356249460881, + "grad_norm": 0.59765625, + "learning_rate": 6.087205141758583e-06, + "loss": 1.4169, + "step": 7298 + }, + { + "epoch": 1.25920814284482, + "grad_norm": 0.609375, + "learning_rate": 6.0847004180518985e-06, + "loss": 1.4104, + "step": 7299 + }, + { + "epoch": 1.259380660743552, + "grad_norm": 0.57421875, + "learning_rate": 6.082195984415069e-06, + "loss": 1.3966, + "step": 7300 + }, + { + "epoch": 1.259380660743552, + "eval_loss": 1.4078376293182373, + "eval_runtime": 10.8355, + "eval_samples_per_second": 94.504, + "eval_steps_per_second": 23.626, + "step": 7300 + }, + { + "epoch": 1.259553178642284, + "grad_norm": 0.58203125, + "learning_rate": 6.079691841033643e-06, + "loss": 1.5522, + "step": 7301 + }, + { + "epoch": 1.2597256965410162, + "grad_norm": 0.703125, + "learning_rate": 6.077187988093138e-06, + "loss": 1.414, + "step": 7302 + }, + { + "epoch": 1.2598982144397481, + "grad_norm": 0.58984375, + "learning_rate": 6.074684425779063e-06, + "loss": 1.4165, + "step": 7303 + }, + { + "epoch": 1.26007073233848, + "grad_norm": 0.5625, + "learning_rate": 6.072181154276888e-06, + "loss": 1.3574, + "step": 7304 + }, + { + "epoch": 1.260243250237212, + "grad_norm": 0.578125, + "learning_rate": 6.069678173772079e-06, + "loss": 1.3784, + "step": 7305 + }, + { + "epoch": 1.260415768135944, + "grad_norm": 0.5546875, + "learning_rate": 6.067175484450063e-06, + "loss": 1.4636, + "step": 7306 + }, + { + "epoch": 1.2605882860346762, + "grad_norm": 0.5859375, + "learning_rate": 6.064673086496267e-06, + "loss": 1.3886, + "step": 7307 + }, + { + "epoch": 1.2607608039334082, + "grad_norm": 0.66796875, + "learning_rate": 6.062170980096073e-06, + "loss": 1.376, + "step": 7308 + }, + { + "epoch": 1.2609333218321401, + "grad_norm": 0.625, + "learning_rate": 6.05966916543486e-06, + "loss": 1.3851, + "step": 7309 + }, + { + "epoch": 1.261105839730872, + "grad_norm": 0.60546875, + "learning_rate": 6.057167642697973e-06, + "loss": 1.483, + "step": 7310 + }, + { + "epoch": 1.261278357629604, + "grad_norm": 0.61328125, + "learning_rate": 6.054666412070746e-06, + "loss": 1.4069, + "step": 7311 + }, + { + "epoch": 1.261450875528336, + "grad_norm": 0.63671875, + "learning_rate": 6.0521654737384804e-06, + "loss": 1.3993, + "step": 7312 + }, + { + "epoch": 1.261623393427068, + "grad_norm": 0.578125, + "learning_rate": 6.049664827886468e-06, + "loss": 1.4738, + "step": 7313 + }, + { + "epoch": 1.2617959113258, + "grad_norm": 0.73828125, + "learning_rate": 6.047164474699962e-06, + "loss": 1.5021, + "step": 7314 + }, + { + "epoch": 1.2619684292245321, + "grad_norm": 0.625, + "learning_rate": 6.044664414364214e-06, + "loss": 1.4437, + "step": 7315 + }, + { + "epoch": 1.262140947123264, + "grad_norm": 0.62890625, + "learning_rate": 6.0421646470644394e-06, + "loss": 1.4966, + "step": 7316 + }, + { + "epoch": 1.262313465021996, + "grad_norm": 0.6484375, + "learning_rate": 6.039665172985834e-06, + "loss": 1.4586, + "step": 7317 + }, + { + "epoch": 1.262485982920728, + "grad_norm": 0.59765625, + "learning_rate": 6.0371659923135825e-06, + "loss": 1.4142, + "step": 7318 + }, + { + "epoch": 1.26265850081946, + "grad_norm": 0.58203125, + "learning_rate": 6.0346671052328345e-06, + "loss": 1.5371, + "step": 7319 + }, + { + "epoch": 1.2628310187181921, + "grad_norm": 0.61328125, + "learning_rate": 6.0321685119287245e-06, + "loss": 1.4882, + "step": 7320 + }, + { + "epoch": 1.263003536616924, + "grad_norm": 0.65625, + "learning_rate": 6.0296702125863586e-06, + "loss": 1.4118, + "step": 7321 + }, + { + "epoch": 1.263176054515656, + "grad_norm": 0.6953125, + "learning_rate": 6.027172207390836e-06, + "loss": 1.4162, + "step": 7322 + }, + { + "epoch": 1.263348572414388, + "grad_norm": 0.6015625, + "learning_rate": 6.024674496527219e-06, + "loss": 1.3656, + "step": 7323 + }, + { + "epoch": 1.26352109031312, + "grad_norm": 0.6640625, + "learning_rate": 6.022177080180553e-06, + "loss": 1.4969, + "step": 7324 + }, + { + "epoch": 1.263693608211852, + "grad_norm": 0.58203125, + "learning_rate": 6.019679958535862e-06, + "loss": 1.378, + "step": 7325 + }, + { + "epoch": 1.263866126110584, + "grad_norm": 0.59765625, + "learning_rate": 6.017183131778154e-06, + "loss": 1.4315, + "step": 7326 + }, + { + "epoch": 1.2640386440093159, + "grad_norm": 0.64453125, + "learning_rate": 6.0146866000924035e-06, + "loss": 1.4713, + "step": 7327 + }, + { + "epoch": 1.2642111619080478, + "grad_norm": 0.6015625, + "learning_rate": 6.012190363663571e-06, + "loss": 1.3467, + "step": 7328 + }, + { + "epoch": 1.26438367980678, + "grad_norm": 0.5703125, + "learning_rate": 6.009694422676591e-06, + "loss": 1.4483, + "step": 7329 + }, + { + "epoch": 1.264556197705512, + "grad_norm": 0.7890625, + "learning_rate": 6.007198777316385e-06, + "loss": 1.4766, + "step": 7330 + }, + { + "epoch": 1.264728715604244, + "grad_norm": 0.5859375, + "learning_rate": 6.004703427767837e-06, + "loss": 1.3881, + "step": 7331 + }, + { + "epoch": 1.264901233502976, + "grad_norm": 0.6328125, + "learning_rate": 6.002208374215829e-06, + "loss": 1.4425, + "step": 7332 + }, + { + "epoch": 1.2650737514017079, + "grad_norm": 0.63671875, + "learning_rate": 5.999713616845197e-06, + "loss": 1.4362, + "step": 7333 + }, + { + "epoch": 1.26524626930044, + "grad_norm": 0.640625, + "learning_rate": 5.997219155840777e-06, + "loss": 1.5301, + "step": 7334 + }, + { + "epoch": 1.265418787199172, + "grad_norm": 0.625, + "learning_rate": 5.994724991387375e-06, + "loss": 1.4292, + "step": 7335 + }, + { + "epoch": 1.265591305097904, + "grad_norm": 0.63671875, + "learning_rate": 5.992231123669771e-06, + "loss": 1.4772, + "step": 7336 + }, + { + "epoch": 1.265763822996636, + "grad_norm": 0.57421875, + "learning_rate": 5.989737552872729e-06, + "loss": 1.4154, + "step": 7337 + }, + { + "epoch": 1.2659363408953679, + "grad_norm": 0.5703125, + "learning_rate": 5.9872442791809815e-06, + "loss": 1.4353, + "step": 7338 + }, + { + "epoch": 1.2661088587940998, + "grad_norm": 0.62109375, + "learning_rate": 5.984751302779255e-06, + "loss": 1.4214, + "step": 7339 + }, + { + "epoch": 1.2662813766928318, + "grad_norm": 0.60546875, + "learning_rate": 5.982258623852239e-06, + "loss": 1.4753, + "step": 7340 + }, + { + "epoch": 1.2664538945915638, + "grad_norm": 0.6015625, + "learning_rate": 5.979766242584608e-06, + "loss": 1.5378, + "step": 7341 + }, + { + "epoch": 1.266626412490296, + "grad_norm": 0.5390625, + "learning_rate": 5.977274159161012e-06, + "loss": 1.3289, + "step": 7342 + }, + { + "epoch": 1.266798930389028, + "grad_norm": 0.58203125, + "learning_rate": 5.974782373766084e-06, + "loss": 1.405, + "step": 7343 + }, + { + "epoch": 1.2669714482877599, + "grad_norm": 0.578125, + "learning_rate": 5.972290886584426e-06, + "loss": 1.4091, + "step": 7344 + }, + { + "epoch": 1.2671439661864918, + "grad_norm": 0.60546875, + "learning_rate": 5.969799697800629e-06, + "loss": 1.3101, + "step": 7345 + }, + { + "epoch": 1.2673164840852238, + "grad_norm": 0.5625, + "learning_rate": 5.967308807599248e-06, + "loss": 1.472, + "step": 7346 + }, + { + "epoch": 1.2674890019839558, + "grad_norm": 0.6015625, + "learning_rate": 5.964818216164832e-06, + "loss": 1.5133, + "step": 7347 + }, + { + "epoch": 1.267661519882688, + "grad_norm": 0.5859375, + "learning_rate": 5.962327923681892e-06, + "loss": 1.3993, + "step": 7348 + }, + { + "epoch": 1.26783403778142, + "grad_norm": 0.6640625, + "learning_rate": 5.959837930334926e-06, + "loss": 1.4841, + "step": 7349 + }, + { + "epoch": 1.2680065556801519, + "grad_norm": 0.56640625, + "learning_rate": 5.9573482363084155e-06, + "loss": 1.4165, + "step": 7350 + }, + { + "epoch": 1.2681790735788838, + "grad_norm": 0.57421875, + "learning_rate": 5.954858841786806e-06, + "loss": 1.4749, + "step": 7351 + }, + { + "epoch": 1.2683515914776158, + "grad_norm": 0.58984375, + "learning_rate": 5.952369746954529e-06, + "loss": 1.3882, + "step": 7352 + }, + { + "epoch": 1.2685241093763477, + "grad_norm": 0.58984375, + "learning_rate": 5.9498809519959875e-06, + "loss": 1.5357, + "step": 7353 + }, + { + "epoch": 1.2686966272750797, + "grad_norm": 0.54296875, + "learning_rate": 5.9473924570955776e-06, + "loss": 1.3692, + "step": 7354 + }, + { + "epoch": 1.2688691451738117, + "grad_norm": 0.578125, + "learning_rate": 5.944904262437653e-06, + "loss": 1.3353, + "step": 7355 + }, + { + "epoch": 1.2690416630725438, + "grad_norm": 0.58984375, + "learning_rate": 5.94241636820656e-06, + "loss": 1.4578, + "step": 7356 + }, + { + "epoch": 1.2692141809712758, + "grad_norm": 0.58203125, + "learning_rate": 5.939928774586612e-06, + "loss": 1.4813, + "step": 7357 + }, + { + "epoch": 1.2693866988700078, + "grad_norm": 0.58984375, + "learning_rate": 5.937441481762112e-06, + "loss": 1.464, + "step": 7358 + }, + { + "epoch": 1.2695592167687397, + "grad_norm": 0.59765625, + "learning_rate": 5.934954489917329e-06, + "loss": 1.4479, + "step": 7359 + }, + { + "epoch": 1.2697317346674717, + "grad_norm": 0.6328125, + "learning_rate": 5.93246779923652e-06, + "loss": 1.3553, + "step": 7360 + }, + { + "epoch": 1.2699042525662039, + "grad_norm": 0.65625, + "learning_rate": 5.929981409903907e-06, + "loss": 1.4249, + "step": 7361 + }, + { + "epoch": 1.2700767704649358, + "grad_norm": 0.60546875, + "learning_rate": 5.927495322103707e-06, + "loss": 1.4009, + "step": 7362 + }, + { + "epoch": 1.2702492883636678, + "grad_norm": 0.66796875, + "learning_rate": 5.925009536020094e-06, + "loss": 1.4347, + "step": 7363 + }, + { + "epoch": 1.2704218062623998, + "grad_norm": 0.578125, + "learning_rate": 5.922524051837241e-06, + "loss": 1.4666, + "step": 7364 + }, + { + "epoch": 1.2705943241611317, + "grad_norm": 0.6875, + "learning_rate": 5.9200388697392805e-06, + "loss": 1.4344, + "step": 7365 + }, + { + "epoch": 1.2707668420598637, + "grad_norm": 0.69140625, + "learning_rate": 5.9175539899103315e-06, + "loss": 1.351, + "step": 7366 + }, + { + "epoch": 1.2709393599585956, + "grad_norm": 0.55859375, + "learning_rate": 5.915069412534498e-06, + "loss": 1.347, + "step": 7367 + }, + { + "epoch": 1.2711118778573276, + "grad_norm": 0.671875, + "learning_rate": 5.912585137795841e-06, + "loss": 1.4984, + "step": 7368 + }, + { + "epoch": 1.2712843957560596, + "grad_norm": 0.59375, + "learning_rate": 5.910101165878419e-06, + "loss": 1.4683, + "step": 7369 + }, + { + "epoch": 1.2714569136547917, + "grad_norm": 0.56640625, + "learning_rate": 5.907617496966254e-06, + "loss": 1.3853, + "step": 7370 + }, + { + "epoch": 1.2716294315535237, + "grad_norm": 0.59765625, + "learning_rate": 5.90513413124336e-06, + "loss": 1.4295, + "step": 7371 + }, + { + "epoch": 1.2718019494522557, + "grad_norm": 0.609375, + "learning_rate": 5.902651068893712e-06, + "loss": 1.5122, + "step": 7372 + }, + { + "epoch": 1.2719744673509876, + "grad_norm": 0.6484375, + "learning_rate": 5.9001683101012775e-06, + "loss": 1.3092, + "step": 7373 + }, + { + "epoch": 1.2721469852497196, + "grad_norm": 0.63671875, + "learning_rate": 5.897685855049986e-06, + "loss": 1.4631, + "step": 7374 + }, + { + "epoch": 1.2723195031484518, + "grad_norm": 0.60546875, + "learning_rate": 5.895203703923764e-06, + "loss": 1.4884, + "step": 7375 + }, + { + "epoch": 1.2724920210471837, + "grad_norm": 0.60546875, + "learning_rate": 5.892721856906498e-06, + "loss": 1.3907, + "step": 7376 + }, + { + "epoch": 1.2726645389459157, + "grad_norm": 0.5859375, + "learning_rate": 5.890240314182061e-06, + "loss": 1.4388, + "step": 7377 + }, + { + "epoch": 1.2728370568446477, + "grad_norm": 0.6484375, + "learning_rate": 5.887759075934297e-06, + "loss": 1.4239, + "step": 7378 + }, + { + "epoch": 1.2730095747433796, + "grad_norm": 0.6171875, + "learning_rate": 5.8852781423470395e-06, + "loss": 1.407, + "step": 7379 + }, + { + "epoch": 1.2731820926421116, + "grad_norm": 0.62109375, + "learning_rate": 5.882797513604085e-06, + "loss": 1.4078, + "step": 7380 + }, + { + "epoch": 1.2733546105408435, + "grad_norm": 0.6640625, + "learning_rate": 5.880317189889213e-06, + "loss": 1.528, + "step": 7381 + }, + { + "epoch": 1.2735271284395755, + "grad_norm": 0.625, + "learning_rate": 5.87783717138619e-06, + "loss": 1.4042, + "step": 7382 + }, + { + "epoch": 1.2736996463383077, + "grad_norm": 0.5703125, + "learning_rate": 5.8753574582787435e-06, + "loss": 1.325, + "step": 7383 + }, + { + "epoch": 1.2738721642370396, + "grad_norm": 0.55859375, + "learning_rate": 5.87287805075059e-06, + "loss": 1.5028, + "step": 7384 + }, + { + "epoch": 1.2740446821357716, + "grad_norm": 0.6953125, + "learning_rate": 5.870398948985414e-06, + "loss": 1.4158, + "step": 7385 + }, + { + "epoch": 1.2742172000345036, + "grad_norm": 0.6796875, + "learning_rate": 5.867920153166892e-06, + "loss": 1.4555, + "step": 7386 + }, + { + "epoch": 1.2743897179332355, + "grad_norm": 0.57421875, + "learning_rate": 5.865441663478661e-06, + "loss": 1.4297, + "step": 7387 + }, + { + "epoch": 1.2745622358319677, + "grad_norm": 0.69140625, + "learning_rate": 5.862963480104347e-06, + "loss": 1.4885, + "step": 7388 + }, + { + "epoch": 1.2747347537306997, + "grad_norm": 0.76171875, + "learning_rate": 5.860485603227544e-06, + "loss": 1.4481, + "step": 7389 + }, + { + "epoch": 1.2749072716294316, + "grad_norm": 0.63671875, + "learning_rate": 5.858008033031836e-06, + "loss": 1.4486, + "step": 7390 + }, + { + "epoch": 1.2750797895281636, + "grad_norm": 0.9453125, + "learning_rate": 5.855530769700769e-06, + "loss": 1.4396, + "step": 7391 + }, + { + "epoch": 1.2752523074268955, + "grad_norm": 0.5625, + "learning_rate": 5.853053813417883e-06, + "loss": 1.5185, + "step": 7392 + }, + { + "epoch": 1.2754248253256275, + "grad_norm": 0.5703125, + "learning_rate": 5.85057716436668e-06, + "loss": 1.3693, + "step": 7393 + }, + { + "epoch": 1.2755973432243595, + "grad_norm": 0.609375, + "learning_rate": 5.848100822730649e-06, + "loss": 1.494, + "step": 7394 + }, + { + "epoch": 1.2757698611230914, + "grad_norm": 0.59765625, + "learning_rate": 5.8456247886932475e-06, + "loss": 1.4265, + "step": 7395 + }, + { + "epoch": 1.2759423790218234, + "grad_norm": 0.5625, + "learning_rate": 5.843149062437923e-06, + "loss": 1.4153, + "step": 7396 + }, + { + "epoch": 1.2761148969205556, + "grad_norm": 0.6328125, + "learning_rate": 5.840673644148087e-06, + "loss": 1.4869, + "step": 7397 + }, + { + "epoch": 1.2762874148192875, + "grad_norm": 0.62890625, + "learning_rate": 5.838198534007138e-06, + "loss": 1.4606, + "step": 7398 + }, + { + "epoch": 1.2764599327180195, + "grad_norm": 0.60546875, + "learning_rate": 5.835723732198444e-06, + "loss": 1.4313, + "step": 7399 + }, + { + "epoch": 1.2766324506167515, + "grad_norm": 0.6015625, + "learning_rate": 5.833249238905357e-06, + "loss": 1.446, + "step": 7400 + }, + { + "epoch": 1.2766324506167515, + "eval_loss": 1.4078235626220703, + "eval_runtime": 10.7712, + "eval_samples_per_second": 95.069, + "eval_steps_per_second": 23.767, + "step": 7400 + }, + { + "epoch": 1.2768049685154834, + "grad_norm": 0.74609375, + "learning_rate": 5.8307750543111996e-06, + "loss": 1.41, + "step": 7401 + }, + { + "epoch": 1.2769774864142156, + "grad_norm": 0.58984375, + "learning_rate": 5.828301178599277e-06, + "loss": 1.4281, + "step": 7402 + }, + { + "epoch": 1.2771500043129476, + "grad_norm": 0.66015625, + "learning_rate": 5.825827611952874e-06, + "loss": 1.5065, + "step": 7403 + }, + { + "epoch": 1.2773225222116795, + "grad_norm": 0.6171875, + "learning_rate": 5.823354354555234e-06, + "loss": 1.3859, + "step": 7404 + }, + { + "epoch": 1.2774950401104115, + "grad_norm": 0.875, + "learning_rate": 5.820881406589609e-06, + "loss": 1.4492, + "step": 7405 + }, + { + "epoch": 1.2776675580091434, + "grad_norm": 0.578125, + "learning_rate": 5.818408768239197e-06, + "loss": 1.4723, + "step": 7406 + }, + { + "epoch": 1.2778400759078754, + "grad_norm": 0.5859375, + "learning_rate": 5.815936439687192e-06, + "loss": 1.3366, + "step": 7407 + }, + { + "epoch": 1.2780125938066074, + "grad_norm": 0.609375, + "learning_rate": 5.813464421116759e-06, + "loss": 1.389, + "step": 7408 + }, + { + "epoch": 1.2781851117053393, + "grad_norm": 0.5859375, + "learning_rate": 5.810992712711039e-06, + "loss": 1.4805, + "step": 7409 + }, + { + "epoch": 1.2783576296040713, + "grad_norm": 0.6015625, + "learning_rate": 5.808521314653155e-06, + "loss": 1.4856, + "step": 7410 + }, + { + "epoch": 1.2785301475028035, + "grad_norm": 0.625, + "learning_rate": 5.806050227126203e-06, + "loss": 1.4922, + "step": 7411 + }, + { + "epoch": 1.2787026654015354, + "grad_norm": 0.6484375, + "learning_rate": 5.803579450313249e-06, + "loss": 1.4169, + "step": 7412 + }, + { + "epoch": 1.2788751833002674, + "grad_norm": 0.55859375, + "learning_rate": 5.801108984397355e-06, + "loss": 1.4809, + "step": 7413 + }, + { + "epoch": 1.2790477011989994, + "grad_norm": 0.61328125, + "learning_rate": 5.798638829561545e-06, + "loss": 1.4217, + "step": 7414 + }, + { + "epoch": 1.2792202190977313, + "grad_norm": 0.609375, + "learning_rate": 5.796168985988814e-06, + "loss": 1.3787, + "step": 7415 + }, + { + "epoch": 1.2793927369964635, + "grad_norm": 0.65625, + "learning_rate": 5.793699453862161e-06, + "loss": 1.5047, + "step": 7416 + }, + { + "epoch": 1.2795652548951955, + "grad_norm": 0.5703125, + "learning_rate": 5.791230233364529e-06, + "loss": 1.4522, + "step": 7417 + }, + { + "epoch": 1.2797377727939274, + "grad_norm": 0.80859375, + "learning_rate": 5.788761324678859e-06, + "loss": 1.3559, + "step": 7418 + }, + { + "epoch": 1.2799102906926594, + "grad_norm": 0.62890625, + "learning_rate": 5.7862927279880635e-06, + "loss": 1.4391, + "step": 7419 + }, + { + "epoch": 1.2800828085913913, + "grad_norm": 0.59375, + "learning_rate": 5.78382444347503e-06, + "loss": 1.4629, + "step": 7420 + }, + { + "epoch": 1.2802553264901233, + "grad_norm": 0.63671875, + "learning_rate": 5.781356471322628e-06, + "loss": 1.4712, + "step": 7421 + }, + { + "epoch": 1.2804278443888553, + "grad_norm": 0.62890625, + "learning_rate": 5.7788888117136964e-06, + "loss": 1.4271, + "step": 7422 + }, + { + "epoch": 1.2806003622875872, + "grad_norm": 0.6015625, + "learning_rate": 5.7764214648310564e-06, + "loss": 1.4672, + "step": 7423 + }, + { + "epoch": 1.2807728801863194, + "grad_norm": 0.63671875, + "learning_rate": 5.773954430857509e-06, + "loss": 1.3926, + "step": 7424 + }, + { + "epoch": 1.2809453980850514, + "grad_norm": 0.55859375, + "learning_rate": 5.771487709975814e-06, + "loss": 1.3844, + "step": 7425 + }, + { + "epoch": 1.2811179159837833, + "grad_norm": 0.578125, + "learning_rate": 5.769021302368739e-06, + "loss": 1.4494, + "step": 7426 + }, + { + "epoch": 1.2812904338825153, + "grad_norm": 0.56640625, + "learning_rate": 5.766555208218998e-06, + "loss": 1.4814, + "step": 7427 + }, + { + "epoch": 1.2814629517812473, + "grad_norm": 0.59765625, + "learning_rate": 5.7640894277093e-06, + "loss": 1.5364, + "step": 7428 + }, + { + "epoch": 1.2816354696799794, + "grad_norm": 0.64453125, + "learning_rate": 5.761623961022323e-06, + "loss": 1.4008, + "step": 7429 + }, + { + "epoch": 1.2818079875787114, + "grad_norm": 0.6171875, + "learning_rate": 5.759158808340726e-06, + "loss": 1.3162, + "step": 7430 + }, + { + "epoch": 1.2819805054774434, + "grad_norm": 0.62109375, + "learning_rate": 5.756693969847142e-06, + "loss": 1.3761, + "step": 7431 + }, + { + "epoch": 1.2821530233761753, + "grad_norm": 0.984375, + "learning_rate": 5.754229445724184e-06, + "loss": 1.4603, + "step": 7432 + }, + { + "epoch": 1.2823255412749073, + "grad_norm": 1.3046875, + "learning_rate": 5.751765236154436e-06, + "loss": 1.4425, + "step": 7433 + }, + { + "epoch": 1.2824980591736392, + "grad_norm": 0.609375, + "learning_rate": 5.749301341320464e-06, + "loss": 1.4418, + "step": 7434 + }, + { + "epoch": 1.2826705770723712, + "grad_norm": 0.58203125, + "learning_rate": 5.746837761404811e-06, + "loss": 1.4066, + "step": 7435 + }, + { + "epoch": 1.2828430949711032, + "grad_norm": 0.59375, + "learning_rate": 5.744374496589985e-06, + "loss": 1.3874, + "step": 7436 + }, + { + "epoch": 1.2830156128698351, + "grad_norm": 0.59375, + "learning_rate": 5.741911547058494e-06, + "loss": 1.4895, + "step": 7437 + }, + { + "epoch": 1.2831881307685673, + "grad_norm": 0.59375, + "learning_rate": 5.739448912992798e-06, + "loss": 1.4584, + "step": 7438 + }, + { + "epoch": 1.2833606486672993, + "grad_norm": 0.5859375, + "learning_rate": 5.736986594575347e-06, + "loss": 1.3845, + "step": 7439 + }, + { + "epoch": 1.2835331665660312, + "grad_norm": 0.61328125, + "learning_rate": 5.734524591988566e-06, + "loss": 1.3687, + "step": 7440 + }, + { + "epoch": 1.2837056844647632, + "grad_norm": 0.58984375, + "learning_rate": 5.732062905414855e-06, + "loss": 1.4917, + "step": 7441 + }, + { + "epoch": 1.2838782023634951, + "grad_norm": 0.546875, + "learning_rate": 5.7296015350365905e-06, + "loss": 1.4639, + "step": 7442 + }, + { + "epoch": 1.2840507202622273, + "grad_norm": 0.64453125, + "learning_rate": 5.727140481036133e-06, + "loss": 1.4227, + "step": 7443 + }, + { + "epoch": 1.2842232381609593, + "grad_norm": 0.5703125, + "learning_rate": 5.7246797435957965e-06, + "loss": 1.4035, + "step": 7444 + }, + { + "epoch": 1.2843957560596913, + "grad_norm": 0.58984375, + "learning_rate": 5.722219322897904e-06, + "loss": 1.3356, + "step": 7445 + }, + { + "epoch": 1.2845682739584232, + "grad_norm": 0.57421875, + "learning_rate": 5.719759219124735e-06, + "loss": 1.4335, + "step": 7446 + }, + { + "epoch": 1.2847407918571552, + "grad_norm": 0.69140625, + "learning_rate": 5.71729943245854e-06, + "loss": 1.4359, + "step": 7447 + }, + { + "epoch": 1.2849133097558871, + "grad_norm": 0.578125, + "learning_rate": 5.714839963081571e-06, + "loss": 1.383, + "step": 7448 + }, + { + "epoch": 1.285085827654619, + "grad_norm": 0.58984375, + "learning_rate": 5.712380811176024e-06, + "loss": 1.4616, + "step": 7449 + }, + { + "epoch": 1.285258345553351, + "grad_norm": 0.5703125, + "learning_rate": 5.709921976924106e-06, + "loss": 1.4082, + "step": 7450 + }, + { + "epoch": 1.285430863452083, + "grad_norm": 0.58203125, + "learning_rate": 5.707463460507967e-06, + "loss": 1.5528, + "step": 7451 + }, + { + "epoch": 1.2856033813508152, + "grad_norm": 0.6953125, + "learning_rate": 5.705005262109759e-06, + "loss": 1.4262, + "step": 7452 + }, + { + "epoch": 1.2857758992495472, + "grad_norm": 0.859375, + "learning_rate": 5.702547381911595e-06, + "loss": 1.4013, + "step": 7453 + }, + { + "epoch": 1.2859484171482791, + "grad_norm": 0.60546875, + "learning_rate": 5.700089820095573e-06, + "loss": 1.3427, + "step": 7454 + }, + { + "epoch": 1.286120935047011, + "grad_norm": 0.5546875, + "learning_rate": 5.697632576843766e-06, + "loss": 1.3756, + "step": 7455 + }, + { + "epoch": 1.286293452945743, + "grad_norm": 0.57421875, + "learning_rate": 5.695175652338223e-06, + "loss": 1.4234, + "step": 7456 + }, + { + "epoch": 1.2864659708444752, + "grad_norm": 0.6328125, + "learning_rate": 5.692719046760957e-06, + "loss": 1.4469, + "step": 7457 + }, + { + "epoch": 1.2866384887432072, + "grad_norm": 0.58203125, + "learning_rate": 5.690262760293986e-06, + "loss": 1.4501, + "step": 7458 + }, + { + "epoch": 1.2868110066419391, + "grad_norm": 0.5625, + "learning_rate": 5.687806793119273e-06, + "loss": 1.3342, + "step": 7459 + }, + { + "epoch": 1.286983524540671, + "grad_norm": 0.58984375, + "learning_rate": 5.685351145418778e-06, + "loss": 1.427, + "step": 7460 + }, + { + "epoch": 1.287156042439403, + "grad_norm": 0.609375, + "learning_rate": 5.682895817374429e-06, + "loss": 1.4885, + "step": 7461 + }, + { + "epoch": 1.287328560338135, + "grad_norm": 0.6015625, + "learning_rate": 5.680440809168131e-06, + "loss": 1.4662, + "step": 7462 + }, + { + "epoch": 1.287501078236867, + "grad_norm": 0.56640625, + "learning_rate": 5.677986120981769e-06, + "loss": 1.4228, + "step": 7463 + }, + { + "epoch": 1.287673596135599, + "grad_norm": 0.578125, + "learning_rate": 5.675531752997201e-06, + "loss": 1.4616, + "step": 7464 + }, + { + "epoch": 1.2878461140343311, + "grad_norm": 0.56640625, + "learning_rate": 5.67307770539626e-06, + "loss": 1.3393, + "step": 7465 + }, + { + "epoch": 1.288018631933063, + "grad_norm": 0.62109375, + "learning_rate": 5.670623978360759e-06, + "loss": 1.374, + "step": 7466 + }, + { + "epoch": 1.288191149831795, + "grad_norm": 0.546875, + "learning_rate": 5.668170572072489e-06, + "loss": 1.4372, + "step": 7467 + }, + { + "epoch": 1.288363667730527, + "grad_norm": 0.578125, + "learning_rate": 5.6657174867132e-06, + "loss": 1.4121, + "step": 7468 + }, + { + "epoch": 1.288536185629259, + "grad_norm": 0.61328125, + "learning_rate": 5.663264722464651e-06, + "loss": 1.4619, + "step": 7469 + }, + { + "epoch": 1.2887087035279912, + "grad_norm": 0.578125, + "learning_rate": 5.6608122795085444e-06, + "loss": 1.4514, + "step": 7470 + }, + { + "epoch": 1.2888812214267231, + "grad_norm": 0.5546875, + "learning_rate": 5.658360158026577e-06, + "loss": 1.3991, + "step": 7471 + }, + { + "epoch": 1.289053739325455, + "grad_norm": 0.66015625, + "learning_rate": 5.655908358200415e-06, + "loss": 1.4462, + "step": 7472 + }, + { + "epoch": 1.289226257224187, + "grad_norm": 0.58203125, + "learning_rate": 5.653456880211707e-06, + "loss": 1.4311, + "step": 7473 + }, + { + "epoch": 1.289398775122919, + "grad_norm": 0.59375, + "learning_rate": 5.651005724242072e-06, + "loss": 1.4442, + "step": 7474 + }, + { + "epoch": 1.289571293021651, + "grad_norm": 0.60546875, + "learning_rate": 5.648554890473108e-06, + "loss": 1.5117, + "step": 7475 + }, + { + "epoch": 1.289743810920383, + "grad_norm": 0.65625, + "learning_rate": 5.646104379086381e-06, + "loss": 1.4792, + "step": 7476 + }, + { + "epoch": 1.2899163288191149, + "grad_norm": 0.625, + "learning_rate": 5.64365419026345e-06, + "loss": 1.4261, + "step": 7477 + }, + { + "epoch": 1.2900888467178468, + "grad_norm": 0.58984375, + "learning_rate": 5.64120432418584e-06, + "loss": 1.3697, + "step": 7478 + }, + { + "epoch": 1.290261364616579, + "grad_norm": 0.54296875, + "learning_rate": 5.63875478103504e-06, + "loss": 1.379, + "step": 7479 + }, + { + "epoch": 1.290433882515311, + "grad_norm": 0.57421875, + "learning_rate": 5.636305560992545e-06, + "loss": 1.3918, + "step": 7480 + }, + { + "epoch": 1.290606400414043, + "grad_norm": 0.671875, + "learning_rate": 5.6338566642397915e-06, + "loss": 1.4982, + "step": 7481 + }, + { + "epoch": 1.290778918312775, + "grad_norm": 0.5859375, + "learning_rate": 5.631408090958225e-06, + "loss": 1.4698, + "step": 7482 + }, + { + "epoch": 1.2909514362115069, + "grad_norm": 0.60546875, + "learning_rate": 5.62895984132924e-06, + "loss": 1.4574, + "step": 7483 + }, + { + "epoch": 1.291123954110239, + "grad_norm": 0.63671875, + "learning_rate": 5.626511915534221e-06, + "loss": 1.3892, + "step": 7484 + }, + { + "epoch": 1.291296472008971, + "grad_norm": 0.6015625, + "learning_rate": 5.624064313754525e-06, + "loss": 1.4437, + "step": 7485 + }, + { + "epoch": 1.291468989907703, + "grad_norm": 0.60546875, + "learning_rate": 5.621617036171487e-06, + "loss": 1.3831, + "step": 7486 + }, + { + "epoch": 1.291641507806435, + "grad_norm": 0.58203125, + "learning_rate": 5.619170082966419e-06, + "loss": 1.3423, + "step": 7487 + }, + { + "epoch": 1.291814025705167, + "grad_norm": 0.54296875, + "learning_rate": 5.616723454320605e-06, + "loss": 1.4161, + "step": 7488 + }, + { + "epoch": 1.2919865436038989, + "grad_norm": 0.59765625, + "learning_rate": 5.6142771504152995e-06, + "loss": 1.4779, + "step": 7489 + }, + { + "epoch": 1.2921590615026308, + "grad_norm": 0.5859375, + "learning_rate": 5.611831171431752e-06, + "loss": 1.4647, + "step": 7490 + }, + { + "epoch": 1.2923315794013628, + "grad_norm": 0.58203125, + "learning_rate": 5.609385517551167e-06, + "loss": 1.3995, + "step": 7491 + }, + { + "epoch": 1.292504097300095, + "grad_norm": 0.62890625, + "learning_rate": 5.606940188954737e-06, + "loss": 1.5, + "step": 7492 + }, + { + "epoch": 1.292676615198827, + "grad_norm": 0.59375, + "learning_rate": 5.604495185823626e-06, + "loss": 1.4466, + "step": 7493 + }, + { + "epoch": 1.292849133097559, + "grad_norm": 0.609375, + "learning_rate": 5.602050508338976e-06, + "loss": 1.5505, + "step": 7494 + }, + { + "epoch": 1.2930216509962908, + "grad_norm": 0.609375, + "learning_rate": 5.599606156681905e-06, + "loss": 1.3898, + "step": 7495 + }, + { + "epoch": 1.2931941688950228, + "grad_norm": 0.5859375, + "learning_rate": 5.597162131033505e-06, + "loss": 1.4221, + "step": 7496 + }, + { + "epoch": 1.2933666867937548, + "grad_norm": 0.57421875, + "learning_rate": 5.594718431574844e-06, + "loss": 1.3453, + "step": 7497 + }, + { + "epoch": 1.293539204692487, + "grad_norm": 0.59765625, + "learning_rate": 5.592275058486967e-06, + "loss": 1.4244, + "step": 7498 + }, + { + "epoch": 1.293711722591219, + "grad_norm": 0.58984375, + "learning_rate": 5.589832011950897e-06, + "loss": 1.3701, + "step": 7499 + }, + { + "epoch": 1.2938842404899509, + "grad_norm": 0.671875, + "learning_rate": 5.5873892921476215e-06, + "loss": 1.4186, + "step": 7500 + }, + { + "epoch": 1.2938842404899509, + "eval_loss": 1.4076613187789917, + "eval_runtime": 10.8861, + "eval_samples_per_second": 94.065, + "eval_steps_per_second": 23.516, + "step": 7500 + }, + { + "epoch": 1.2940567583886828, + "grad_norm": 0.58984375, + "learning_rate": 5.584946899258125e-06, + "loss": 1.3333, + "step": 7501 + }, + { + "epoch": 1.2942292762874148, + "grad_norm": 0.578125, + "learning_rate": 5.582504833463347e-06, + "loss": 1.4381, + "step": 7502 + }, + { + "epoch": 1.2944017941861468, + "grad_norm": 0.578125, + "learning_rate": 5.58006309494421e-06, + "loss": 1.4592, + "step": 7503 + }, + { + "epoch": 1.2945743120848787, + "grad_norm": 0.57421875, + "learning_rate": 5.577621683881618e-06, + "loss": 1.4971, + "step": 7504 + }, + { + "epoch": 1.2947468299836107, + "grad_norm": 0.59765625, + "learning_rate": 5.5751806004564435e-06, + "loss": 1.396, + "step": 7505 + }, + { + "epoch": 1.2949193478823429, + "grad_norm": 0.5703125, + "learning_rate": 5.572739844849537e-06, + "loss": 1.4507, + "step": 7506 + }, + { + "epoch": 1.2950918657810748, + "grad_norm": 0.57421875, + "learning_rate": 5.57029941724173e-06, + "loss": 1.3708, + "step": 7507 + }, + { + "epoch": 1.2952643836798068, + "grad_norm": 0.6171875, + "learning_rate": 5.5678593178138125e-06, + "loss": 1.4396, + "step": 7508 + }, + { + "epoch": 1.2954369015785387, + "grad_norm": 0.56640625, + "learning_rate": 5.565419546746574e-06, + "loss": 1.4847, + "step": 7509 + }, + { + "epoch": 1.2956094194772707, + "grad_norm": 0.6796875, + "learning_rate": 5.562980104220763e-06, + "loss": 1.3488, + "step": 7510 + }, + { + "epoch": 1.295781937376003, + "grad_norm": 0.56640625, + "learning_rate": 5.560540990417111e-06, + "loss": 1.3707, + "step": 7511 + }, + { + "epoch": 1.2959544552747349, + "grad_norm": 0.58203125, + "learning_rate": 5.558102205516325e-06, + "loss": 1.4186, + "step": 7512 + }, + { + "epoch": 1.2961269731734668, + "grad_norm": 0.625, + "learning_rate": 5.555663749699074e-06, + "loss": 1.3705, + "step": 7513 + }, + { + "epoch": 1.2962994910721988, + "grad_norm": 0.59375, + "learning_rate": 5.55322562314603e-06, + "loss": 1.5207, + "step": 7514 + }, + { + "epoch": 1.2964720089709307, + "grad_norm": 0.66015625, + "learning_rate": 5.5507878260378115e-06, + "loss": 1.4564, + "step": 7515 + }, + { + "epoch": 1.2966445268696627, + "grad_norm": 0.578125, + "learning_rate": 5.5483503585550326e-06, + "loss": 1.4188, + "step": 7516 + }, + { + "epoch": 1.2968170447683947, + "grad_norm": 0.5859375, + "learning_rate": 5.545913220878272e-06, + "loss": 1.3734, + "step": 7517 + }, + { + "epoch": 1.2969895626671266, + "grad_norm": 0.640625, + "learning_rate": 5.5434764131880915e-06, + "loss": 1.3631, + "step": 7518 + }, + { + "epoch": 1.2971620805658586, + "grad_norm": 0.5859375, + "learning_rate": 5.541039935665025e-06, + "loss": 1.4003, + "step": 7519 + }, + { + "epoch": 1.2973345984645908, + "grad_norm": 0.56640625, + "learning_rate": 5.538603788489584e-06, + "loss": 1.454, + "step": 7520 + }, + { + "epoch": 1.2975071163633227, + "grad_norm": 0.640625, + "learning_rate": 5.5361679718422426e-06, + "loss": 1.3602, + "step": 7521 + }, + { + "epoch": 1.2976796342620547, + "grad_norm": 0.61328125, + "learning_rate": 5.533732485903477e-06, + "loss": 1.4205, + "step": 7522 + }, + { + "epoch": 1.2978521521607866, + "grad_norm": 0.640625, + "learning_rate": 5.531297330853711e-06, + "loss": 1.425, + "step": 7523 + }, + { + "epoch": 1.2980246700595186, + "grad_norm": 0.56640625, + "learning_rate": 5.528862506873361e-06, + "loss": 1.419, + "step": 7524 + }, + { + "epoch": 1.2981971879582508, + "grad_norm": 0.671875, + "learning_rate": 5.526428014142814e-06, + "loss": 1.5736, + "step": 7525 + }, + { + "epoch": 1.2983697058569827, + "grad_norm": 0.60546875, + "learning_rate": 5.523993852842431e-06, + "loss": 1.3834, + "step": 7526 + }, + { + "epoch": 1.2985422237557147, + "grad_norm": 0.58984375, + "learning_rate": 5.521560023152552e-06, + "loss": 1.4649, + "step": 7527 + }, + { + "epoch": 1.2987147416544467, + "grad_norm": 0.58203125, + "learning_rate": 5.519126525253486e-06, + "loss": 1.465, + "step": 7528 + }, + { + "epoch": 1.2988872595531786, + "grad_norm": 0.6328125, + "learning_rate": 5.516693359325528e-06, + "loss": 1.4627, + "step": 7529 + }, + { + "epoch": 1.2990597774519106, + "grad_norm": 0.6484375, + "learning_rate": 5.514260525548938e-06, + "loss": 1.3427, + "step": 7530 + }, + { + "epoch": 1.2992322953506426, + "grad_norm": 1.4375, + "learning_rate": 5.51182802410396e-06, + "loss": 1.3953, + "step": 7531 + }, + { + "epoch": 1.2994048132493745, + "grad_norm": 0.66015625, + "learning_rate": 5.509395855170798e-06, + "loss": 1.345, + "step": 7532 + }, + { + "epoch": 1.2995773311481067, + "grad_norm": 0.62890625, + "learning_rate": 5.506964018929657e-06, + "loss": 1.3181, + "step": 7533 + }, + { + "epoch": 1.2997498490468387, + "grad_norm": 0.5859375, + "learning_rate": 5.5045325155606925e-06, + "loss": 1.5654, + "step": 7534 + }, + { + "epoch": 1.2999223669455706, + "grad_norm": 0.5859375, + "learning_rate": 5.502101345244047e-06, + "loss": 1.441, + "step": 7535 + }, + { + "epoch": 1.3000948848443026, + "grad_norm": 0.5546875, + "learning_rate": 5.499670508159838e-06, + "loss": 1.4018, + "step": 7536 + }, + { + "epoch": 1.3002674027430345, + "grad_norm": 0.58203125, + "learning_rate": 5.497240004488158e-06, + "loss": 1.3829, + "step": 7537 + }, + { + "epoch": 1.3004399206417665, + "grad_norm": 1.0, + "learning_rate": 5.494809834409071e-06, + "loss": 1.4306, + "step": 7538 + }, + { + "epoch": 1.3006124385404987, + "grad_norm": 0.59765625, + "learning_rate": 5.492379998102627e-06, + "loss": 1.4568, + "step": 7539 + }, + { + "epoch": 1.3007849564392306, + "grad_norm": 0.578125, + "learning_rate": 5.48995049574883e-06, + "loss": 1.5109, + "step": 7540 + }, + { + "epoch": 1.3009574743379626, + "grad_norm": 0.5703125, + "learning_rate": 5.4875213275276875e-06, + "loss": 1.3706, + "step": 7541 + }, + { + "epoch": 1.3011299922366946, + "grad_norm": 0.6015625, + "learning_rate": 5.485092493619153e-06, + "loss": 1.3853, + "step": 7542 + }, + { + "epoch": 1.3013025101354265, + "grad_norm": 0.58984375, + "learning_rate": 5.482663994203179e-06, + "loss": 1.426, + "step": 7543 + }, + { + "epoch": 1.3014750280341585, + "grad_norm": 0.66796875, + "learning_rate": 5.480235829459688e-06, + "loss": 1.4036, + "step": 7544 + }, + { + "epoch": 1.3016475459328904, + "grad_norm": 0.59375, + "learning_rate": 5.477807999568558e-06, + "loss": 1.4661, + "step": 7545 + }, + { + "epoch": 1.3018200638316224, + "grad_norm": 0.56640625, + "learning_rate": 5.475380504709678e-06, + "loss": 1.3434, + "step": 7546 + }, + { + "epoch": 1.3019925817303546, + "grad_norm": 0.5859375, + "learning_rate": 5.472953345062875e-06, + "loss": 1.3539, + "step": 7547 + }, + { + "epoch": 1.3021650996290866, + "grad_norm": 0.63671875, + "learning_rate": 5.470526520807975e-06, + "loss": 1.4634, + "step": 7548 + }, + { + "epoch": 1.3023376175278185, + "grad_norm": 0.6015625, + "learning_rate": 5.4681000321247725e-06, + "loss": 1.3594, + "step": 7549 + }, + { + "epoch": 1.3025101354265505, + "grad_norm": 0.59375, + "learning_rate": 5.465673879193035e-06, + "loss": 1.5056, + "step": 7550 + }, + { + "epoch": 1.3026826533252824, + "grad_norm": 0.61328125, + "learning_rate": 5.46324806219251e-06, + "loss": 1.3833, + "step": 7551 + }, + { + "epoch": 1.3028551712240146, + "grad_norm": 0.5859375, + "learning_rate": 5.460822581302918e-06, + "loss": 1.3744, + "step": 7552 + }, + { + "epoch": 1.3030276891227466, + "grad_norm": 0.67578125, + "learning_rate": 5.458397436703944e-06, + "loss": 1.4649, + "step": 7553 + }, + { + "epoch": 1.3032002070214785, + "grad_norm": 0.58984375, + "learning_rate": 5.455972628575272e-06, + "loss": 1.4986, + "step": 7554 + }, + { + "epoch": 1.3033727249202105, + "grad_norm": 0.67578125, + "learning_rate": 5.453548157096538e-06, + "loss": 1.5157, + "step": 7555 + }, + { + "epoch": 1.3035452428189425, + "grad_norm": 0.58203125, + "learning_rate": 5.4511240224473625e-06, + "loss": 1.4927, + "step": 7556 + }, + { + "epoch": 1.3037177607176744, + "grad_norm": 0.6171875, + "learning_rate": 5.448700224807342e-06, + "loss": 1.4667, + "step": 7557 + }, + { + "epoch": 1.3038902786164064, + "grad_norm": 0.60546875, + "learning_rate": 5.446276764356048e-06, + "loss": 1.4254, + "step": 7558 + }, + { + "epoch": 1.3040627965151383, + "grad_norm": 1.484375, + "learning_rate": 5.443853641273024e-06, + "loss": 1.338, + "step": 7559 + }, + { + "epoch": 1.3042353144138703, + "grad_norm": 0.5703125, + "learning_rate": 5.441430855737789e-06, + "loss": 1.3658, + "step": 7560 + }, + { + "epoch": 1.3044078323126025, + "grad_norm": 0.61328125, + "learning_rate": 5.43900840792984e-06, + "loss": 1.4133, + "step": 7561 + }, + { + "epoch": 1.3045803502113344, + "grad_norm": 0.60546875, + "learning_rate": 5.436586298028647e-06, + "loss": 1.4209, + "step": 7562 + }, + { + "epoch": 1.3047528681100664, + "grad_norm": 0.58203125, + "learning_rate": 5.434164526213659e-06, + "loss": 1.4423, + "step": 7563 + }, + { + "epoch": 1.3049253860087984, + "grad_norm": 0.6171875, + "learning_rate": 5.431743092664283e-06, + "loss": 1.3585, + "step": 7564 + }, + { + "epoch": 1.3050979039075303, + "grad_norm": 0.58984375, + "learning_rate": 5.429321997559931e-06, + "loss": 1.4644, + "step": 7565 + }, + { + "epoch": 1.3052704218062625, + "grad_norm": 0.69140625, + "learning_rate": 5.42690124107996e-06, + "loss": 1.4258, + "step": 7566 + }, + { + "epoch": 1.3054429397049945, + "grad_norm": 0.609375, + "learning_rate": 5.4244808234037195e-06, + "loss": 1.3962, + "step": 7567 + }, + { + "epoch": 1.3056154576037264, + "grad_norm": 0.6328125, + "learning_rate": 5.422060744710527e-06, + "loss": 1.3899, + "step": 7568 + }, + { + "epoch": 1.3057879755024584, + "grad_norm": 0.62890625, + "learning_rate": 5.419641005179681e-06, + "loss": 1.474, + "step": 7569 + }, + { + "epoch": 1.3059604934011904, + "grad_norm": 0.81640625, + "learning_rate": 5.417221604990448e-06, + "loss": 1.5104, + "step": 7570 + }, + { + "epoch": 1.3061330112999223, + "grad_norm": 0.6015625, + "learning_rate": 5.414802544322072e-06, + "loss": 1.5238, + "step": 7571 + }, + { + "epoch": 1.3063055291986543, + "grad_norm": 0.60546875, + "learning_rate": 5.412383823353774e-06, + "loss": 1.3963, + "step": 7572 + }, + { + "epoch": 1.3064780470973862, + "grad_norm": 0.5703125, + "learning_rate": 5.409965442264751e-06, + "loss": 1.3876, + "step": 7573 + }, + { + "epoch": 1.3066505649961184, + "grad_norm": 0.578125, + "learning_rate": 5.407547401234161e-06, + "loss": 1.4604, + "step": 7574 + }, + { + "epoch": 1.3068230828948504, + "grad_norm": 0.6875, + "learning_rate": 5.405129700441157e-06, + "loss": 1.5196, + "step": 7575 + }, + { + "epoch": 1.3069956007935823, + "grad_norm": 0.6171875, + "learning_rate": 5.402712340064859e-06, + "loss": 1.3634, + "step": 7576 + }, + { + "epoch": 1.3071681186923143, + "grad_norm": 0.64453125, + "learning_rate": 5.400295320284348e-06, + "loss": 1.5047, + "step": 7577 + }, + { + "epoch": 1.3073406365910463, + "grad_norm": 0.58984375, + "learning_rate": 5.397878641278709e-06, + "loss": 1.3541, + "step": 7578 + }, + { + "epoch": 1.3075131544897785, + "grad_norm": 0.578125, + "learning_rate": 5.3954623032269705e-06, + "loss": 1.4108, + "step": 7579 + }, + { + "epoch": 1.3076856723885104, + "grad_norm": 0.671875, + "learning_rate": 5.3930463063081564e-06, + "loss": 1.4113, + "step": 7580 + }, + { + "epoch": 1.3078581902872424, + "grad_norm": 0.6484375, + "learning_rate": 5.390630650701257e-06, + "loss": 1.4792, + "step": 7581 + }, + { + "epoch": 1.3080307081859743, + "grad_norm": 0.6015625, + "learning_rate": 5.388215336585239e-06, + "loss": 1.4458, + "step": 7582 + }, + { + "epoch": 1.3082032260847063, + "grad_norm": 0.6328125, + "learning_rate": 5.385800364139044e-06, + "loss": 1.4197, + "step": 7583 + }, + { + "epoch": 1.3083757439834383, + "grad_norm": 0.5546875, + "learning_rate": 5.383385733541594e-06, + "loss": 1.3762, + "step": 7584 + }, + { + "epoch": 1.3085482618821702, + "grad_norm": 0.61328125, + "learning_rate": 5.380971444971766e-06, + "loss": 1.4279, + "step": 7585 + }, + { + "epoch": 1.3087207797809022, + "grad_norm": 0.58984375, + "learning_rate": 5.3785574986084435e-06, + "loss": 1.4792, + "step": 7586 + }, + { + "epoch": 1.3088932976796341, + "grad_norm": 0.609375, + "learning_rate": 5.376143894630454e-06, + "loss": 1.4466, + "step": 7587 + }, + { + "epoch": 1.3090658155783663, + "grad_norm": 0.546875, + "learning_rate": 5.373730633216614e-06, + "loss": 1.4329, + "step": 7588 + }, + { + "epoch": 1.3092383334770983, + "grad_norm": 0.578125, + "learning_rate": 5.3713177145457165e-06, + "loss": 1.4848, + "step": 7589 + }, + { + "epoch": 1.3094108513758302, + "grad_norm": 0.58203125, + "learning_rate": 5.368905138796523e-06, + "loss": 1.4444, + "step": 7590 + }, + { + "epoch": 1.3095833692745622, + "grad_norm": 0.5859375, + "learning_rate": 5.366492906147775e-06, + "loss": 1.466, + "step": 7591 + }, + { + "epoch": 1.3097558871732942, + "grad_norm": 0.5859375, + "learning_rate": 5.364081016778182e-06, + "loss": 1.406, + "step": 7592 + }, + { + "epoch": 1.3099284050720263, + "grad_norm": 0.5625, + "learning_rate": 5.361669470866435e-06, + "loss": 1.3943, + "step": 7593 + }, + { + "epoch": 1.3101009229707583, + "grad_norm": 0.69140625, + "learning_rate": 5.359258268591195e-06, + "loss": 1.4474, + "step": 7594 + }, + { + "epoch": 1.3102734408694903, + "grad_norm": 0.625, + "learning_rate": 5.356847410131103e-06, + "loss": 1.4458, + "step": 7595 + }, + { + "epoch": 1.3104459587682222, + "grad_norm": 0.62890625, + "learning_rate": 5.354436895664759e-06, + "loss": 1.4278, + "step": 7596 + }, + { + "epoch": 1.3106184766669542, + "grad_norm": 0.55078125, + "learning_rate": 5.352026725370763e-06, + "loss": 1.4711, + "step": 7597 + }, + { + "epoch": 1.3107909945656862, + "grad_norm": 0.5859375, + "learning_rate": 5.3496168994276635e-06, + "loss": 1.423, + "step": 7598 + }, + { + "epoch": 1.3109635124644181, + "grad_norm": 0.609375, + "learning_rate": 5.347207418014006e-06, + "loss": 1.4974, + "step": 7599 + }, + { + "epoch": 1.31113603036315, + "grad_norm": 0.75, + "learning_rate": 5.344798281308295e-06, + "loss": 1.3676, + "step": 7600 + }, + { + "epoch": 1.31113603036315, + "eval_loss": 1.407556176185608, + "eval_runtime": 10.9219, + "eval_samples_per_second": 93.756, + "eval_steps_per_second": 23.439, + "step": 7600 + }, + { + "epoch": 1.311308548261882, + "grad_norm": 0.59765625, + "learning_rate": 5.34238948948901e-06, + "loss": 1.477, + "step": 7601 + }, + { + "epoch": 1.3114810661606142, + "grad_norm": 0.578125, + "learning_rate": 5.339981042734617e-06, + "loss": 1.4956, + "step": 7602 + }, + { + "epoch": 1.3116535840593462, + "grad_norm": 0.66796875, + "learning_rate": 5.337572941223544e-06, + "loss": 1.4456, + "step": 7603 + }, + { + "epoch": 1.3118261019580781, + "grad_norm": 0.57421875, + "learning_rate": 5.3351651851342e-06, + "loss": 1.5027, + "step": 7604 + }, + { + "epoch": 1.31199861985681, + "grad_norm": 0.578125, + "learning_rate": 5.33275777464497e-06, + "loss": 1.4509, + "step": 7605 + }, + { + "epoch": 1.312171137755542, + "grad_norm": 0.59375, + "learning_rate": 5.3303507099342e-06, + "loss": 1.4749, + "step": 7606 + }, + { + "epoch": 1.3123436556542742, + "grad_norm": 0.59765625, + "learning_rate": 5.3279439911802286e-06, + "loss": 1.4949, + "step": 7607 + }, + { + "epoch": 1.3125161735530062, + "grad_norm": 0.83203125, + "learning_rate": 5.325537618561364e-06, + "loss": 1.5091, + "step": 7608 + }, + { + "epoch": 1.3126886914517382, + "grad_norm": 0.578125, + "learning_rate": 5.323131592255871e-06, + "loss": 1.5434, + "step": 7609 + }, + { + "epoch": 1.3128612093504701, + "grad_norm": 0.578125, + "learning_rate": 5.3207259124420205e-06, + "loss": 1.4441, + "step": 7610 + }, + { + "epoch": 1.313033727249202, + "grad_norm": 0.73828125, + "learning_rate": 5.318320579298028e-06, + "loss": 1.4234, + "step": 7611 + }, + { + "epoch": 1.313206245147934, + "grad_norm": 0.59765625, + "learning_rate": 5.3159155930021e-06, + "loss": 1.4529, + "step": 7612 + }, + { + "epoch": 1.313378763046666, + "grad_norm": 0.62890625, + "learning_rate": 5.313510953732411e-06, + "loss": 1.4523, + "step": 7613 + }, + { + "epoch": 1.313551280945398, + "grad_norm": 0.59765625, + "learning_rate": 5.311106661667115e-06, + "loss": 1.4488, + "step": 7614 + }, + { + "epoch": 1.3137237988441302, + "grad_norm": 0.5703125, + "learning_rate": 5.308702716984333e-06, + "loss": 1.3764, + "step": 7615 + }, + { + "epoch": 1.3138963167428621, + "grad_norm": 0.65234375, + "learning_rate": 5.306299119862171e-06, + "loss": 1.3542, + "step": 7616 + }, + { + "epoch": 1.314068834641594, + "grad_norm": 0.5546875, + "learning_rate": 5.30389587047869e-06, + "loss": 1.4034, + "step": 7617 + }, + { + "epoch": 1.314241352540326, + "grad_norm": 0.6328125, + "learning_rate": 5.301492969011954e-06, + "loss": 1.4876, + "step": 7618 + }, + { + "epoch": 1.314413870439058, + "grad_norm": 0.59375, + "learning_rate": 5.299090415639973e-06, + "loss": 1.5078, + "step": 7619 + }, + { + "epoch": 1.3145863883377902, + "grad_norm": 0.58984375, + "learning_rate": 5.296688210540746e-06, + "loss": 1.4435, + "step": 7620 + }, + { + "epoch": 1.3147589062365221, + "grad_norm": 0.9375, + "learning_rate": 5.294286353892243e-06, + "loss": 1.5348, + "step": 7621 + }, + { + "epoch": 1.314931424135254, + "grad_norm": 0.6328125, + "learning_rate": 5.29188484587241e-06, + "loss": 1.3537, + "step": 7622 + }, + { + "epoch": 1.315103942033986, + "grad_norm": 0.609375, + "learning_rate": 5.2894836866591655e-06, + "loss": 1.4277, + "step": 7623 + }, + { + "epoch": 1.315276459932718, + "grad_norm": 0.609375, + "learning_rate": 5.287082876430403e-06, + "loss": 1.3934, + "step": 7624 + }, + { + "epoch": 1.31544897783145, + "grad_norm": 0.62890625, + "learning_rate": 5.284682415363988e-06, + "loss": 1.4736, + "step": 7625 + }, + { + "epoch": 1.315621495730182, + "grad_norm": 0.66015625, + "learning_rate": 5.2822823036377625e-06, + "loss": 1.3568, + "step": 7626 + }, + { + "epoch": 1.315794013628914, + "grad_norm": 0.5390625, + "learning_rate": 5.279882541429544e-06, + "loss": 1.36, + "step": 7627 + }, + { + "epoch": 1.3159665315276459, + "grad_norm": 0.55859375, + "learning_rate": 5.2774831289171136e-06, + "loss": 1.4064, + "step": 7628 + }, + { + "epoch": 1.316139049426378, + "grad_norm": 0.59765625, + "learning_rate": 5.275084066278248e-06, + "loss": 1.4406, + "step": 7629 + }, + { + "epoch": 1.31631156732511, + "grad_norm": 0.6484375, + "learning_rate": 5.27268535369067e-06, + "loss": 1.4174, + "step": 7630 + }, + { + "epoch": 1.316484085223842, + "grad_norm": 0.62890625, + "learning_rate": 5.270286991332106e-06, + "loss": 1.2914, + "step": 7631 + }, + { + "epoch": 1.316656603122574, + "grad_norm": 0.6015625, + "learning_rate": 5.267888979380229e-06, + "loss": 1.3725, + "step": 7632 + }, + { + "epoch": 1.316829121021306, + "grad_norm": 0.55078125, + "learning_rate": 5.265491318012705e-06, + "loss": 1.392, + "step": 7633 + }, + { + "epoch": 1.317001638920038, + "grad_norm": 0.63671875, + "learning_rate": 5.263094007407168e-06, + "loss": 1.4334, + "step": 7634 + }, + { + "epoch": 1.31717415681877, + "grad_norm": 0.5859375, + "learning_rate": 5.2606970477412236e-06, + "loss": 1.4093, + "step": 7635 + }, + { + "epoch": 1.317346674717502, + "grad_norm": 0.58203125, + "learning_rate": 5.258300439192454e-06, + "loss": 1.4355, + "step": 7636 + }, + { + "epoch": 1.317519192616234, + "grad_norm": 0.60546875, + "learning_rate": 5.255904181938419e-06, + "loss": 1.4449, + "step": 7637 + }, + { + "epoch": 1.317691710514966, + "grad_norm": 0.56640625, + "learning_rate": 5.253508276156638e-06, + "loss": 1.447, + "step": 7638 + }, + { + "epoch": 1.3178642284136979, + "grad_norm": 0.91796875, + "learning_rate": 5.251112722024625e-06, + "loss": 1.4952, + "step": 7639 + }, + { + "epoch": 1.3180367463124298, + "grad_norm": 0.5859375, + "learning_rate": 5.248717519719857e-06, + "loss": 1.4281, + "step": 7640 + }, + { + "epoch": 1.3182092642111618, + "grad_norm": 0.62890625, + "learning_rate": 5.246322669419775e-06, + "loss": 1.3986, + "step": 7641 + }, + { + "epoch": 1.3183817821098938, + "grad_norm": 0.59375, + "learning_rate": 5.2439281713018196e-06, + "loss": 1.4189, + "step": 7642 + }, + { + "epoch": 1.318554300008626, + "grad_norm": 0.55078125, + "learning_rate": 5.24153402554338e-06, + "loss": 1.4394, + "step": 7643 + }, + { + "epoch": 1.318726817907358, + "grad_norm": 0.6484375, + "learning_rate": 5.239140232321831e-06, + "loss": 1.4017, + "step": 7644 + }, + { + "epoch": 1.3188993358060899, + "grad_norm": 0.5703125, + "learning_rate": 5.236746791814522e-06, + "loss": 1.4656, + "step": 7645 + }, + { + "epoch": 1.3190718537048218, + "grad_norm": 0.6640625, + "learning_rate": 5.2343537041987715e-06, + "loss": 1.4927, + "step": 7646 + }, + { + "epoch": 1.3192443716035538, + "grad_norm": 0.6328125, + "learning_rate": 5.231960969651876e-06, + "loss": 1.4623, + "step": 7647 + }, + { + "epoch": 1.319416889502286, + "grad_norm": 0.578125, + "learning_rate": 5.2295685883511086e-06, + "loss": 1.5245, + "step": 7648 + }, + { + "epoch": 1.319589407401018, + "grad_norm": 0.6015625, + "learning_rate": 5.227176560473698e-06, + "loss": 1.3987, + "step": 7649 + }, + { + "epoch": 1.31976192529975, + "grad_norm": 0.57421875, + "learning_rate": 5.224784886196878e-06, + "loss": 1.4556, + "step": 7650 + }, + { + "epoch": 1.3199344431984819, + "grad_norm": 0.59765625, + "learning_rate": 5.222393565697828e-06, + "loss": 1.4211, + "step": 7651 + }, + { + "epoch": 1.3201069610972138, + "grad_norm": 0.578125, + "learning_rate": 5.2200025991537126e-06, + "loss": 1.4429, + "step": 7652 + }, + { + "epoch": 1.3202794789959458, + "grad_norm": 0.65234375, + "learning_rate": 5.217611986741673e-06, + "loss": 1.4242, + "step": 7653 + }, + { + "epoch": 1.3204519968946777, + "grad_norm": 0.62890625, + "learning_rate": 5.215221728638815e-06, + "loss": 1.4632, + "step": 7654 + }, + { + "epoch": 1.3206245147934097, + "grad_norm": 0.5859375, + "learning_rate": 5.2128318250222355e-06, + "loss": 1.4538, + "step": 7655 + }, + { + "epoch": 1.3207970326921419, + "grad_norm": 0.57421875, + "learning_rate": 5.210442276068981e-06, + "loss": 1.4823, + "step": 7656 + }, + { + "epoch": 1.3209695505908738, + "grad_norm": 0.68359375, + "learning_rate": 5.208053081956091e-06, + "loss": 1.3459, + "step": 7657 + }, + { + "epoch": 1.3211420684896058, + "grad_norm": 0.55859375, + "learning_rate": 5.205664242860568e-06, + "loss": 1.402, + "step": 7658 + }, + { + "epoch": 1.3213145863883378, + "grad_norm": 0.59375, + "learning_rate": 5.203275758959396e-06, + "loss": 1.376, + "step": 7659 + }, + { + "epoch": 1.3214871042870697, + "grad_norm": 0.57421875, + "learning_rate": 5.200887630429528e-06, + "loss": 1.3909, + "step": 7660 + }, + { + "epoch": 1.321659622185802, + "grad_norm": 0.56640625, + "learning_rate": 5.198499857447894e-06, + "loss": 1.4473, + "step": 7661 + }, + { + "epoch": 1.3218321400845339, + "grad_norm": 0.671875, + "learning_rate": 5.196112440191383e-06, + "loss": 1.3559, + "step": 7662 + }, + { + "epoch": 1.3220046579832658, + "grad_norm": 0.5703125, + "learning_rate": 5.193725378836886e-06, + "loss": 1.4613, + "step": 7663 + }, + { + "epoch": 1.3221771758819978, + "grad_norm": 0.56640625, + "learning_rate": 5.19133867356124e-06, + "loss": 1.3656, + "step": 7664 + }, + { + "epoch": 1.3223496937807298, + "grad_norm": 0.58984375, + "learning_rate": 5.188952324541272e-06, + "loss": 1.4035, + "step": 7665 + }, + { + "epoch": 1.3225222116794617, + "grad_norm": 0.5859375, + "learning_rate": 5.1865663319537764e-06, + "loss": 1.4547, + "step": 7666 + }, + { + "epoch": 1.3226947295781937, + "grad_norm": 0.6484375, + "learning_rate": 5.184180695975522e-06, + "loss": 1.4893, + "step": 7667 + }, + { + "epoch": 1.3228672474769256, + "grad_norm": 0.5625, + "learning_rate": 5.181795416783253e-06, + "loss": 1.4425, + "step": 7668 + }, + { + "epoch": 1.3230397653756576, + "grad_norm": 0.59765625, + "learning_rate": 5.1794104945536886e-06, + "loss": 1.4272, + "step": 7669 + }, + { + "epoch": 1.3232122832743898, + "grad_norm": 2.5, + "learning_rate": 5.1770259294635075e-06, + "loss": 1.4514, + "step": 7670 + }, + { + "epoch": 1.3233848011731217, + "grad_norm": 0.7890625, + "learning_rate": 5.1746417216893845e-06, + "loss": 1.4506, + "step": 7671 + }, + { + "epoch": 1.3235573190718537, + "grad_norm": 0.5859375, + "learning_rate": 5.172257871407957e-06, + "loss": 1.4556, + "step": 7672 + }, + { + "epoch": 1.3237298369705857, + "grad_norm": 0.58203125, + "learning_rate": 5.169874378795824e-06, + "loss": 1.3554, + "step": 7673 + }, + { + "epoch": 1.3239023548693176, + "grad_norm": 0.640625, + "learning_rate": 5.167491244029584e-06, + "loss": 1.4715, + "step": 7674 + }, + { + "epoch": 1.3240748727680498, + "grad_norm": 3.78125, + "learning_rate": 5.165108467285784e-06, + "loss": 1.4466, + "step": 7675 + }, + { + "epoch": 1.3242473906667818, + "grad_norm": 0.70703125, + "learning_rate": 5.16272604874096e-06, + "loss": 1.4296, + "step": 7676 + }, + { + "epoch": 1.3244199085655137, + "grad_norm": 0.59765625, + "learning_rate": 5.160343988571614e-06, + "loss": 1.4277, + "step": 7677 + }, + { + "epoch": 1.3245924264642457, + "grad_norm": 0.625, + "learning_rate": 5.157962286954224e-06, + "loss": 1.5272, + "step": 7678 + }, + { + "epoch": 1.3247649443629776, + "grad_norm": 0.6171875, + "learning_rate": 5.155580944065244e-06, + "loss": 1.5264, + "step": 7679 + }, + { + "epoch": 1.3249374622617096, + "grad_norm": 0.62109375, + "learning_rate": 5.153199960081099e-06, + "loss": 1.3906, + "step": 7680 + }, + { + "epoch": 1.3251099801604416, + "grad_norm": 0.65234375, + "learning_rate": 5.150819335178179e-06, + "loss": 1.4098, + "step": 7681 + }, + { + "epoch": 1.3252824980591735, + "grad_norm": 0.73828125, + "learning_rate": 5.148439069532868e-06, + "loss": 1.3736, + "step": 7682 + }, + { + "epoch": 1.3254550159579057, + "grad_norm": 0.66796875, + "learning_rate": 5.1460591633215015e-06, + "loss": 1.36, + "step": 7683 + }, + { + "epoch": 1.3256275338566377, + "grad_norm": 0.60546875, + "learning_rate": 5.143679616720401e-06, + "loss": 1.4018, + "step": 7684 + }, + { + "epoch": 1.3258000517553696, + "grad_norm": 0.58203125, + "learning_rate": 5.141300429905858e-06, + "loss": 1.4803, + "step": 7685 + }, + { + "epoch": 1.3259725696541016, + "grad_norm": 0.55859375, + "learning_rate": 5.1389216030541345e-06, + "loss": 1.3943, + "step": 7686 + }, + { + "epoch": 1.3261450875528336, + "grad_norm": 0.546875, + "learning_rate": 5.1365431363414784e-06, + "loss": 1.3702, + "step": 7687 + }, + { + "epoch": 1.3263176054515655, + "grad_norm": 0.58984375, + "learning_rate": 5.134165029944094e-06, + "loss": 1.5292, + "step": 7688 + }, + { + "epoch": 1.3264901233502977, + "grad_norm": 0.60546875, + "learning_rate": 5.1317872840381645e-06, + "loss": 1.4318, + "step": 7689 + }, + { + "epoch": 1.3266626412490297, + "grad_norm": 0.62109375, + "learning_rate": 5.129409898799852e-06, + "loss": 1.4077, + "step": 7690 + }, + { + "epoch": 1.3268351591477616, + "grad_norm": 0.57421875, + "learning_rate": 5.1270328744052864e-06, + "loss": 1.4297, + "step": 7691 + }, + { + "epoch": 1.3270076770464936, + "grad_norm": 0.640625, + "learning_rate": 5.124656211030574e-06, + "loss": 1.4418, + "step": 7692 + }, + { + "epoch": 1.3271801949452255, + "grad_norm": 0.6015625, + "learning_rate": 5.122279908851796e-06, + "loss": 1.4186, + "step": 7693 + }, + { + "epoch": 1.3273527128439575, + "grad_norm": 0.65625, + "learning_rate": 5.119903968044992e-06, + "loss": 1.4164, + "step": 7694 + }, + { + "epoch": 1.3275252307426895, + "grad_norm": 0.59765625, + "learning_rate": 5.117528388786201e-06, + "loss": 1.4222, + "step": 7695 + }, + { + "epoch": 1.3276977486414214, + "grad_norm": 0.58984375, + "learning_rate": 5.1151531712514115e-06, + "loss": 1.3717, + "step": 7696 + }, + { + "epoch": 1.3278702665401536, + "grad_norm": 0.58203125, + "learning_rate": 5.112778315616596e-06, + "loss": 1.4239, + "step": 7697 + }, + { + "epoch": 1.3280427844388856, + "grad_norm": 0.65234375, + "learning_rate": 5.1104038220577e-06, + "loss": 1.4251, + "step": 7698 + }, + { + "epoch": 1.3282153023376175, + "grad_norm": 0.6015625, + "learning_rate": 5.108029690750641e-06, + "loss": 1.3776, + "step": 7699 + }, + { + "epoch": 1.3283878202363495, + "grad_norm": 0.60546875, + "learning_rate": 5.105655921871309e-06, + "loss": 1.4722, + "step": 7700 + }, + { + "epoch": 1.3283878202363495, + "eval_loss": 1.4075722694396973, + "eval_runtime": 11.0299, + "eval_samples_per_second": 92.839, + "eval_steps_per_second": 23.21, + "step": 7700 + }, + { + "epoch": 1.3285603381350815, + "grad_norm": 0.5625, + "learning_rate": 5.10328251559557e-06, + "loss": 1.4283, + "step": 7701 + }, + { + "epoch": 1.3287328560338136, + "grad_norm": 0.62109375, + "learning_rate": 5.100909472099251e-06, + "loss": 1.4533, + "step": 7702 + }, + { + "epoch": 1.3289053739325456, + "grad_norm": 0.58984375, + "learning_rate": 5.098536791558175e-06, + "loss": 1.2669, + "step": 7703 + }, + { + "epoch": 1.3290778918312776, + "grad_norm": 0.55859375, + "learning_rate": 5.096164474148122e-06, + "loss": 1.3087, + "step": 7704 + }, + { + "epoch": 1.3292504097300095, + "grad_norm": 0.66796875, + "learning_rate": 5.093792520044837e-06, + "loss": 1.3846, + "step": 7705 + }, + { + "epoch": 1.3294229276287415, + "grad_norm": 0.5625, + "learning_rate": 5.091420929424065e-06, + "loss": 1.4604, + "step": 7706 + }, + { + "epoch": 1.3295954455274734, + "grad_norm": 0.58203125, + "learning_rate": 5.089049702461497e-06, + "loss": 1.4529, + "step": 7707 + }, + { + "epoch": 1.3297679634262054, + "grad_norm": 0.578125, + "learning_rate": 5.086678839332813e-06, + "loss": 1.4119, + "step": 7708 + }, + { + "epoch": 1.3299404813249374, + "grad_norm": 0.63671875, + "learning_rate": 5.084308340213661e-06, + "loss": 1.4156, + "step": 7709 + }, + { + "epoch": 1.3301129992236693, + "grad_norm": 0.6015625, + "learning_rate": 5.08193820527966e-06, + "loss": 1.5028, + "step": 7710 + }, + { + "epoch": 1.3302855171224015, + "grad_norm": 0.63671875, + "learning_rate": 5.079568434706408e-06, + "loss": 1.4327, + "step": 7711 + }, + { + "epoch": 1.3304580350211335, + "grad_norm": 0.640625, + "learning_rate": 5.0771990286694725e-06, + "loss": 1.5038, + "step": 7712 + }, + { + "epoch": 1.3306305529198654, + "grad_norm": 0.5703125, + "learning_rate": 5.0748299873443855e-06, + "loss": 1.4443, + "step": 7713 + }, + { + "epoch": 1.3308030708185974, + "grad_norm": 0.59765625, + "learning_rate": 5.072461310906675e-06, + "loss": 1.4156, + "step": 7714 + }, + { + "epoch": 1.3309755887173293, + "grad_norm": 0.61328125, + "learning_rate": 5.07009299953181e-06, + "loss": 1.4279, + "step": 7715 + }, + { + "epoch": 1.3311481066160615, + "grad_norm": 0.64453125, + "learning_rate": 5.0677250533952695e-06, + "loss": 1.5014, + "step": 7716 + }, + { + "epoch": 1.3313206245147935, + "grad_norm": 0.58984375, + "learning_rate": 5.065357472672469e-06, + "loss": 1.4707, + "step": 7717 + }, + { + "epoch": 1.3314931424135255, + "grad_norm": 0.578125, + "learning_rate": 5.0629902575388165e-06, + "loss": 1.4062, + "step": 7718 + }, + { + "epoch": 1.3316656603122574, + "grad_norm": 0.5625, + "learning_rate": 5.060623408169703e-06, + "loss": 1.4613, + "step": 7719 + }, + { + "epoch": 1.3318381782109894, + "grad_norm": 0.60546875, + "learning_rate": 5.058256924740463e-06, + "loss": 1.4527, + "step": 7720 + }, + { + "epoch": 1.3320106961097213, + "grad_norm": 0.70703125, + "learning_rate": 5.0558908074264315e-06, + "loss": 1.5059, + "step": 7721 + }, + { + "epoch": 1.3321832140084533, + "grad_norm": 0.74609375, + "learning_rate": 5.053525056402898e-06, + "loss": 1.4908, + "step": 7722 + }, + { + "epoch": 1.3323557319071853, + "grad_norm": 0.69140625, + "learning_rate": 5.051159671845136e-06, + "loss": 1.529, + "step": 7723 + }, + { + "epoch": 1.3325282498059174, + "grad_norm": 0.61328125, + "learning_rate": 5.048794653928389e-06, + "loss": 1.4887, + "step": 7724 + }, + { + "epoch": 1.3327007677046494, + "grad_norm": 0.62109375, + "learning_rate": 5.046430002827874e-06, + "loss": 1.3807, + "step": 7725 + }, + { + "epoch": 1.3328732856033814, + "grad_norm": 0.69140625, + "learning_rate": 5.044065718718766e-06, + "loss": 1.3835, + "step": 7726 + }, + { + "epoch": 1.3330458035021133, + "grad_norm": 0.5546875, + "learning_rate": 5.041701801776244e-06, + "loss": 1.3587, + "step": 7727 + }, + { + "epoch": 1.3332183214008453, + "grad_norm": 0.578125, + "learning_rate": 5.039338252175431e-06, + "loss": 1.4684, + "step": 7728 + }, + { + "epoch": 1.3333908392995775, + "grad_norm": 0.71484375, + "learning_rate": 5.0369750700914345e-06, + "loss": 1.3529, + "step": 7729 + }, + { + "epoch": 1.3335633571983094, + "grad_norm": 0.578125, + "learning_rate": 5.034612255699336e-06, + "loss": 1.448, + "step": 7730 + }, + { + "epoch": 1.3337358750970414, + "grad_norm": 0.68359375, + "learning_rate": 5.032249809174187e-06, + "loss": 1.361, + "step": 7731 + }, + { + "epoch": 1.3339083929957734, + "grad_norm": 0.6328125, + "learning_rate": 5.029887730691011e-06, + "loss": 1.3377, + "step": 7732 + }, + { + "epoch": 1.3340809108945053, + "grad_norm": 0.578125, + "learning_rate": 5.027526020424811e-06, + "loss": 1.4446, + "step": 7733 + }, + { + "epoch": 1.3342534287932373, + "grad_norm": 0.6640625, + "learning_rate": 5.025164678550545e-06, + "loss": 1.3311, + "step": 7734 + }, + { + "epoch": 1.3344259466919692, + "grad_norm": 0.6328125, + "learning_rate": 5.0228037052431685e-06, + "loss": 1.4285, + "step": 7735 + }, + { + "epoch": 1.3345984645907012, + "grad_norm": 0.609375, + "learning_rate": 5.020443100677595e-06, + "loss": 1.3957, + "step": 7736 + }, + { + "epoch": 1.3347709824894332, + "grad_norm": 0.59375, + "learning_rate": 5.018082865028704e-06, + "loss": 1.4813, + "step": 7737 + }, + { + "epoch": 1.3349435003881653, + "grad_norm": 0.59375, + "learning_rate": 5.01572299847137e-06, + "loss": 1.3646, + "step": 7738 + }, + { + "epoch": 1.3351160182868973, + "grad_norm": 0.625, + "learning_rate": 5.013363501180415e-06, + "loss": 1.5413, + "step": 7739 + }, + { + "epoch": 1.3352885361856293, + "grad_norm": 0.65625, + "learning_rate": 5.01100437333065e-06, + "loss": 1.4785, + "step": 7740 + }, + { + "epoch": 1.3354610540843612, + "grad_norm": 0.578125, + "learning_rate": 5.008645615096855e-06, + "loss": 1.4102, + "step": 7741 + }, + { + "epoch": 1.3356335719830932, + "grad_norm": 0.56640625, + "learning_rate": 5.006287226653779e-06, + "loss": 1.4611, + "step": 7742 + }, + { + "epoch": 1.3358060898818254, + "grad_norm": 0.68359375, + "learning_rate": 5.003929208176148e-06, + "loss": 1.4491, + "step": 7743 + }, + { + "epoch": 1.3359786077805573, + "grad_norm": 0.6484375, + "learning_rate": 5.00157155983866e-06, + "loss": 1.5095, + "step": 7744 + }, + { + "epoch": 1.3361511256792893, + "grad_norm": 0.5859375, + "learning_rate": 4.999214281815977e-06, + "loss": 1.496, + "step": 7745 + }, + { + "epoch": 1.3363236435780212, + "grad_norm": 0.6015625, + "learning_rate": 4.996857374282754e-06, + "loss": 1.4236, + "step": 7746 + }, + { + "epoch": 1.3364961614767532, + "grad_norm": 0.5703125, + "learning_rate": 4.99450083741359e-06, + "loss": 1.2766, + "step": 7747 + }, + { + "epoch": 1.3366686793754852, + "grad_norm": 0.59375, + "learning_rate": 4.992144671383087e-06, + "loss": 1.3265, + "step": 7748 + }, + { + "epoch": 1.3368411972742171, + "grad_norm": 0.61328125, + "learning_rate": 4.989788876365793e-06, + "loss": 1.4192, + "step": 7749 + }, + { + "epoch": 1.337013715172949, + "grad_norm": 0.62109375, + "learning_rate": 4.987433452536244e-06, + "loss": 1.476, + "step": 7750 + }, + { + "epoch": 1.337186233071681, + "grad_norm": 0.58203125, + "learning_rate": 4.985078400068947e-06, + "loss": 1.4746, + "step": 7751 + }, + { + "epoch": 1.3373587509704132, + "grad_norm": 0.5703125, + "learning_rate": 4.982723719138375e-06, + "loss": 1.4289, + "step": 7752 + }, + { + "epoch": 1.3375312688691452, + "grad_norm": 1.375, + "learning_rate": 4.980369409918979e-06, + "loss": 1.3759, + "step": 7753 + }, + { + "epoch": 1.3377037867678772, + "grad_norm": 0.609375, + "learning_rate": 4.978015472585183e-06, + "loss": 1.4064, + "step": 7754 + }, + { + "epoch": 1.3378763046666091, + "grad_norm": 0.5859375, + "learning_rate": 4.975661907311377e-06, + "loss": 1.4212, + "step": 7755 + }, + { + "epoch": 1.338048822565341, + "grad_norm": 0.55859375, + "learning_rate": 4.973308714271933e-06, + "loss": 1.4666, + "step": 7756 + }, + { + "epoch": 1.3382213404640733, + "grad_norm": 0.59375, + "learning_rate": 4.97095589364119e-06, + "loss": 1.4525, + "step": 7757 + }, + { + "epoch": 1.3383938583628052, + "grad_norm": 0.7734375, + "learning_rate": 4.96860344559345e-06, + "loss": 1.4663, + "step": 7758 + }, + { + "epoch": 1.3385663762615372, + "grad_norm": 0.5546875, + "learning_rate": 4.966251370303011e-06, + "loss": 1.3687, + "step": 7759 + }, + { + "epoch": 1.3387388941602691, + "grad_norm": 0.5625, + "learning_rate": 4.963899667944121e-06, + "loss": 1.415, + "step": 7760 + }, + { + "epoch": 1.338911412059001, + "grad_norm": 0.6171875, + "learning_rate": 4.961548338691009e-06, + "loss": 1.45, + "step": 7761 + }, + { + "epoch": 1.339083929957733, + "grad_norm": 0.55078125, + "learning_rate": 4.959197382717878e-06, + "loss": 1.4356, + "step": 7762 + }, + { + "epoch": 1.339256447856465, + "grad_norm": 0.62109375, + "learning_rate": 4.956846800198902e-06, + "loss": 1.4391, + "step": 7763 + }, + { + "epoch": 1.339428965755197, + "grad_norm": 0.5703125, + "learning_rate": 4.954496591308227e-06, + "loss": 1.4465, + "step": 7764 + }, + { + "epoch": 1.3396014836539292, + "grad_norm": 0.6171875, + "learning_rate": 4.952146756219972e-06, + "loss": 1.3614, + "step": 7765 + }, + { + "epoch": 1.3397740015526611, + "grad_norm": 0.63671875, + "learning_rate": 4.949797295108218e-06, + "loss": 1.3784, + "step": 7766 + }, + { + "epoch": 1.339946519451393, + "grad_norm": 0.5625, + "learning_rate": 4.947448208147041e-06, + "loss": 1.424, + "step": 7767 + }, + { + "epoch": 1.340119037350125, + "grad_norm": 0.5859375, + "learning_rate": 4.9450994955104736e-06, + "loss": 1.4743, + "step": 7768 + }, + { + "epoch": 1.340291555248857, + "grad_norm": 0.7109375, + "learning_rate": 4.9427511573725125e-06, + "loss": 1.3808, + "step": 7769 + }, + { + "epoch": 1.3404640731475892, + "grad_norm": 0.578125, + "learning_rate": 4.940403193907153e-06, + "loss": 1.3319, + "step": 7770 + }, + { + "epoch": 1.3406365910463212, + "grad_norm": 0.55859375, + "learning_rate": 4.938055605288334e-06, + "loss": 1.5072, + "step": 7771 + }, + { + "epoch": 1.3408091089450531, + "grad_norm": 1.0234375, + "learning_rate": 4.935708391689985e-06, + "loss": 1.4005, + "step": 7772 + }, + { + "epoch": 1.340981626843785, + "grad_norm": 0.58203125, + "learning_rate": 4.9333615532860005e-06, + "loss": 1.4497, + "step": 7773 + }, + { + "epoch": 1.341154144742517, + "grad_norm": 0.6015625, + "learning_rate": 4.931015090250251e-06, + "loss": 1.4624, + "step": 7774 + }, + { + "epoch": 1.341326662641249, + "grad_norm": 1.140625, + "learning_rate": 4.928669002756576e-06, + "loss": 1.4006, + "step": 7775 + }, + { + "epoch": 1.341499180539981, + "grad_norm": 0.66015625, + "learning_rate": 4.926323290978787e-06, + "loss": 1.5053, + "step": 7776 + }, + { + "epoch": 1.341671698438713, + "grad_norm": 0.61328125, + "learning_rate": 4.923977955090672e-06, + "loss": 1.515, + "step": 7777 + }, + { + "epoch": 1.3418442163374449, + "grad_norm": 0.578125, + "learning_rate": 4.9216329952659895e-06, + "loss": 1.385, + "step": 7778 + }, + { + "epoch": 1.342016734236177, + "grad_norm": 0.58203125, + "learning_rate": 4.919288411678459e-06, + "loss": 1.4394, + "step": 7779 + }, + { + "epoch": 1.342189252134909, + "grad_norm": 0.6484375, + "learning_rate": 4.916944204501796e-06, + "loss": 1.4153, + "step": 7780 + }, + { + "epoch": 1.342361770033641, + "grad_norm": 0.5703125, + "learning_rate": 4.914600373909662e-06, + "loss": 1.356, + "step": 7781 + }, + { + "epoch": 1.342534287932373, + "grad_norm": 0.62109375, + "learning_rate": 4.912256920075708e-06, + "loss": 1.4713, + "step": 7782 + }, + { + "epoch": 1.342706805831105, + "grad_norm": 0.671875, + "learning_rate": 4.909913843173552e-06, + "loss": 1.3787, + "step": 7783 + }, + { + "epoch": 1.342879323729837, + "grad_norm": 0.69921875, + "learning_rate": 4.907571143376782e-06, + "loss": 1.4151, + "step": 7784 + }, + { + "epoch": 1.343051841628569, + "grad_norm": 0.73828125, + "learning_rate": 4.905228820858959e-06, + "loss": 1.5189, + "step": 7785 + }, + { + "epoch": 1.343224359527301, + "grad_norm": 0.5625, + "learning_rate": 4.902886875793621e-06, + "loss": 1.3454, + "step": 7786 + }, + { + "epoch": 1.343396877426033, + "grad_norm": 0.5859375, + "learning_rate": 4.900545308354271e-06, + "loss": 1.4785, + "step": 7787 + }, + { + "epoch": 1.343569395324765, + "grad_norm": 0.5625, + "learning_rate": 4.898204118714387e-06, + "loss": 1.4196, + "step": 7788 + }, + { + "epoch": 1.343741913223497, + "grad_norm": 0.5703125, + "learning_rate": 4.895863307047423e-06, + "loss": 1.4437, + "step": 7789 + }, + { + "epoch": 1.3439144311222289, + "grad_norm": 0.5625, + "learning_rate": 4.89352287352679e-06, + "loss": 1.4041, + "step": 7790 + }, + { + "epoch": 1.3440869490209608, + "grad_norm": 0.58984375, + "learning_rate": 4.891182818325897e-06, + "loss": 1.4244, + "step": 7791 + }, + { + "epoch": 1.3442594669196928, + "grad_norm": 1.3359375, + "learning_rate": 4.888843141618098e-06, + "loss": 1.4494, + "step": 7792 + }, + { + "epoch": 1.344431984818425, + "grad_norm": 0.55859375, + "learning_rate": 4.886503843576736e-06, + "loss": 1.4727, + "step": 7793 + }, + { + "epoch": 1.344604502717157, + "grad_norm": 0.578125, + "learning_rate": 4.884164924375119e-06, + "loss": 1.3916, + "step": 7794 + }, + { + "epoch": 1.3447770206158889, + "grad_norm": 0.58203125, + "learning_rate": 4.881826384186529e-06, + "loss": 1.3834, + "step": 7795 + }, + { + "epoch": 1.3449495385146208, + "grad_norm": 0.59765625, + "learning_rate": 4.879488223184221e-06, + "loss": 1.3876, + "step": 7796 + }, + { + "epoch": 1.3451220564133528, + "grad_norm": 0.58203125, + "learning_rate": 4.877150441541424e-06, + "loss": 1.3679, + "step": 7797 + }, + { + "epoch": 1.345294574312085, + "grad_norm": 0.5546875, + "learning_rate": 4.8748130394313234e-06, + "loss": 1.4242, + "step": 7798 + }, + { + "epoch": 1.345467092210817, + "grad_norm": 0.5625, + "learning_rate": 4.8724760170271e-06, + "loss": 1.4362, + "step": 7799 + }, + { + "epoch": 1.345639610109549, + "grad_norm": 0.546875, + "learning_rate": 4.870139374501895e-06, + "loss": 1.3885, + "step": 7800 + }, + { + "epoch": 1.345639610109549, + "eval_loss": 1.407551646232605, + "eval_runtime": 10.8526, + "eval_samples_per_second": 94.355, + "eval_steps_per_second": 23.589, + "step": 7800 + }, + { + "epoch": 1.3458121280082809, + "grad_norm": 0.578125, + "learning_rate": 4.8678031120288115e-06, + "loss": 1.452, + "step": 7801 + }, + { + "epoch": 1.3459846459070128, + "grad_norm": 0.62109375, + "learning_rate": 4.865467229780948e-06, + "loss": 1.4555, + "step": 7802 + }, + { + "epoch": 1.3461571638057448, + "grad_norm": 0.61328125, + "learning_rate": 4.863131727931347e-06, + "loss": 1.6427, + "step": 7803 + }, + { + "epoch": 1.3463296817044768, + "grad_norm": 0.57421875, + "learning_rate": 4.860796606653051e-06, + "loss": 1.3934, + "step": 7804 + }, + { + "epoch": 1.3465021996032087, + "grad_norm": 0.59375, + "learning_rate": 4.858461866119051e-06, + "loss": 1.5244, + "step": 7805 + }, + { + "epoch": 1.346674717501941, + "grad_norm": 0.68359375, + "learning_rate": 4.856127506502321e-06, + "loss": 1.4147, + "step": 7806 + }, + { + "epoch": 1.3468472354006729, + "grad_norm": 0.60546875, + "learning_rate": 4.853793527975806e-06, + "loss": 1.5024, + "step": 7807 + }, + { + "epoch": 1.3470197532994048, + "grad_norm": 0.609375, + "learning_rate": 4.85145993071242e-06, + "loss": 1.4646, + "step": 7808 + }, + { + "epoch": 1.3471922711981368, + "grad_norm": 0.62109375, + "learning_rate": 4.849126714885053e-06, + "loss": 1.4857, + "step": 7809 + }, + { + "epoch": 1.3473647890968687, + "grad_norm": 0.5859375, + "learning_rate": 4.846793880666567e-06, + "loss": 1.3657, + "step": 7810 + }, + { + "epoch": 1.347537306995601, + "grad_norm": 0.5703125, + "learning_rate": 4.844461428229782e-06, + "loss": 1.3712, + "step": 7811 + }, + { + "epoch": 1.3477098248943329, + "grad_norm": 0.55859375, + "learning_rate": 4.8421293577475145e-06, + "loss": 1.385, + "step": 7812 + }, + { + "epoch": 1.3478823427930648, + "grad_norm": 0.609375, + "learning_rate": 4.839797669392528e-06, + "loss": 1.4094, + "step": 7813 + }, + { + "epoch": 1.3480548606917968, + "grad_norm": 0.609375, + "learning_rate": 4.837466363337573e-06, + "loss": 1.4854, + "step": 7814 + }, + { + "epoch": 1.3482273785905288, + "grad_norm": 0.5546875, + "learning_rate": 4.835135439755367e-06, + "loss": 1.3394, + "step": 7815 + }, + { + "epoch": 1.3483998964892607, + "grad_norm": 0.6484375, + "learning_rate": 4.832804898818599e-06, + "loss": 1.4, + "step": 7816 + }, + { + "epoch": 1.3485724143879927, + "grad_norm": 0.5390625, + "learning_rate": 4.8304747406999304e-06, + "loss": 1.2918, + "step": 7817 + }, + { + "epoch": 1.3487449322867247, + "grad_norm": 0.5703125, + "learning_rate": 4.828144965571994e-06, + "loss": 1.2981, + "step": 7818 + }, + { + "epoch": 1.3489174501854566, + "grad_norm": 0.58984375, + "learning_rate": 4.825815573607393e-06, + "loss": 1.4247, + "step": 7819 + }, + { + "epoch": 1.3490899680841888, + "grad_norm": 0.62890625, + "learning_rate": 4.823486564978705e-06, + "loss": 1.5022, + "step": 7820 + }, + { + "epoch": 1.3492624859829208, + "grad_norm": 0.5703125, + "learning_rate": 4.821157939858479e-06, + "loss": 1.3706, + "step": 7821 + }, + { + "epoch": 1.3494350038816527, + "grad_norm": 0.68359375, + "learning_rate": 4.818829698419225e-06, + "loss": 1.4452, + "step": 7822 + }, + { + "epoch": 1.3496075217803847, + "grad_norm": 0.55859375, + "learning_rate": 4.816501840833448e-06, + "loss": 1.4214, + "step": 7823 + }, + { + "epoch": 1.3497800396791166, + "grad_norm": 0.5859375, + "learning_rate": 4.814174367273599e-06, + "loss": 1.4343, + "step": 7824 + }, + { + "epoch": 1.3499525575778488, + "grad_norm": 0.55078125, + "learning_rate": 4.811847277912115e-06, + "loss": 1.4916, + "step": 7825 + }, + { + "epoch": 1.3501250754765808, + "grad_norm": 0.61328125, + "learning_rate": 4.8095205729214015e-06, + "loss": 1.5595, + "step": 7826 + }, + { + "epoch": 1.3502975933753127, + "grad_norm": 0.578125, + "learning_rate": 4.8071942524738355e-06, + "loss": 1.4548, + "step": 7827 + }, + { + "epoch": 1.3504701112740447, + "grad_norm": 0.6484375, + "learning_rate": 4.8048683167417664e-06, + "loss": 1.3287, + "step": 7828 + }, + { + "epoch": 1.3506426291727767, + "grad_norm": 0.625, + "learning_rate": 4.802542765897516e-06, + "loss": 1.4488, + "step": 7829 + }, + { + "epoch": 1.3508151470715086, + "grad_norm": 0.57421875, + "learning_rate": 4.800217600113366e-06, + "loss": 1.4453, + "step": 7830 + }, + { + "epoch": 1.3509876649702406, + "grad_norm": 0.625, + "learning_rate": 4.797892819561589e-06, + "loss": 1.4369, + "step": 7831 + }, + { + "epoch": 1.3511601828689725, + "grad_norm": 0.5546875, + "learning_rate": 4.795568424414421e-06, + "loss": 1.3552, + "step": 7832 + }, + { + "epoch": 1.3513327007677047, + "grad_norm": 0.5703125, + "learning_rate": 4.793244414844054e-06, + "loss": 1.3656, + "step": 7833 + }, + { + "epoch": 1.3515052186664367, + "grad_norm": 0.578125, + "learning_rate": 4.790920791022682e-06, + "loss": 1.4695, + "step": 7834 + }, + { + "epoch": 1.3516777365651687, + "grad_norm": 0.63671875, + "learning_rate": 4.788597553122438e-06, + "loss": 1.405, + "step": 7835 + }, + { + "epoch": 1.3518502544639006, + "grad_norm": 0.60546875, + "learning_rate": 4.786274701315458e-06, + "loss": 1.3898, + "step": 7836 + }, + { + "epoch": 1.3520227723626326, + "grad_norm": 0.5859375, + "learning_rate": 4.7839522357738196e-06, + "loss": 1.5197, + "step": 7837 + }, + { + "epoch": 1.3521952902613645, + "grad_norm": 1.46875, + "learning_rate": 4.781630156669592e-06, + "loss": 1.4992, + "step": 7838 + }, + { + "epoch": 1.3523678081600967, + "grad_norm": 0.59375, + "learning_rate": 4.7793084641748085e-06, + "loss": 1.5419, + "step": 7839 + }, + { + "epoch": 1.3525403260588287, + "grad_norm": 0.60546875, + "learning_rate": 4.776987158461475e-06, + "loss": 1.365, + "step": 7840 + }, + { + "epoch": 1.3527128439575606, + "grad_norm": 0.62890625, + "learning_rate": 4.774666239701566e-06, + "loss": 1.4914, + "step": 7841 + }, + { + "epoch": 1.3528853618562926, + "grad_norm": 0.61328125, + "learning_rate": 4.772345708067035e-06, + "loss": 1.4306, + "step": 7842 + }, + { + "epoch": 1.3530578797550246, + "grad_norm": 0.59375, + "learning_rate": 4.770025563729792e-06, + "loss": 1.4419, + "step": 7843 + }, + { + "epoch": 1.3532303976537565, + "grad_norm": 0.6015625, + "learning_rate": 4.767705806861741e-06, + "loss": 1.4563, + "step": 7844 + }, + { + "epoch": 1.3534029155524885, + "grad_norm": 0.65234375, + "learning_rate": 4.765386437634732e-06, + "loss": 1.3103, + "step": 7845 + }, + { + "epoch": 1.3535754334512204, + "grad_norm": 0.5859375, + "learning_rate": 4.763067456220604e-06, + "loss": 1.4048, + "step": 7846 + }, + { + "epoch": 1.3537479513499526, + "grad_norm": 0.5625, + "learning_rate": 4.760748862791159e-06, + "loss": 1.3718, + "step": 7847 + }, + { + "epoch": 1.3539204692486846, + "grad_norm": 0.62890625, + "learning_rate": 4.758430657518176e-06, + "loss": 1.4345, + "step": 7848 + }, + { + "epoch": 1.3540929871474165, + "grad_norm": 0.5390625, + "learning_rate": 4.7561128405734e-06, + "loss": 1.3932, + "step": 7849 + }, + { + "epoch": 1.3542655050461485, + "grad_norm": 0.9375, + "learning_rate": 4.753795412128552e-06, + "loss": 1.4566, + "step": 7850 + }, + { + "epoch": 1.3544380229448805, + "grad_norm": 0.578125, + "learning_rate": 4.751478372355317e-06, + "loss": 1.3935, + "step": 7851 + }, + { + "epoch": 1.3546105408436127, + "grad_norm": 0.6015625, + "learning_rate": 4.749161721425359e-06, + "loss": 1.4629, + "step": 7852 + }, + { + "epoch": 1.3547830587423446, + "grad_norm": 0.65625, + "learning_rate": 4.746845459510314e-06, + "loss": 1.4346, + "step": 7853 + }, + { + "epoch": 1.3549555766410766, + "grad_norm": 0.6328125, + "learning_rate": 4.744529586781773e-06, + "loss": 1.3455, + "step": 7854 + }, + { + "epoch": 1.3551280945398085, + "grad_norm": 0.66796875, + "learning_rate": 4.742214103411325e-06, + "loss": 1.5296, + "step": 7855 + }, + { + "epoch": 1.3553006124385405, + "grad_norm": 0.57421875, + "learning_rate": 4.739899009570506e-06, + "loss": 1.4611, + "step": 7856 + }, + { + "epoch": 1.3554731303372725, + "grad_norm": 0.6328125, + "learning_rate": 4.7375843054308335e-06, + "loss": 1.4042, + "step": 7857 + }, + { + "epoch": 1.3556456482360044, + "grad_norm": 0.58203125, + "learning_rate": 4.735269991163798e-06, + "loss": 1.4087, + "step": 7858 + }, + { + "epoch": 1.3558181661347364, + "grad_norm": 0.671875, + "learning_rate": 4.732956066940856e-06, + "loss": 1.3889, + "step": 7859 + }, + { + "epoch": 1.3559906840334683, + "grad_norm": 0.5859375, + "learning_rate": 4.7306425329334386e-06, + "loss": 1.4786, + "step": 7860 + }, + { + "epoch": 1.3561632019322005, + "grad_norm": 0.98046875, + "learning_rate": 4.7283293893129515e-06, + "loss": 1.4151, + "step": 7861 + }, + { + "epoch": 1.3563357198309325, + "grad_norm": 0.56640625, + "learning_rate": 4.726016636250753e-06, + "loss": 1.4943, + "step": 7862 + }, + { + "epoch": 1.3565082377296644, + "grad_norm": 0.67578125, + "learning_rate": 4.7237042739182006e-06, + "loss": 1.4195, + "step": 7863 + }, + { + "epoch": 1.3566807556283964, + "grad_norm": 0.58203125, + "learning_rate": 4.721392302486602e-06, + "loss": 1.3886, + "step": 7864 + }, + { + "epoch": 1.3568532735271284, + "grad_norm": 0.671875, + "learning_rate": 4.719080722127246e-06, + "loss": 1.5363, + "step": 7865 + }, + { + "epoch": 1.3570257914258606, + "grad_norm": 0.60546875, + "learning_rate": 4.716769533011389e-06, + "loss": 1.4005, + "step": 7866 + }, + { + "epoch": 1.3571983093245925, + "grad_norm": 0.59765625, + "learning_rate": 4.714458735310249e-06, + "loss": 1.4047, + "step": 7867 + }, + { + "epoch": 1.3573708272233245, + "grad_norm": 0.6015625, + "learning_rate": 4.71214832919504e-06, + "loss": 1.3229, + "step": 7868 + }, + { + "epoch": 1.3575433451220564, + "grad_norm": 0.61328125, + "learning_rate": 4.709838314836918e-06, + "loss": 1.4652, + "step": 7869 + }, + { + "epoch": 1.3577158630207884, + "grad_norm": 0.56640625, + "learning_rate": 4.707528692407027e-06, + "loss": 1.4012, + "step": 7870 + }, + { + "epoch": 1.3578883809195204, + "grad_norm": 0.640625, + "learning_rate": 4.705219462076481e-06, + "loss": 1.4204, + "step": 7871 + }, + { + "epoch": 1.3580608988182523, + "grad_norm": 0.5859375, + "learning_rate": 4.70291062401636e-06, + "loss": 1.4563, + "step": 7872 + }, + { + "epoch": 1.3582334167169843, + "grad_norm": 0.5703125, + "learning_rate": 4.700602178397719e-06, + "loss": 1.468, + "step": 7873 + }, + { + "epoch": 1.3584059346157165, + "grad_norm": 1.390625, + "learning_rate": 4.698294125391583e-06, + "loss": 1.3792, + "step": 7874 + }, + { + "epoch": 1.3585784525144484, + "grad_norm": 0.58203125, + "learning_rate": 4.695986465168937e-06, + "loss": 1.4291, + "step": 7875 + }, + { + "epoch": 1.3587509704131804, + "grad_norm": 0.59375, + "learning_rate": 4.693679197900763e-06, + "loss": 1.3912, + "step": 7876 + }, + { + "epoch": 1.3589234883119123, + "grad_norm": 0.5859375, + "learning_rate": 4.691372323757985e-06, + "loss": 1.4333, + "step": 7877 + }, + { + "epoch": 1.3590960062106443, + "grad_norm": 0.5625, + "learning_rate": 4.689065842911517e-06, + "loss": 1.3318, + "step": 7878 + }, + { + "epoch": 1.3592685241093765, + "grad_norm": 0.59375, + "learning_rate": 4.686759755532234e-06, + "loss": 1.4298, + "step": 7879 + }, + { + "epoch": 1.3594410420081084, + "grad_norm": 0.6015625, + "learning_rate": 4.684454061790987e-06, + "loss": 1.5446, + "step": 7880 + }, + { + "epoch": 1.3596135599068404, + "grad_norm": 0.65234375, + "learning_rate": 4.6821487618585956e-06, + "loss": 1.4827, + "step": 7881 + }, + { + "epoch": 1.3597860778055724, + "grad_norm": 0.5703125, + "learning_rate": 4.679843855905853e-06, + "loss": 1.3706, + "step": 7882 + }, + { + "epoch": 1.3599585957043043, + "grad_norm": 0.61328125, + "learning_rate": 4.6775393441035185e-06, + "loss": 1.4048, + "step": 7883 + }, + { + "epoch": 1.3601311136030363, + "grad_norm": 0.59375, + "learning_rate": 4.6752352266223255e-06, + "loss": 1.3972, + "step": 7884 + }, + { + "epoch": 1.3603036315017683, + "grad_norm": 0.60546875, + "learning_rate": 4.672931503632981e-06, + "loss": 1.4176, + "step": 7885 + }, + { + "epoch": 1.3604761494005002, + "grad_norm": 0.6015625, + "learning_rate": 4.67062817530615e-06, + "loss": 1.4636, + "step": 7886 + }, + { + "epoch": 1.3606486672992322, + "grad_norm": 0.703125, + "learning_rate": 4.6683252418124895e-06, + "loss": 1.3784, + "step": 7887 + }, + { + "epoch": 1.3608211851979644, + "grad_norm": 0.58203125, + "learning_rate": 4.666022703322605e-06, + "loss": 1.4577, + "step": 7888 + }, + { + "epoch": 1.3609937030966963, + "grad_norm": 0.58984375, + "learning_rate": 4.663720560007087e-06, + "loss": 1.4598, + "step": 7889 + }, + { + "epoch": 1.3611662209954283, + "grad_norm": 0.58984375, + "learning_rate": 4.661418812036492e-06, + "loss": 1.3935, + "step": 7890 + }, + { + "epoch": 1.3613387388941602, + "grad_norm": 0.6328125, + "learning_rate": 4.659117459581351e-06, + "loss": 1.3132, + "step": 7891 + }, + { + "epoch": 1.3615112567928922, + "grad_norm": 0.58984375, + "learning_rate": 4.656816502812157e-06, + "loss": 1.498, + "step": 7892 + }, + { + "epoch": 1.3616837746916244, + "grad_norm": 0.55859375, + "learning_rate": 4.6545159418993866e-06, + "loss": 1.4276, + "step": 7893 + }, + { + "epoch": 1.3618562925903563, + "grad_norm": 0.5546875, + "learning_rate": 4.652215777013469e-06, + "loss": 1.3338, + "step": 7894 + }, + { + "epoch": 1.3620288104890883, + "grad_norm": 0.6015625, + "learning_rate": 4.649916008324824e-06, + "loss": 1.4057, + "step": 7895 + }, + { + "epoch": 1.3622013283878203, + "grad_norm": 0.578125, + "learning_rate": 4.64761663600383e-06, + "loss": 1.4477, + "step": 7896 + }, + { + "epoch": 1.3623738462865522, + "grad_norm": 0.640625, + "learning_rate": 4.645317660220838e-06, + "loss": 1.4506, + "step": 7897 + }, + { + "epoch": 1.3625463641852842, + "grad_norm": 0.57421875, + "learning_rate": 4.643019081146177e-06, + "loss": 1.4392, + "step": 7898 + }, + { + "epoch": 1.3627188820840161, + "grad_norm": 0.60546875, + "learning_rate": 4.640720898950126e-06, + "loss": 1.2814, + "step": 7899 + }, + { + "epoch": 1.362891399982748, + "grad_norm": 0.60546875, + "learning_rate": 4.638423113802964e-06, + "loss": 1.4289, + "step": 7900 + }, + { + "epoch": 1.362891399982748, + "eval_loss": 1.4074419736862183, + "eval_runtime": 10.7675, + "eval_samples_per_second": 95.101, + "eval_steps_per_second": 23.775, + "step": 7900 + }, + { + "epoch": 1.36306391788148, + "grad_norm": 0.58984375, + "learning_rate": 4.636125725874916e-06, + "loss": 1.4032, + "step": 7901 + }, + { + "epoch": 1.3632364357802123, + "grad_norm": 0.5859375, + "learning_rate": 4.6338287353361875e-06, + "loss": 1.4314, + "step": 7902 + }, + { + "epoch": 1.3634089536789442, + "grad_norm": 0.57421875, + "learning_rate": 4.631532142356957e-06, + "loss": 1.409, + "step": 7903 + }, + { + "epoch": 1.3635814715776762, + "grad_norm": 0.5703125, + "learning_rate": 4.629235947107369e-06, + "loss": 1.4498, + "step": 7904 + }, + { + "epoch": 1.3637539894764081, + "grad_norm": 0.57421875, + "learning_rate": 4.626940149757541e-06, + "loss": 1.3947, + "step": 7905 + }, + { + "epoch": 1.36392650737514, + "grad_norm": 0.625, + "learning_rate": 4.62464475047756e-06, + "loss": 1.4714, + "step": 7906 + }, + { + "epoch": 1.3640990252738723, + "grad_norm": 0.65625, + "learning_rate": 4.622349749437478e-06, + "loss": 1.3314, + "step": 7907 + }, + { + "epoch": 1.3642715431726042, + "grad_norm": 0.58984375, + "learning_rate": 4.620055146807334e-06, + "loss": 1.3417, + "step": 7908 + }, + { + "epoch": 1.3644440610713362, + "grad_norm": 0.578125, + "learning_rate": 4.617760942757117e-06, + "loss": 1.5331, + "step": 7909 + }, + { + "epoch": 1.3646165789700682, + "grad_norm": 0.6171875, + "learning_rate": 4.615467137456798e-06, + "loss": 1.3129, + "step": 7910 + }, + { + "epoch": 1.3647890968688001, + "grad_norm": 0.65234375, + "learning_rate": 4.613173731076319e-06, + "loss": 1.5621, + "step": 7911 + }, + { + "epoch": 1.364961614767532, + "grad_norm": 2.0625, + "learning_rate": 4.610880723785588e-06, + "loss": 1.4238, + "step": 7912 + }, + { + "epoch": 1.365134132666264, + "grad_norm": 0.58203125, + "learning_rate": 4.608588115754486e-06, + "loss": 1.4943, + "step": 7913 + }, + { + "epoch": 1.365306650564996, + "grad_norm": 0.625, + "learning_rate": 4.606295907152862e-06, + "loss": 1.4239, + "step": 7914 + }, + { + "epoch": 1.3654791684637282, + "grad_norm": 0.60546875, + "learning_rate": 4.6040040981505395e-06, + "loss": 1.4424, + "step": 7915 + }, + { + "epoch": 1.3656516863624601, + "grad_norm": 0.55859375, + "learning_rate": 4.601712688917309e-06, + "loss": 1.3706, + "step": 7916 + }, + { + "epoch": 1.365824204261192, + "grad_norm": 0.63671875, + "learning_rate": 4.599421679622936e-06, + "loss": 1.4369, + "step": 7917 + }, + { + "epoch": 1.365996722159924, + "grad_norm": 0.6328125, + "learning_rate": 4.597131070437143e-06, + "loss": 1.3608, + "step": 7918 + }, + { + "epoch": 1.366169240058656, + "grad_norm": 1.0, + "learning_rate": 4.594840861529646e-06, + "loss": 1.4133, + "step": 7919 + }, + { + "epoch": 1.3663417579573882, + "grad_norm": 0.546875, + "learning_rate": 4.5925510530701065e-06, + "loss": 1.4296, + "step": 7920 + }, + { + "epoch": 1.3665142758561202, + "grad_norm": 0.63671875, + "learning_rate": 4.590261645228173e-06, + "loss": 1.421, + "step": 7921 + }, + { + "epoch": 1.3666867937548521, + "grad_norm": 0.59765625, + "learning_rate": 4.587972638173459e-06, + "loss": 1.4531, + "step": 7922 + }, + { + "epoch": 1.366859311653584, + "grad_norm": 0.60546875, + "learning_rate": 4.5856840320755465e-06, + "loss": 1.3976, + "step": 7923 + }, + { + "epoch": 1.367031829552316, + "grad_norm": 0.5859375, + "learning_rate": 4.583395827103992e-06, + "loss": 1.4801, + "step": 7924 + }, + { + "epoch": 1.367204347451048, + "grad_norm": 0.5703125, + "learning_rate": 4.581108023428319e-06, + "loss": 1.4647, + "step": 7925 + }, + { + "epoch": 1.36737686534978, + "grad_norm": 0.58203125, + "learning_rate": 4.578820621218023e-06, + "loss": 1.4242, + "step": 7926 + }, + { + "epoch": 1.367549383248512, + "grad_norm": 0.578125, + "learning_rate": 4.576533620642568e-06, + "loss": 1.4149, + "step": 7927 + }, + { + "epoch": 1.367721901147244, + "grad_norm": 0.5625, + "learning_rate": 4.57424702187139e-06, + "loss": 1.3514, + "step": 7928 + }, + { + "epoch": 1.367894419045976, + "grad_norm": 0.61328125, + "learning_rate": 4.5719608250738936e-06, + "loss": 1.4513, + "step": 7929 + }, + { + "epoch": 1.368066936944708, + "grad_norm": 0.58984375, + "learning_rate": 4.569675030419459e-06, + "loss": 1.4672, + "step": 7930 + }, + { + "epoch": 1.36823945484344, + "grad_norm": 0.58984375, + "learning_rate": 4.567389638077421e-06, + "loss": 1.3883, + "step": 7931 + }, + { + "epoch": 1.368411972742172, + "grad_norm": 0.66015625, + "learning_rate": 4.565104648217111e-06, + "loss": 1.4488, + "step": 7932 + }, + { + "epoch": 1.368584490640904, + "grad_norm": 0.59765625, + "learning_rate": 4.562820061007803e-06, + "loss": 1.3655, + "step": 7933 + }, + { + "epoch": 1.368757008539636, + "grad_norm": 0.59375, + "learning_rate": 4.560535876618759e-06, + "loss": 1.3948, + "step": 7934 + }, + { + "epoch": 1.368929526438368, + "grad_norm": 0.62890625, + "learning_rate": 4.558252095219204e-06, + "loss": 1.5201, + "step": 7935 + }, + { + "epoch": 1.3691020443371, + "grad_norm": 0.55859375, + "learning_rate": 4.5559687169783354e-06, + "loss": 1.4503, + "step": 7936 + }, + { + "epoch": 1.369274562235832, + "grad_norm": 0.58984375, + "learning_rate": 4.55368574206532e-06, + "loss": 1.388, + "step": 7937 + }, + { + "epoch": 1.369447080134564, + "grad_norm": 0.59765625, + "learning_rate": 4.551403170649299e-06, + "loss": 1.4712, + "step": 7938 + }, + { + "epoch": 1.369619598033296, + "grad_norm": 0.5703125, + "learning_rate": 4.5491210028993685e-06, + "loss": 1.3087, + "step": 7939 + }, + { + "epoch": 1.3697921159320279, + "grad_norm": 0.65625, + "learning_rate": 4.5468392389846195e-06, + "loss": 1.488, + "step": 7940 + }, + { + "epoch": 1.3699646338307598, + "grad_norm": 0.59765625, + "learning_rate": 4.544557879074088e-06, + "loss": 1.4399, + "step": 7941 + }, + { + "epoch": 1.3701371517294918, + "grad_norm": 0.65234375, + "learning_rate": 4.542276923336798e-06, + "loss": 1.4177, + "step": 7942 + }, + { + "epoch": 1.370309669628224, + "grad_norm": 0.54296875, + "learning_rate": 4.539996371941734e-06, + "loss": 1.3468, + "step": 7943 + }, + { + "epoch": 1.370482187526956, + "grad_norm": 0.890625, + "learning_rate": 4.5377162250578545e-06, + "loss": 1.3186, + "step": 7944 + }, + { + "epoch": 1.370654705425688, + "grad_norm": 0.56640625, + "learning_rate": 4.535436482854087e-06, + "loss": 1.4187, + "step": 7945 + }, + { + "epoch": 1.3708272233244199, + "grad_norm": 0.59375, + "learning_rate": 4.533157145499328e-06, + "loss": 1.3719, + "step": 7946 + }, + { + "epoch": 1.3709997412231518, + "grad_norm": 0.60546875, + "learning_rate": 4.530878213162447e-06, + "loss": 1.4092, + "step": 7947 + }, + { + "epoch": 1.371172259121884, + "grad_norm": 0.59375, + "learning_rate": 4.528599686012281e-06, + "loss": 1.4266, + "step": 7948 + }, + { + "epoch": 1.371344777020616, + "grad_norm": 0.59765625, + "learning_rate": 4.526321564217641e-06, + "loss": 1.4805, + "step": 7949 + }, + { + "epoch": 1.371517294919348, + "grad_norm": 0.58203125, + "learning_rate": 4.5240438479472926e-06, + "loss": 1.4013, + "step": 7950 + }, + { + "epoch": 1.37168981281808, + "grad_norm": 0.57421875, + "learning_rate": 4.521766537369998e-06, + "loss": 1.4912, + "step": 7951 + }, + { + "epoch": 1.3718623307168119, + "grad_norm": 0.54296875, + "learning_rate": 4.519489632654461e-06, + "loss": 1.3681, + "step": 7952 + }, + { + "epoch": 1.3720348486155438, + "grad_norm": 0.59375, + "learning_rate": 4.517213133969385e-06, + "loss": 1.516, + "step": 7953 + }, + { + "epoch": 1.3722073665142758, + "grad_norm": 0.609375, + "learning_rate": 4.5149370414834125e-06, + "loss": 1.4078, + "step": 7954 + }, + { + "epoch": 1.3723798844130077, + "grad_norm": 0.57421875, + "learning_rate": 4.512661355365177e-06, + "loss": 1.3906, + "step": 7955 + }, + { + "epoch": 1.37255240231174, + "grad_norm": 0.60546875, + "learning_rate": 4.510386075783274e-06, + "loss": 1.4647, + "step": 7956 + }, + { + "epoch": 1.3727249202104719, + "grad_norm": 0.54296875, + "learning_rate": 4.508111202906271e-06, + "loss": 1.4007, + "step": 7957 + }, + { + "epoch": 1.3728974381092038, + "grad_norm": 0.55859375, + "learning_rate": 4.5058367369027054e-06, + "loss": 1.4573, + "step": 7958 + }, + { + "epoch": 1.3730699560079358, + "grad_norm": 0.58984375, + "learning_rate": 4.503562677941088e-06, + "loss": 1.4147, + "step": 7959 + }, + { + "epoch": 1.3732424739066678, + "grad_norm": 0.61328125, + "learning_rate": 4.501289026189882e-06, + "loss": 1.3457, + "step": 7960 + }, + { + "epoch": 1.3734149918054, + "grad_norm": 0.58984375, + "learning_rate": 4.499015781817547e-06, + "loss": 1.4553, + "step": 7961 + }, + { + "epoch": 1.373587509704132, + "grad_norm": 0.57421875, + "learning_rate": 4.496742944992499e-06, + "loss": 1.3613, + "step": 7962 + }, + { + "epoch": 1.3737600276028639, + "grad_norm": 0.62890625, + "learning_rate": 4.494470515883111e-06, + "loss": 1.4739, + "step": 7963 + }, + { + "epoch": 1.3739325455015958, + "grad_norm": 0.57421875, + "learning_rate": 4.492198494657755e-06, + "loss": 1.428, + "step": 7964 + }, + { + "epoch": 1.3741050634003278, + "grad_norm": 0.6015625, + "learning_rate": 4.4899268814847455e-06, + "loss": 1.4915, + "step": 7965 + }, + { + "epoch": 1.3742775812990597, + "grad_norm": 0.5703125, + "learning_rate": 4.4876556765323805e-06, + "loss": 1.4217, + "step": 7966 + }, + { + "epoch": 1.3744500991977917, + "grad_norm": 0.61328125, + "learning_rate": 4.485384879968926e-06, + "loss": 1.4559, + "step": 7967 + }, + { + "epoch": 1.3746226170965237, + "grad_norm": 0.578125, + "learning_rate": 4.483114491962617e-06, + "loss": 1.4177, + "step": 7968 + }, + { + "epoch": 1.3747951349952556, + "grad_norm": 0.58203125, + "learning_rate": 4.480844512681657e-06, + "loss": 1.4573, + "step": 7969 + }, + { + "epoch": 1.3749676528939878, + "grad_norm": 0.62109375, + "learning_rate": 4.478574942294225e-06, + "loss": 1.4633, + "step": 7970 + }, + { + "epoch": 1.3751401707927198, + "grad_norm": 0.55078125, + "learning_rate": 4.476305780968452e-06, + "loss": 1.3839, + "step": 7971 + }, + { + "epoch": 1.3753126886914517, + "grad_norm": 0.60546875, + "learning_rate": 4.474037028872468e-06, + "loss": 1.4457, + "step": 7972 + }, + { + "epoch": 1.3754852065901837, + "grad_norm": 0.58984375, + "learning_rate": 4.471768686174347e-06, + "loss": 1.3865, + "step": 7973 + }, + { + "epoch": 1.3756577244889157, + "grad_norm": 1.2578125, + "learning_rate": 4.469500753042142e-06, + "loss": 1.3655, + "step": 7974 + }, + { + "epoch": 1.3758302423876478, + "grad_norm": 0.5859375, + "learning_rate": 4.467233229643878e-06, + "loss": 1.4127, + "step": 7975 + }, + { + "epoch": 1.3760027602863798, + "grad_norm": 0.5703125, + "learning_rate": 4.464966116147546e-06, + "loss": 1.4156, + "step": 7976 + }, + { + "epoch": 1.3761752781851118, + "grad_norm": 0.6015625, + "learning_rate": 4.46269941272111e-06, + "loss": 1.4055, + "step": 7977 + }, + { + "epoch": 1.3763477960838437, + "grad_norm": 0.58203125, + "learning_rate": 4.460433119532499e-06, + "loss": 1.4684, + "step": 7978 + }, + { + "epoch": 1.3765203139825757, + "grad_norm": 0.5625, + "learning_rate": 4.458167236749616e-06, + "loss": 1.415, + "step": 7979 + }, + { + "epoch": 1.3766928318813076, + "grad_norm": 0.5703125, + "learning_rate": 4.45590176454033e-06, + "loss": 1.5098, + "step": 7980 + }, + { + "epoch": 1.3768653497800396, + "grad_norm": 0.61328125, + "learning_rate": 4.453636703072487e-06, + "loss": 1.5133, + "step": 7981 + }, + { + "epoch": 1.3770378676787716, + "grad_norm": 0.59375, + "learning_rate": 4.451372052513884e-06, + "loss": 1.394, + "step": 7982 + }, + { + "epoch": 1.3772103855775037, + "grad_norm": 0.7421875, + "learning_rate": 4.4491078130323174e-06, + "loss": 1.394, + "step": 7983 + }, + { + "epoch": 1.3773829034762357, + "grad_norm": 0.625, + "learning_rate": 4.446843984795519e-06, + "loss": 1.4961, + "step": 7984 + }, + { + "epoch": 1.3775554213749677, + "grad_norm": 0.58203125, + "learning_rate": 4.444580567971224e-06, + "loss": 1.5189, + "step": 7985 + }, + { + "epoch": 1.3777279392736996, + "grad_norm": 0.58984375, + "learning_rate": 4.442317562727109e-06, + "loss": 1.4859, + "step": 7986 + }, + { + "epoch": 1.3779004571724316, + "grad_norm": 0.62890625, + "learning_rate": 4.440054969230834e-06, + "loss": 1.4891, + "step": 7987 + }, + { + "epoch": 1.3780729750711636, + "grad_norm": 0.59375, + "learning_rate": 4.437792787650028e-06, + "loss": 1.3835, + "step": 7988 + }, + { + "epoch": 1.3782454929698957, + "grad_norm": 0.6171875, + "learning_rate": 4.435531018152286e-06, + "loss": 1.3922, + "step": 7989 + }, + { + "epoch": 1.3784180108686277, + "grad_norm": 0.54296875, + "learning_rate": 4.433269660905175e-06, + "loss": 1.4601, + "step": 7990 + }, + { + "epoch": 1.3785905287673597, + "grad_norm": 0.6015625, + "learning_rate": 4.431008716076232e-06, + "loss": 1.4917, + "step": 7991 + }, + { + "epoch": 1.3787630466660916, + "grad_norm": 0.56640625, + "learning_rate": 4.428748183832955e-06, + "loss": 1.5076, + "step": 7992 + }, + { + "epoch": 1.3789355645648236, + "grad_norm": 0.640625, + "learning_rate": 4.426488064342826e-06, + "loss": 1.498, + "step": 7993 + }, + { + "epoch": 1.3791080824635555, + "grad_norm": 0.5703125, + "learning_rate": 4.4242283577732905e-06, + "loss": 1.3959, + "step": 7994 + }, + { + "epoch": 1.3792806003622875, + "grad_norm": 0.62890625, + "learning_rate": 4.421969064291749e-06, + "loss": 1.4679, + "step": 7995 + }, + { + "epoch": 1.3794531182610195, + "grad_norm": 0.58203125, + "learning_rate": 4.4197101840656e-06, + "loss": 1.4727, + "step": 7996 + }, + { + "epoch": 1.3796256361597516, + "grad_norm": 0.55859375, + "learning_rate": 4.417451717262184e-06, + "loss": 1.467, + "step": 7997 + }, + { + "epoch": 1.3797981540584836, + "grad_norm": 0.61328125, + "learning_rate": 4.415193664048827e-06, + "loss": 1.4182, + "step": 7998 + }, + { + "epoch": 1.3799706719572156, + "grad_norm": 0.6328125, + "learning_rate": 4.412936024592818e-06, + "loss": 1.5055, + "step": 7999 + }, + { + "epoch": 1.3801431898559475, + "grad_norm": 0.5859375, + "learning_rate": 4.410678799061417e-06, + "loss": 1.4059, + "step": 8000 + }, + { + "epoch": 1.3801431898559475, + "eval_loss": 1.407374382019043, + "eval_runtime": 11.0415, + "eval_samples_per_second": 92.741, + "eval_steps_per_second": 23.185, + "step": 8000 + }, + { + "epoch": 1.3803157077546795, + "grad_norm": 0.59765625, + "learning_rate": 4.408421987621856e-06, + "loss": 1.4668, + "step": 8001 + }, + { + "epoch": 1.3804882256534117, + "grad_norm": 0.609375, + "learning_rate": 4.406165590441335e-06, + "loss": 1.4717, + "step": 8002 + }, + { + "epoch": 1.3806607435521436, + "grad_norm": 0.578125, + "learning_rate": 4.403909607687012e-06, + "loss": 1.4689, + "step": 8003 + }, + { + "epoch": 1.3808332614508756, + "grad_norm": 0.640625, + "learning_rate": 4.4016540395260375e-06, + "loss": 1.4595, + "step": 8004 + }, + { + "epoch": 1.3810057793496076, + "grad_norm": 0.62109375, + "learning_rate": 4.399398886125508e-06, + "loss": 1.4481, + "step": 8005 + }, + { + "epoch": 1.3811782972483395, + "grad_norm": 0.5625, + "learning_rate": 4.397144147652506e-06, + "loss": 1.4332, + "step": 8006 + }, + { + "epoch": 1.3813508151470715, + "grad_norm": 0.66796875, + "learning_rate": 4.3948898242740715e-06, + "loss": 1.3817, + "step": 8007 + }, + { + "epoch": 1.3815233330458034, + "grad_norm": 0.54296875, + "learning_rate": 4.392635916157221e-06, + "loss": 1.3656, + "step": 8008 + }, + { + "epoch": 1.3816958509445354, + "grad_norm": 0.5625, + "learning_rate": 4.390382423468938e-06, + "loss": 1.2959, + "step": 8009 + }, + { + "epoch": 1.3818683688432674, + "grad_norm": 0.59765625, + "learning_rate": 4.388129346376177e-06, + "loss": 1.4901, + "step": 8010 + }, + { + "epoch": 1.3820408867419995, + "grad_norm": 0.5546875, + "learning_rate": 4.385876685045858e-06, + "loss": 1.3806, + "step": 8011 + }, + { + "epoch": 1.3822134046407315, + "grad_norm": 0.578125, + "learning_rate": 4.3836244396448725e-06, + "loss": 1.3983, + "step": 8012 + }, + { + "epoch": 1.3823859225394635, + "grad_norm": 0.55859375, + "learning_rate": 4.3813726103400825e-06, + "loss": 1.4383, + "step": 8013 + }, + { + "epoch": 1.3825584404381954, + "grad_norm": 0.69140625, + "learning_rate": 4.379121197298315e-06, + "loss": 1.3774, + "step": 8014 + }, + { + "epoch": 1.3827309583369274, + "grad_norm": 0.6015625, + "learning_rate": 4.3768702006863735e-06, + "loss": 1.4498, + "step": 8015 + }, + { + "epoch": 1.3829034762356596, + "grad_norm": 0.64453125, + "learning_rate": 4.374619620671015e-06, + "loss": 1.3819, + "step": 8016 + }, + { + "epoch": 1.3830759941343915, + "grad_norm": 0.5859375, + "learning_rate": 4.372369457418994e-06, + "loss": 1.5088, + "step": 8017 + }, + { + "epoch": 1.3832485120331235, + "grad_norm": 0.6171875, + "learning_rate": 4.370119711097e-06, + "loss": 1.3558, + "step": 8018 + }, + { + "epoch": 1.3834210299318555, + "grad_norm": 0.6015625, + "learning_rate": 4.3678703818717165e-06, + "loss": 1.3744, + "step": 8019 + }, + { + "epoch": 1.3835935478305874, + "grad_norm": 0.6171875, + "learning_rate": 4.365621469909785e-06, + "loss": 1.4178, + "step": 8020 + }, + { + "epoch": 1.3837660657293194, + "grad_norm": 0.640625, + "learning_rate": 4.363372975377821e-06, + "loss": 1.4425, + "step": 8021 + }, + { + "epoch": 1.3839385836280513, + "grad_norm": 0.61328125, + "learning_rate": 4.361124898442406e-06, + "loss": 1.4155, + "step": 8022 + }, + { + "epoch": 1.3841111015267833, + "grad_norm": 0.60546875, + "learning_rate": 4.358877239270096e-06, + "loss": 1.5466, + "step": 8023 + }, + { + "epoch": 1.3842836194255155, + "grad_norm": 0.59765625, + "learning_rate": 4.3566299980273995e-06, + "loss": 1.4899, + "step": 8024 + }, + { + "epoch": 1.3844561373242474, + "grad_norm": 0.59375, + "learning_rate": 4.354383174880819e-06, + "loss": 1.3355, + "step": 8025 + }, + { + "epoch": 1.3846286552229794, + "grad_norm": 0.59765625, + "learning_rate": 4.3521367699968105e-06, + "loss": 1.3952, + "step": 8026 + }, + { + "epoch": 1.3848011731217114, + "grad_norm": 0.5859375, + "learning_rate": 4.349890783541793e-06, + "loss": 1.3639, + "step": 8027 + }, + { + "epoch": 1.3849736910204433, + "grad_norm": 0.88671875, + "learning_rate": 4.3476452156821765e-06, + "loss": 1.3987, + "step": 8028 + }, + { + "epoch": 1.3851462089191753, + "grad_norm": 0.61328125, + "learning_rate": 4.345400066584318e-06, + "loss": 1.4278, + "step": 8029 + }, + { + "epoch": 1.3853187268179075, + "grad_norm": 0.609375, + "learning_rate": 4.343155336414553e-06, + "loss": 1.4388, + "step": 8030 + }, + { + "epoch": 1.3854912447166394, + "grad_norm": 0.62890625, + "learning_rate": 4.340911025339186e-06, + "loss": 1.4773, + "step": 8031 + }, + { + "epoch": 1.3856637626153714, + "grad_norm": 0.6328125, + "learning_rate": 4.33866713352449e-06, + "loss": 1.4371, + "step": 8032 + }, + { + "epoch": 1.3858362805141033, + "grad_norm": 0.578125, + "learning_rate": 4.336423661136708e-06, + "loss": 1.4436, + "step": 8033 + }, + { + "epoch": 1.3860087984128353, + "grad_norm": 0.6015625, + "learning_rate": 4.3341806083420504e-06, + "loss": 1.4122, + "step": 8034 + }, + { + "epoch": 1.3861813163115673, + "grad_norm": 0.55859375, + "learning_rate": 4.331937975306687e-06, + "loss": 1.3701, + "step": 8035 + }, + { + "epoch": 1.3863538342102992, + "grad_norm": 0.5625, + "learning_rate": 4.329695762196783e-06, + "loss": 1.4333, + "step": 8036 + }, + { + "epoch": 1.3865263521090312, + "grad_norm": 0.5703125, + "learning_rate": 4.3274539691784434e-06, + "loss": 1.448, + "step": 8037 + }, + { + "epoch": 1.3866988700077634, + "grad_norm": 0.5703125, + "learning_rate": 4.325212596417756e-06, + "loss": 1.4208, + "step": 8038 + }, + { + "epoch": 1.3868713879064953, + "grad_norm": 0.6171875, + "learning_rate": 4.322971644080778e-06, + "loss": 1.3671, + "step": 8039 + }, + { + "epoch": 1.3870439058052273, + "grad_norm": 0.58984375, + "learning_rate": 4.320731112333528e-06, + "loss": 1.4953, + "step": 8040 + }, + { + "epoch": 1.3872164237039593, + "grad_norm": 0.6015625, + "learning_rate": 4.318491001342011e-06, + "loss": 1.4839, + "step": 8041 + }, + { + "epoch": 1.3873889416026912, + "grad_norm": 0.5703125, + "learning_rate": 4.316251311272177e-06, + "loss": 1.5123, + "step": 8042 + }, + { + "epoch": 1.3875614595014234, + "grad_norm": 0.63671875, + "learning_rate": 4.314012042289959e-06, + "loss": 1.5034, + "step": 8043 + }, + { + "epoch": 1.3877339774001554, + "grad_norm": 0.56640625, + "learning_rate": 4.311773194561256e-06, + "loss": 1.367, + "step": 8044 + }, + { + "epoch": 1.3879064952988873, + "grad_norm": 0.74609375, + "learning_rate": 4.309534768251937e-06, + "loss": 1.5059, + "step": 8045 + }, + { + "epoch": 1.3880790131976193, + "grad_norm": 0.5859375, + "learning_rate": 4.307296763527838e-06, + "loss": 1.4364, + "step": 8046 + }, + { + "epoch": 1.3882515310963512, + "grad_norm": 0.625, + "learning_rate": 4.3050591805547696e-06, + "loss": 1.4842, + "step": 8047 + }, + { + "epoch": 1.3884240489950832, + "grad_norm": 0.65625, + "learning_rate": 4.302822019498492e-06, + "loss": 1.4707, + "step": 8048 + }, + { + "epoch": 1.3885965668938152, + "grad_norm": 0.59375, + "learning_rate": 4.300585280524764e-06, + "loss": 1.3719, + "step": 8049 + }, + { + "epoch": 1.3887690847925471, + "grad_norm": 1.078125, + "learning_rate": 4.2983489637992855e-06, + "loss": 1.4976, + "step": 8050 + }, + { + "epoch": 1.388941602691279, + "grad_norm": 0.625, + "learning_rate": 4.296113069487743e-06, + "loss": 1.3982, + "step": 8051 + }, + { + "epoch": 1.3891141205900113, + "grad_norm": 0.671875, + "learning_rate": 4.293877597755783e-06, + "loss": 1.4678, + "step": 8052 + }, + { + "epoch": 1.3892866384887432, + "grad_norm": 0.80078125, + "learning_rate": 4.291642548769024e-06, + "loss": 1.4979, + "step": 8053 + }, + { + "epoch": 1.3894591563874752, + "grad_norm": 0.66015625, + "learning_rate": 4.289407922693053e-06, + "loss": 1.4575, + "step": 8054 + }, + { + "epoch": 1.3896316742862072, + "grad_norm": 0.5859375, + "learning_rate": 4.287173719693427e-06, + "loss": 1.4439, + "step": 8055 + }, + { + "epoch": 1.3898041921849391, + "grad_norm": 0.62890625, + "learning_rate": 4.284939939935662e-06, + "loss": 1.4507, + "step": 8056 + }, + { + "epoch": 1.3899767100836713, + "grad_norm": 0.83984375, + "learning_rate": 4.282706583585258e-06, + "loss": 1.4287, + "step": 8057 + }, + { + "epoch": 1.3901492279824033, + "grad_norm": 0.625, + "learning_rate": 4.280473650807676e-06, + "loss": 1.5049, + "step": 8058 + }, + { + "epoch": 1.3903217458811352, + "grad_norm": 0.61328125, + "learning_rate": 4.278241141768338e-06, + "loss": 1.3884, + "step": 8059 + }, + { + "epoch": 1.3904942637798672, + "grad_norm": 0.625, + "learning_rate": 4.276009056632653e-06, + "loss": 1.4438, + "step": 8060 + }, + { + "epoch": 1.3906667816785991, + "grad_norm": 0.6171875, + "learning_rate": 4.27377739556598e-06, + "loss": 1.3712, + "step": 8061 + }, + { + "epoch": 1.390839299577331, + "grad_norm": 0.5859375, + "learning_rate": 4.271546158733656e-06, + "loss": 1.3682, + "step": 8062 + }, + { + "epoch": 1.391011817476063, + "grad_norm": 0.70703125, + "learning_rate": 4.269315346300985e-06, + "loss": 1.4151, + "step": 8063 + }, + { + "epoch": 1.391184335374795, + "grad_norm": 0.5625, + "learning_rate": 4.2670849584332405e-06, + "loss": 1.5293, + "step": 8064 + }, + { + "epoch": 1.3913568532735272, + "grad_norm": 0.58984375, + "learning_rate": 4.264854995295664e-06, + "loss": 1.4793, + "step": 8065 + }, + { + "epoch": 1.3915293711722592, + "grad_norm": 0.5625, + "learning_rate": 4.262625457053467e-06, + "loss": 1.5049, + "step": 8066 + }, + { + "epoch": 1.3917018890709911, + "grad_norm": 0.6171875, + "learning_rate": 4.260396343871819e-06, + "loss": 1.4196, + "step": 8067 + }, + { + "epoch": 1.391874406969723, + "grad_norm": 0.6171875, + "learning_rate": 4.258167655915878e-06, + "loss": 1.4206, + "step": 8068 + }, + { + "epoch": 1.392046924868455, + "grad_norm": 0.62109375, + "learning_rate": 4.255939393350751e-06, + "loss": 1.3965, + "step": 8069 + }, + { + "epoch": 1.3922194427671872, + "grad_norm": 0.65625, + "learning_rate": 4.253711556341524e-06, + "loss": 1.4317, + "step": 8070 + }, + { + "epoch": 1.3923919606659192, + "grad_norm": 0.609375, + "learning_rate": 4.25148414505325e-06, + "loss": 1.3929, + "step": 8071 + }, + { + "epoch": 1.3925644785646512, + "grad_norm": 0.58203125, + "learning_rate": 4.249257159650944e-06, + "loss": 1.3943, + "step": 8072 + }, + { + "epoch": 1.3927369964633831, + "grad_norm": 0.66015625, + "learning_rate": 4.2470306002996085e-06, + "loss": 1.5479, + "step": 8073 + }, + { + "epoch": 1.392909514362115, + "grad_norm": 0.578125, + "learning_rate": 4.244804467164189e-06, + "loss": 1.4135, + "step": 8074 + }, + { + "epoch": 1.393082032260847, + "grad_norm": 0.5703125, + "learning_rate": 4.242578760409614e-06, + "loss": 1.3848, + "step": 8075 + }, + { + "epoch": 1.393254550159579, + "grad_norm": 0.55859375, + "learning_rate": 4.240353480200777e-06, + "loss": 1.3422, + "step": 8076 + }, + { + "epoch": 1.393427068058311, + "grad_norm": 0.5859375, + "learning_rate": 4.238128626702545e-06, + "loss": 1.4875, + "step": 8077 + }, + { + "epoch": 1.393599585957043, + "grad_norm": 0.59375, + "learning_rate": 4.2359042000797434e-06, + "loss": 1.3744, + "step": 8078 + }, + { + "epoch": 1.393772103855775, + "grad_norm": 0.6015625, + "learning_rate": 4.23368020049718e-06, + "loss": 1.393, + "step": 8079 + }, + { + "epoch": 1.393944621754507, + "grad_norm": 0.6328125, + "learning_rate": 4.231456628119609e-06, + "loss": 1.4639, + "step": 8080 + }, + { + "epoch": 1.394117139653239, + "grad_norm": 0.60546875, + "learning_rate": 4.229233483111781e-06, + "loss": 1.4048, + "step": 8081 + }, + { + "epoch": 1.394289657551971, + "grad_norm": 0.640625, + "learning_rate": 4.227010765638392e-06, + "loss": 1.3948, + "step": 8082 + }, + { + "epoch": 1.394462175450703, + "grad_norm": 0.6015625, + "learning_rate": 4.2247884758641155e-06, + "loss": 1.4422, + "step": 8083 + }, + { + "epoch": 1.3946346933494351, + "grad_norm": 0.578125, + "learning_rate": 4.222566613953594e-06, + "loss": 1.5084, + "step": 8084 + }, + { + "epoch": 1.394807211248167, + "grad_norm": 0.5546875, + "learning_rate": 4.220345180071437e-06, + "loss": 1.4686, + "step": 8085 + }, + { + "epoch": 1.394979729146899, + "grad_norm": 0.62890625, + "learning_rate": 4.218124174382222e-06, + "loss": 1.4242, + "step": 8086 + }, + { + "epoch": 1.395152247045631, + "grad_norm": 0.609375, + "learning_rate": 4.215903597050499e-06, + "loss": 1.4581, + "step": 8087 + }, + { + "epoch": 1.395324764944363, + "grad_norm": 0.65234375, + "learning_rate": 4.213683448240771e-06, + "loss": 1.5284, + "step": 8088 + }, + { + "epoch": 1.395497282843095, + "grad_norm": 0.5859375, + "learning_rate": 4.211463728117531e-06, + "loss": 1.3873, + "step": 8089 + }, + { + "epoch": 1.395669800741827, + "grad_norm": 0.58203125, + "learning_rate": 4.20924443684523e-06, + "loss": 1.3575, + "step": 8090 + }, + { + "epoch": 1.3958423186405589, + "grad_norm": 0.734375, + "learning_rate": 4.207025574588276e-06, + "loss": 1.3796, + "step": 8091 + }, + { + "epoch": 1.3960148365392908, + "grad_norm": 0.54296875, + "learning_rate": 4.204807141511071e-06, + "loss": 1.3057, + "step": 8092 + }, + { + "epoch": 1.396187354438023, + "grad_norm": 0.55859375, + "learning_rate": 4.20258913777796e-06, + "loss": 1.398, + "step": 8093 + }, + { + "epoch": 1.396359872336755, + "grad_norm": 0.58203125, + "learning_rate": 4.200371563553269e-06, + "loss": 1.5357, + "step": 8094 + }, + { + "epoch": 1.396532390235487, + "grad_norm": 0.56640625, + "learning_rate": 4.1981544190012915e-06, + "loss": 1.3717, + "step": 8095 + }, + { + "epoch": 1.3967049081342189, + "grad_norm": 0.62109375, + "learning_rate": 4.195937704286285e-06, + "loss": 1.4642, + "step": 8096 + }, + { + "epoch": 1.3968774260329508, + "grad_norm": 0.59375, + "learning_rate": 4.193721419572478e-06, + "loss": 1.376, + "step": 8097 + }, + { + "epoch": 1.397049943931683, + "grad_norm": 0.56640625, + "learning_rate": 4.191505565024073e-06, + "loss": 1.4358, + "step": 8098 + }, + { + "epoch": 1.397222461830415, + "grad_norm": 0.609375, + "learning_rate": 4.189290140805221e-06, + "loss": 1.4701, + "step": 8099 + }, + { + "epoch": 1.397394979729147, + "grad_norm": 0.6484375, + "learning_rate": 4.18707514708007e-06, + "loss": 1.5173, + "step": 8100 + }, + { + "epoch": 1.397394979729147, + "eval_loss": 1.4073193073272705, + "eval_runtime": 11.2049, + "eval_samples_per_second": 91.389, + "eval_steps_per_second": 22.847, + "step": 8100 + }, + { + "epoch": 1.397567497627879, + "grad_norm": 0.58203125, + "learning_rate": 4.184860584012704e-06, + "loss": 1.38, + "step": 8101 + }, + { + "epoch": 1.3977400155266109, + "grad_norm": 0.58203125, + "learning_rate": 4.182646451767209e-06, + "loss": 1.4285, + "step": 8102 + }, + { + "epoch": 1.3979125334253428, + "grad_norm": 0.5625, + "learning_rate": 4.18043275050761e-06, + "loss": 1.4443, + "step": 8103 + }, + { + "epoch": 1.3980850513240748, + "grad_norm": 0.55078125, + "learning_rate": 4.178219480397911e-06, + "loss": 1.4382, + "step": 8104 + }, + { + "epoch": 1.3982575692228068, + "grad_norm": 0.55859375, + "learning_rate": 4.1760066416020975e-06, + "loss": 1.5393, + "step": 8105 + }, + { + "epoch": 1.398430087121539, + "grad_norm": 0.640625, + "learning_rate": 4.173794234284096e-06, + "loss": 1.3635, + "step": 8106 + }, + { + "epoch": 1.398602605020271, + "grad_norm": 0.52734375, + "learning_rate": 4.171582258607824e-06, + "loss": 1.2441, + "step": 8107 + }, + { + "epoch": 1.3987751229190029, + "grad_norm": 0.5625, + "learning_rate": 4.169370714737155e-06, + "loss": 1.3819, + "step": 8108 + }, + { + "epoch": 1.3989476408177348, + "grad_norm": 0.61328125, + "learning_rate": 4.167159602835934e-06, + "loss": 1.419, + "step": 8109 + }, + { + "epoch": 1.3991201587164668, + "grad_norm": 0.56640625, + "learning_rate": 4.164948923067976e-06, + "loss": 1.3778, + "step": 8110 + }, + { + "epoch": 1.399292676615199, + "grad_norm": 0.58203125, + "learning_rate": 4.162738675597065e-06, + "loss": 1.4298, + "step": 8111 + }, + { + "epoch": 1.399465194513931, + "grad_norm": 0.58984375, + "learning_rate": 4.1605288605869365e-06, + "loss": 1.4636, + "step": 8112 + }, + { + "epoch": 1.3996377124126629, + "grad_norm": 0.5859375, + "learning_rate": 4.158319478201325e-06, + "loss": 1.3767, + "step": 8113 + }, + { + "epoch": 1.3998102303113948, + "grad_norm": 0.578125, + "learning_rate": 4.156110528603904e-06, + "loss": 1.4037, + "step": 8114 + }, + { + "epoch": 1.3999827482101268, + "grad_norm": 0.61328125, + "learning_rate": 4.153902011958329e-06, + "loss": 1.4435, + "step": 8115 + }, + { + "epoch": 1.4001552661088588, + "grad_norm": 0.5625, + "learning_rate": 4.151693928428221e-06, + "loss": 1.4166, + "step": 8116 + }, + { + "epoch": 1.4003277840075907, + "grad_norm": 0.69140625, + "learning_rate": 4.149486278177168e-06, + "loss": 1.4646, + "step": 8117 + }, + { + "epoch": 1.4005003019063227, + "grad_norm": 0.61328125, + "learning_rate": 4.147279061368729e-06, + "loss": 1.4447, + "step": 8118 + }, + { + "epoch": 1.4006728198050546, + "grad_norm": 0.578125, + "learning_rate": 4.145072278166428e-06, + "loss": 1.3519, + "step": 8119 + }, + { + "epoch": 1.4008453377037868, + "grad_norm": 0.859375, + "learning_rate": 4.142865928733751e-06, + "loss": 1.3877, + "step": 8120 + }, + { + "epoch": 1.4010178556025188, + "grad_norm": 0.55078125, + "learning_rate": 4.1406600132341645e-06, + "loss": 1.385, + "step": 8121 + }, + { + "epoch": 1.4011903735012508, + "grad_norm": 0.6015625, + "learning_rate": 4.1384545318311e-06, + "loss": 1.4587, + "step": 8122 + }, + { + "epoch": 1.4013628913999827, + "grad_norm": 0.5703125, + "learning_rate": 4.1362494846879405e-06, + "loss": 1.4534, + "step": 8123 + }, + { + "epoch": 1.4015354092987147, + "grad_norm": 0.58203125, + "learning_rate": 4.134044871968065e-06, + "loss": 1.3575, + "step": 8124 + }, + { + "epoch": 1.4017079271974469, + "grad_norm": 0.63671875, + "learning_rate": 4.131840693834794e-06, + "loss": 1.3504, + "step": 8125 + }, + { + "epoch": 1.4018804450961788, + "grad_norm": 0.53515625, + "learning_rate": 4.12963695045143e-06, + "loss": 1.3188, + "step": 8126 + }, + { + "epoch": 1.4020529629949108, + "grad_norm": 0.62109375, + "learning_rate": 4.127433641981241e-06, + "loss": 1.481, + "step": 8127 + }, + { + "epoch": 1.4022254808936427, + "grad_norm": 0.57421875, + "learning_rate": 4.125230768587461e-06, + "loss": 1.3856, + "step": 8128 + }, + { + "epoch": 1.4023979987923747, + "grad_norm": 0.609375, + "learning_rate": 4.123028330433294e-06, + "loss": 1.3378, + "step": 8129 + }, + { + "epoch": 1.4025705166911067, + "grad_norm": 0.5703125, + "learning_rate": 4.120826327681911e-06, + "loss": 1.3412, + "step": 8130 + }, + { + "epoch": 1.4027430345898386, + "grad_norm": 0.578125, + "learning_rate": 4.1186247604964425e-06, + "loss": 1.3749, + "step": 8131 + }, + { + "epoch": 1.4029155524885706, + "grad_norm": 0.6015625, + "learning_rate": 4.116423629040007e-06, + "loss": 1.4766, + "step": 8132 + }, + { + "epoch": 1.4030880703873025, + "grad_norm": 0.9609375, + "learning_rate": 4.1142229334756645e-06, + "loss": 1.4402, + "step": 8133 + }, + { + "epoch": 1.4032605882860347, + "grad_norm": 0.66015625, + "learning_rate": 4.112022673966471e-06, + "loss": 1.4126, + "step": 8134 + }, + { + "epoch": 1.4034331061847667, + "grad_norm": 0.6484375, + "learning_rate": 4.109822850675422e-06, + "loss": 1.3777, + "step": 8135 + }, + { + "epoch": 1.4036056240834986, + "grad_norm": 1.7109375, + "learning_rate": 4.107623463765498e-06, + "loss": 1.5189, + "step": 8136 + }, + { + "epoch": 1.4037781419822306, + "grad_norm": 0.58203125, + "learning_rate": 4.105424513399652e-06, + "loss": 1.3747, + "step": 8137 + }, + { + "epoch": 1.4039506598809626, + "grad_norm": 0.63671875, + "learning_rate": 4.1032259997407844e-06, + "loss": 1.4141, + "step": 8138 + }, + { + "epoch": 1.4041231777796948, + "grad_norm": 0.59375, + "learning_rate": 4.10102792295178e-06, + "loss": 1.417, + "step": 8139 + }, + { + "epoch": 1.4042956956784267, + "grad_norm": 0.6171875, + "learning_rate": 4.098830283195485e-06, + "loss": 1.4213, + "step": 8140 + }, + { + "epoch": 1.4044682135771587, + "grad_norm": 0.86328125, + "learning_rate": 4.096633080634717e-06, + "loss": 1.4074, + "step": 8141 + }, + { + "epoch": 1.4046407314758906, + "grad_norm": 0.61328125, + "learning_rate": 4.094436315432254e-06, + "loss": 1.5289, + "step": 8142 + }, + { + "epoch": 1.4048132493746226, + "grad_norm": 0.58984375, + "learning_rate": 4.092239987750852e-06, + "loss": 1.4553, + "step": 8143 + }, + { + "epoch": 1.4049857672733546, + "grad_norm": 0.59765625, + "learning_rate": 4.09004409775322e-06, + "loss": 1.4543, + "step": 8144 + }, + { + "epoch": 1.4051582851720865, + "grad_norm": 0.5859375, + "learning_rate": 4.0878486456020535e-06, + "loss": 1.5197, + "step": 8145 + }, + { + "epoch": 1.4053308030708185, + "grad_norm": 0.6015625, + "learning_rate": 4.085653631459997e-06, + "loss": 1.4162, + "step": 8146 + }, + { + "epoch": 1.4055033209695507, + "grad_norm": 1.21875, + "learning_rate": 4.083459055489675e-06, + "loss": 1.4667, + "step": 8147 + }, + { + "epoch": 1.4056758388682826, + "grad_norm": 0.56640625, + "learning_rate": 4.081264917853674e-06, + "loss": 1.4256, + "step": 8148 + }, + { + "epoch": 1.4058483567670146, + "grad_norm": 0.640625, + "learning_rate": 4.0790712187145486e-06, + "loss": 1.5337, + "step": 8149 + }, + { + "epoch": 1.4060208746657465, + "grad_norm": 0.63671875, + "learning_rate": 4.076877958234825e-06, + "loss": 1.4118, + "step": 8150 + }, + { + "epoch": 1.4061933925644785, + "grad_norm": 0.56640625, + "learning_rate": 4.074685136576993e-06, + "loss": 1.384, + "step": 8151 + }, + { + "epoch": 1.4063659104632107, + "grad_norm": 0.63671875, + "learning_rate": 4.0724927539035045e-06, + "loss": 1.4003, + "step": 8152 + }, + { + "epoch": 1.4065384283619426, + "grad_norm": 0.55859375, + "learning_rate": 4.070300810376792e-06, + "loss": 1.4395, + "step": 8153 + }, + { + "epoch": 1.4067109462606746, + "grad_norm": 0.59375, + "learning_rate": 4.0681093061592495e-06, + "loss": 1.411, + "step": 8154 + }, + { + "epoch": 1.4068834641594066, + "grad_norm": 0.609375, + "learning_rate": 4.065918241413226e-06, + "loss": 1.4444, + "step": 8155 + }, + { + "epoch": 1.4070559820581385, + "grad_norm": 0.59375, + "learning_rate": 4.063727616301064e-06, + "loss": 1.4602, + "step": 8156 + }, + { + "epoch": 1.4072284999568705, + "grad_norm": 0.59375, + "learning_rate": 4.061537430985049e-06, + "loss": 1.4107, + "step": 8157 + }, + { + "epoch": 1.4074010178556025, + "grad_norm": 0.6484375, + "learning_rate": 4.059347685627446e-06, + "loss": 1.4984, + "step": 8158 + }, + { + "epoch": 1.4075735357543344, + "grad_norm": 0.59375, + "learning_rate": 4.057158380390486e-06, + "loss": 1.3805, + "step": 8159 + }, + { + "epoch": 1.4077460536530664, + "grad_norm": 0.5703125, + "learning_rate": 4.054969515436366e-06, + "loss": 1.3743, + "step": 8160 + }, + { + "epoch": 1.4079185715517986, + "grad_norm": 0.63671875, + "learning_rate": 4.052781090927249e-06, + "loss": 1.4984, + "step": 8161 + }, + { + "epoch": 1.4080910894505305, + "grad_norm": 0.58203125, + "learning_rate": 4.050593107025269e-06, + "loss": 1.4587, + "step": 8162 + }, + { + "epoch": 1.4082636073492625, + "grad_norm": 0.5703125, + "learning_rate": 4.048405563892527e-06, + "loss": 1.4661, + "step": 8163 + }, + { + "epoch": 1.4084361252479944, + "grad_norm": 0.5859375, + "learning_rate": 4.046218461691089e-06, + "loss": 1.4686, + "step": 8164 + }, + { + "epoch": 1.4086086431467264, + "grad_norm": 0.57421875, + "learning_rate": 4.044031800582984e-06, + "loss": 1.376, + "step": 8165 + }, + { + "epoch": 1.4087811610454586, + "grad_norm": 0.58203125, + "learning_rate": 4.041845580730223e-06, + "loss": 1.3829, + "step": 8166 + }, + { + "epoch": 1.4089536789441905, + "grad_norm": 0.578125, + "learning_rate": 4.039659802294767e-06, + "loss": 1.418, + "step": 8167 + }, + { + "epoch": 1.4091261968429225, + "grad_norm": 0.56640625, + "learning_rate": 4.037474465438551e-06, + "loss": 1.4028, + "step": 8168 + }, + { + "epoch": 1.4092987147416545, + "grad_norm": 0.5546875, + "learning_rate": 4.035289570323489e-06, + "loss": 1.5105, + "step": 8169 + }, + { + "epoch": 1.4094712326403864, + "grad_norm": 0.5625, + "learning_rate": 4.033105117111441e-06, + "loss": 1.4076, + "step": 8170 + }, + { + "epoch": 1.4096437505391184, + "grad_norm": 0.55859375, + "learning_rate": 4.030921105964249e-06, + "loss": 1.3904, + "step": 8171 + }, + { + "epoch": 1.4098162684378504, + "grad_norm": 0.55078125, + "learning_rate": 4.028737537043719e-06, + "loss": 1.4178, + "step": 8172 + }, + { + "epoch": 1.4099887863365823, + "grad_norm": 0.71484375, + "learning_rate": 4.026554410511622e-06, + "loss": 1.4416, + "step": 8173 + }, + { + "epoch": 1.4101613042353145, + "grad_norm": 0.59765625, + "learning_rate": 4.024371726529698e-06, + "loss": 1.514, + "step": 8174 + }, + { + "epoch": 1.4103338221340465, + "grad_norm": 0.57421875, + "learning_rate": 4.022189485259656e-06, + "loss": 1.5538, + "step": 8175 + }, + { + "epoch": 1.4105063400327784, + "grad_norm": 0.61328125, + "learning_rate": 4.020007686863164e-06, + "loss": 1.3949, + "step": 8176 + }, + { + "epoch": 1.4106788579315104, + "grad_norm": 0.5546875, + "learning_rate": 4.017826331501872e-06, + "loss": 1.3972, + "step": 8177 + }, + { + "epoch": 1.4108513758302423, + "grad_norm": 0.546875, + "learning_rate": 4.015645419337381e-06, + "loss": 1.3803, + "step": 8178 + }, + { + "epoch": 1.4110238937289743, + "grad_norm": 0.54296875, + "learning_rate": 4.013464950531268e-06, + "loss": 1.4745, + "step": 8179 + }, + { + "epoch": 1.4111964116277065, + "grad_norm": 0.6328125, + "learning_rate": 4.011284925245079e-06, + "loss": 1.385, + "step": 8180 + }, + { + "epoch": 1.4113689295264384, + "grad_norm": 0.58984375, + "learning_rate": 4.009105343640321e-06, + "loss": 1.4544, + "step": 8181 + }, + { + "epoch": 1.4115414474251704, + "grad_norm": 0.55078125, + "learning_rate": 4.006926205878472e-06, + "loss": 1.3837, + "step": 8182 + }, + { + "epoch": 1.4117139653239024, + "grad_norm": 0.57421875, + "learning_rate": 4.00474751212098e-06, + "loss": 1.4387, + "step": 8183 + }, + { + "epoch": 1.4118864832226343, + "grad_norm": 0.62109375, + "learning_rate": 4.002569262529244e-06, + "loss": 1.2922, + "step": 8184 + }, + { + "epoch": 1.4120590011213663, + "grad_norm": 0.6171875, + "learning_rate": 4.000391457264656e-06, + "loss": 1.4863, + "step": 8185 + }, + { + "epoch": 1.4122315190200982, + "grad_norm": 0.5859375, + "learning_rate": 3.99821409648856e-06, + "loss": 1.4103, + "step": 8186 + }, + { + "epoch": 1.4124040369188302, + "grad_norm": 0.61328125, + "learning_rate": 3.996037180362256e-06, + "loss": 1.3828, + "step": 8187 + }, + { + "epoch": 1.4125765548175624, + "grad_norm": 0.609375, + "learning_rate": 3.99386070904704e-06, + "loss": 1.3571, + "step": 8188 + }, + { + "epoch": 1.4127490727162944, + "grad_norm": 0.58203125, + "learning_rate": 3.991684682704143e-06, + "loss": 1.5166, + "step": 8189 + }, + { + "epoch": 1.4129215906150263, + "grad_norm": 0.61328125, + "learning_rate": 3.989509101494794e-06, + "loss": 1.5186, + "step": 8190 + }, + { + "epoch": 1.4130941085137583, + "grad_norm": 0.62109375, + "learning_rate": 3.987333965580163e-06, + "loss": 1.4764, + "step": 8191 + }, + { + "epoch": 1.4132666264124902, + "grad_norm": 0.6015625, + "learning_rate": 3.9851592751213995e-06, + "loss": 1.4711, + "step": 8192 + }, + { + "epoch": 1.4134391443112224, + "grad_norm": 0.5703125, + "learning_rate": 3.98298503027962e-06, + "loss": 1.4253, + "step": 8193 + }, + { + "epoch": 1.4136116622099544, + "grad_norm": 0.72265625, + "learning_rate": 3.980811231215905e-06, + "loss": 1.3507, + "step": 8194 + }, + { + "epoch": 1.4137841801086863, + "grad_norm": 0.5546875, + "learning_rate": 3.978637878091305e-06, + "loss": 1.4504, + "step": 8195 + }, + { + "epoch": 1.4139566980074183, + "grad_norm": 0.59375, + "learning_rate": 3.976464971066837e-06, + "loss": 1.4436, + "step": 8196 + }, + { + "epoch": 1.4141292159061503, + "grad_norm": 0.6484375, + "learning_rate": 3.974292510303473e-06, + "loss": 1.4446, + "step": 8197 + }, + { + "epoch": 1.4143017338048822, + "grad_norm": 0.625, + "learning_rate": 3.972120495962178e-06, + "loss": 1.4351, + "step": 8198 + }, + { + "epoch": 1.4144742517036142, + "grad_norm": 0.58984375, + "learning_rate": 3.969948928203856e-06, + "loss": 1.3947, + "step": 8199 + }, + { + "epoch": 1.4146467696023461, + "grad_norm": 0.5625, + "learning_rate": 3.967777807189396e-06, + "loss": 1.454, + "step": 8200 + }, + { + "epoch": 1.4146467696023461, + "eval_loss": 1.4072892665863037, + "eval_runtime": 10.8508, + "eval_samples_per_second": 94.371, + "eval_steps_per_second": 23.593, + "step": 8200 + }, + { + "epoch": 1.414819287501078, + "grad_norm": 0.6171875, + "learning_rate": 3.9656071330796475e-06, + "loss": 1.343, + "step": 8201 + }, + { + "epoch": 1.4149918053998103, + "grad_norm": 0.58203125, + "learning_rate": 3.963436906035426e-06, + "loss": 1.4482, + "step": 8202 + }, + { + "epoch": 1.4151643232985422, + "grad_norm": 0.58984375, + "learning_rate": 3.961267126217517e-06, + "loss": 1.4088, + "step": 8203 + }, + { + "epoch": 1.4153368411972742, + "grad_norm": 0.6328125, + "learning_rate": 3.9590977937866715e-06, + "loss": 1.3249, + "step": 8204 + }, + { + "epoch": 1.4155093590960062, + "grad_norm": 0.703125, + "learning_rate": 3.956928908903607e-06, + "loss": 1.5044, + "step": 8205 + }, + { + "epoch": 1.4156818769947381, + "grad_norm": 0.6171875, + "learning_rate": 3.954760471729007e-06, + "loss": 1.3926, + "step": 8206 + }, + { + "epoch": 1.4158543948934703, + "grad_norm": 0.56640625, + "learning_rate": 3.952592482423527e-06, + "loss": 1.3704, + "step": 8207 + }, + { + "epoch": 1.4160269127922023, + "grad_norm": 0.6484375, + "learning_rate": 3.950424941147776e-06, + "loss": 1.446, + "step": 8208 + }, + { + "epoch": 1.4161994306909342, + "grad_norm": 0.64453125, + "learning_rate": 3.948257848062351e-06, + "loss": 1.4207, + "step": 8209 + }, + { + "epoch": 1.4163719485896662, + "grad_norm": 0.6015625, + "learning_rate": 3.946091203327794e-06, + "loss": 1.3249, + "step": 8210 + }, + { + "epoch": 1.4165444664883982, + "grad_norm": 0.7734375, + "learning_rate": 3.9439250071046274e-06, + "loss": 1.4373, + "step": 8211 + }, + { + "epoch": 1.4167169843871301, + "grad_norm": 0.56640625, + "learning_rate": 3.941759259553336e-06, + "loss": 1.4268, + "step": 8212 + }, + { + "epoch": 1.416889502285862, + "grad_norm": 0.93359375, + "learning_rate": 3.939593960834374e-06, + "loss": 1.3892, + "step": 8213 + }, + { + "epoch": 1.417062020184594, + "grad_norm": 0.625, + "learning_rate": 3.937429111108157e-06, + "loss": 1.3337, + "step": 8214 + }, + { + "epoch": 1.4172345380833262, + "grad_norm": 0.5546875, + "learning_rate": 3.935264710535076e-06, + "loss": 1.4283, + "step": 8215 + }, + { + "epoch": 1.4174070559820582, + "grad_norm": 0.58984375, + "learning_rate": 3.9331007592754725e-06, + "loss": 1.3613, + "step": 8216 + }, + { + "epoch": 1.4175795738807901, + "grad_norm": 0.59375, + "learning_rate": 3.9309372574896755e-06, + "loss": 1.4294, + "step": 8217 + }, + { + "epoch": 1.417752091779522, + "grad_norm": 0.55078125, + "learning_rate": 3.928774205337972e-06, + "loss": 1.5224, + "step": 8218 + }, + { + "epoch": 1.417924609678254, + "grad_norm": 0.57421875, + "learning_rate": 3.926611602980603e-06, + "loss": 1.3941, + "step": 8219 + }, + { + "epoch": 1.4180971275769862, + "grad_norm": 0.6015625, + "learning_rate": 3.924449450577801e-06, + "loss": 1.4262, + "step": 8220 + }, + { + "epoch": 1.4182696454757182, + "grad_norm": 0.59375, + "learning_rate": 3.922287748289739e-06, + "loss": 1.4873, + "step": 8221 + }, + { + "epoch": 1.4184421633744502, + "grad_norm": 0.5625, + "learning_rate": 3.920126496276583e-06, + "loss": 1.5009, + "step": 8222 + }, + { + "epoch": 1.4186146812731821, + "grad_norm": 0.61328125, + "learning_rate": 3.917965694698441e-06, + "loss": 1.3267, + "step": 8223 + }, + { + "epoch": 1.418787199171914, + "grad_norm": 0.65234375, + "learning_rate": 3.9158053437154035e-06, + "loss": 1.4316, + "step": 8224 + }, + { + "epoch": 1.418959717070646, + "grad_norm": 0.55859375, + "learning_rate": 3.9136454434875216e-06, + "loss": 1.4051, + "step": 8225 + }, + { + "epoch": 1.419132234969378, + "grad_norm": 0.59375, + "learning_rate": 3.911485994174814e-06, + "loss": 1.3459, + "step": 8226 + }, + { + "epoch": 1.41930475286811, + "grad_norm": 0.58984375, + "learning_rate": 3.909326995937267e-06, + "loss": 1.4291, + "step": 8227 + }, + { + "epoch": 1.419477270766842, + "grad_norm": 0.6015625, + "learning_rate": 3.907168448934836e-06, + "loss": 1.4023, + "step": 8228 + }, + { + "epoch": 1.4196497886655741, + "grad_norm": 0.6171875, + "learning_rate": 3.9050103533274295e-06, + "loss": 1.54, + "step": 8229 + }, + { + "epoch": 1.419822306564306, + "grad_norm": 0.609375, + "learning_rate": 3.902852709274946e-06, + "loss": 1.4052, + "step": 8230 + }, + { + "epoch": 1.419994824463038, + "grad_norm": 0.5625, + "learning_rate": 3.900695516937226e-06, + "loss": 1.4389, + "step": 8231 + }, + { + "epoch": 1.42016734236177, + "grad_norm": 0.59765625, + "learning_rate": 3.898538776474095e-06, + "loss": 1.4603, + "step": 8232 + }, + { + "epoch": 1.420339860260502, + "grad_norm": 0.54296875, + "learning_rate": 3.896382488045333e-06, + "loss": 1.4328, + "step": 8233 + }, + { + "epoch": 1.4205123781592341, + "grad_norm": 0.62109375, + "learning_rate": 3.894226651810693e-06, + "loss": 1.3588, + "step": 8234 + }, + { + "epoch": 1.420684896057966, + "grad_norm": 0.6484375, + "learning_rate": 3.892071267929894e-06, + "loss": 1.4438, + "step": 8235 + }, + { + "epoch": 1.420857413956698, + "grad_norm": 0.55078125, + "learning_rate": 3.88991633656262e-06, + "loss": 1.4963, + "step": 8236 + }, + { + "epoch": 1.42102993185543, + "grad_norm": 0.625, + "learning_rate": 3.887761857868522e-06, + "loss": 1.3939, + "step": 8237 + }, + { + "epoch": 1.421202449754162, + "grad_norm": 0.59765625, + "learning_rate": 3.885607832007214e-06, + "loss": 1.3764, + "step": 8238 + }, + { + "epoch": 1.421374967652894, + "grad_norm": 0.6171875, + "learning_rate": 3.883454259138289e-06, + "loss": 1.3638, + "step": 8239 + }, + { + "epoch": 1.421547485551626, + "grad_norm": 0.57421875, + "learning_rate": 3.881301139421281e-06, + "loss": 1.3644, + "step": 8240 + }, + { + "epoch": 1.4217200034503579, + "grad_norm": 0.5859375, + "learning_rate": 3.879148473015723e-06, + "loss": 1.4693, + "step": 8241 + }, + { + "epoch": 1.4218925213490898, + "grad_norm": 0.60546875, + "learning_rate": 3.87699626008109e-06, + "loss": 1.4157, + "step": 8242 + }, + { + "epoch": 1.422065039247822, + "grad_norm": 0.62109375, + "learning_rate": 3.87484450077683e-06, + "loss": 1.4135, + "step": 8243 + }, + { + "epoch": 1.422237557146554, + "grad_norm": 0.59375, + "learning_rate": 3.872693195262361e-06, + "loss": 1.5038, + "step": 8244 + }, + { + "epoch": 1.422410075045286, + "grad_norm": 0.6484375, + "learning_rate": 3.870542343697067e-06, + "loss": 1.4158, + "step": 8245 + }, + { + "epoch": 1.422582592944018, + "grad_norm": 0.6171875, + "learning_rate": 3.868391946240294e-06, + "loss": 1.4482, + "step": 8246 + }, + { + "epoch": 1.4227551108427499, + "grad_norm": 0.6328125, + "learning_rate": 3.86624200305136e-06, + "loss": 1.3736, + "step": 8247 + }, + { + "epoch": 1.422927628741482, + "grad_norm": 0.58984375, + "learning_rate": 3.864092514289539e-06, + "loss": 1.3976, + "step": 8248 + }, + { + "epoch": 1.423100146640214, + "grad_norm": 0.58203125, + "learning_rate": 3.861943480114086e-06, + "loss": 1.3573, + "step": 8249 + }, + { + "epoch": 1.423272664538946, + "grad_norm": 0.63671875, + "learning_rate": 3.859794900684212e-06, + "loss": 1.4126, + "step": 8250 + }, + { + "epoch": 1.423445182437678, + "grad_norm": 0.55859375, + "learning_rate": 3.857646776159098e-06, + "loss": 1.4187, + "step": 8251 + }, + { + "epoch": 1.4236177003364099, + "grad_norm": 0.62890625, + "learning_rate": 3.855499106697893e-06, + "loss": 1.4546, + "step": 8252 + }, + { + "epoch": 1.4237902182351418, + "grad_norm": 0.57421875, + "learning_rate": 3.8533518924597e-06, + "loss": 1.3757, + "step": 8253 + }, + { + "epoch": 1.4239627361338738, + "grad_norm": 0.5703125, + "learning_rate": 3.851205133603611e-06, + "loss": 1.4586, + "step": 8254 + }, + { + "epoch": 1.4241352540326058, + "grad_norm": 0.60546875, + "learning_rate": 3.849058830288663e-06, + "loss": 1.4201, + "step": 8255 + }, + { + "epoch": 1.424307771931338, + "grad_norm": 0.69140625, + "learning_rate": 3.846912982673869e-06, + "loss": 1.5396, + "step": 8256 + }, + { + "epoch": 1.42448028983007, + "grad_norm": 0.578125, + "learning_rate": 3.8447675909182095e-06, + "loss": 1.4887, + "step": 8257 + }, + { + "epoch": 1.4246528077288019, + "grad_norm": 0.578125, + "learning_rate": 3.842622655180625e-06, + "loss": 1.4912, + "step": 8258 + }, + { + "epoch": 1.4248253256275338, + "grad_norm": 0.66796875, + "learning_rate": 3.840478175620026e-06, + "loss": 1.402, + "step": 8259 + }, + { + "epoch": 1.4249978435262658, + "grad_norm": 0.57421875, + "learning_rate": 3.838334152395295e-06, + "loss": 1.4811, + "step": 8260 + }, + { + "epoch": 1.425170361424998, + "grad_norm": 0.55078125, + "learning_rate": 3.836190585665263e-06, + "loss": 1.4547, + "step": 8261 + }, + { + "epoch": 1.42534287932373, + "grad_norm": 0.640625, + "learning_rate": 3.834047475588753e-06, + "loss": 1.4213, + "step": 8262 + }, + { + "epoch": 1.425515397222462, + "grad_norm": 0.58984375, + "learning_rate": 3.831904822324527e-06, + "loss": 1.5176, + "step": 8263 + }, + { + "epoch": 1.4256879151211939, + "grad_norm": 0.59375, + "learning_rate": 3.829762626031333e-06, + "loss": 1.4017, + "step": 8264 + }, + { + "epoch": 1.4258604330199258, + "grad_norm": 0.578125, + "learning_rate": 3.827620886867878e-06, + "loss": 1.4718, + "step": 8265 + }, + { + "epoch": 1.4260329509186578, + "grad_norm": 0.65234375, + "learning_rate": 3.825479604992835e-06, + "loss": 1.3909, + "step": 8266 + }, + { + "epoch": 1.4262054688173897, + "grad_norm": 0.6171875, + "learning_rate": 3.823338780564841e-06, + "loss": 1.4747, + "step": 8267 + }, + { + "epoch": 1.4263779867161217, + "grad_norm": 0.6171875, + "learning_rate": 3.821198413742505e-06, + "loss": 1.4617, + "step": 8268 + }, + { + "epoch": 1.4265505046148537, + "grad_norm": 0.82421875, + "learning_rate": 3.8190585046843965e-06, + "loss": 1.3563, + "step": 8269 + }, + { + "epoch": 1.4267230225135858, + "grad_norm": 0.6171875, + "learning_rate": 3.8169190535490555e-06, + "loss": 1.5484, + "step": 8270 + }, + { + "epoch": 1.4268955404123178, + "grad_norm": 0.9296875, + "learning_rate": 3.814780060494987e-06, + "loss": 1.4492, + "step": 8271 + }, + { + "epoch": 1.4270680583110498, + "grad_norm": 0.5546875, + "learning_rate": 3.8126415256806527e-06, + "loss": 1.3372, + "step": 8272 + }, + { + "epoch": 1.4272405762097817, + "grad_norm": 0.671875, + "learning_rate": 3.810503449264501e-06, + "loss": 1.4254, + "step": 8273 + }, + { + "epoch": 1.4274130941085137, + "grad_norm": 0.640625, + "learning_rate": 3.808365831404924e-06, + "loss": 1.4118, + "step": 8274 + }, + { + "epoch": 1.4275856120072459, + "grad_norm": 0.58984375, + "learning_rate": 3.8062286722602936e-06, + "loss": 1.3553, + "step": 8275 + }, + { + "epoch": 1.4277581299059778, + "grad_norm": 0.625, + "learning_rate": 3.8040919719889435e-06, + "loss": 1.4825, + "step": 8276 + }, + { + "epoch": 1.4279306478047098, + "grad_norm": 0.546875, + "learning_rate": 3.801955730749174e-06, + "loss": 1.4204, + "step": 8277 + }, + { + "epoch": 1.4281031657034418, + "grad_norm": 0.60546875, + "learning_rate": 3.7998199486992502e-06, + "loss": 1.4068, + "step": 8278 + }, + { + "epoch": 1.4282756836021737, + "grad_norm": 0.5859375, + "learning_rate": 3.7976846259974098e-06, + "loss": 1.3543, + "step": 8279 + }, + { + "epoch": 1.4284482015009057, + "grad_norm": 0.609375, + "learning_rate": 3.795549762801839e-06, + "loss": 1.4565, + "step": 8280 + }, + { + "epoch": 1.4286207193996376, + "grad_norm": 0.5625, + "learning_rate": 3.7934153592707125e-06, + "loss": 1.3196, + "step": 8281 + }, + { + "epoch": 1.4287932372983696, + "grad_norm": 0.6015625, + "learning_rate": 3.7912814155621568e-06, + "loss": 1.3837, + "step": 8282 + }, + { + "epoch": 1.4289657551971016, + "grad_norm": 0.59375, + "learning_rate": 3.789147931834267e-06, + "loss": 1.4508, + "step": 8283 + }, + { + "epoch": 1.4291382730958337, + "grad_norm": 0.60546875, + "learning_rate": 3.7870149082451104e-06, + "loss": 1.4073, + "step": 8284 + }, + { + "epoch": 1.4293107909945657, + "grad_norm": 0.66015625, + "learning_rate": 3.784882344952702e-06, + "loss": 1.5276, + "step": 8285 + }, + { + "epoch": 1.4294833088932977, + "grad_norm": 0.58203125, + "learning_rate": 3.7827502421150497e-06, + "loss": 1.3499, + "step": 8286 + }, + { + "epoch": 1.4296558267920296, + "grad_norm": 0.58203125, + "learning_rate": 3.7806185998901034e-06, + "loss": 1.4606, + "step": 8287 + }, + { + "epoch": 1.4298283446907616, + "grad_norm": 0.65234375, + "learning_rate": 3.7784874184357923e-06, + "loss": 1.3935, + "step": 8288 + }, + { + "epoch": 1.4300008625894938, + "grad_norm": 0.65234375, + "learning_rate": 3.7763566979100076e-06, + "loss": 1.4289, + "step": 8289 + }, + { + "epoch": 1.4301733804882257, + "grad_norm": 0.609375, + "learning_rate": 3.774226438470605e-06, + "loss": 1.3783, + "step": 8290 + }, + { + "epoch": 1.4303458983869577, + "grad_norm": 0.62109375, + "learning_rate": 3.7720966402754076e-06, + "loss": 1.3995, + "step": 8291 + }, + { + "epoch": 1.4305184162856897, + "grad_norm": 0.58984375, + "learning_rate": 3.7699673034822095e-06, + "loss": 1.4908, + "step": 8292 + }, + { + "epoch": 1.4306909341844216, + "grad_norm": 0.625, + "learning_rate": 3.7678384282487535e-06, + "loss": 1.4742, + "step": 8293 + }, + { + "epoch": 1.4308634520831536, + "grad_norm": 0.671875, + "learning_rate": 3.765710014732774e-06, + "loss": 1.398, + "step": 8294 + }, + { + "epoch": 1.4310359699818855, + "grad_norm": 0.5859375, + "learning_rate": 3.7635820630919464e-06, + "loss": 1.3722, + "step": 8295 + }, + { + "epoch": 1.4312084878806175, + "grad_norm": 0.62109375, + "learning_rate": 3.761454573483927e-06, + "loss": 1.49, + "step": 8296 + }, + { + "epoch": 1.4313810057793497, + "grad_norm": 0.58203125, + "learning_rate": 3.7593275460663323e-06, + "loss": 1.4273, + "step": 8297 + }, + { + "epoch": 1.4315535236780816, + "grad_norm": 0.59375, + "learning_rate": 3.757200980996747e-06, + "loss": 1.4282, + "step": 8298 + }, + { + "epoch": 1.4317260415768136, + "grad_norm": 0.5859375, + "learning_rate": 3.7550748784327195e-06, + "loss": 1.4377, + "step": 8299 + }, + { + "epoch": 1.4318985594755456, + "grad_norm": 0.65625, + "learning_rate": 3.752949238531766e-06, + "loss": 1.4625, + "step": 8300 + }, + { + "epoch": 1.4318985594755456, + "eval_loss": 1.4072835445404053, + "eval_runtime": 10.8629, + "eval_samples_per_second": 94.266, + "eval_steps_per_second": 23.566, + "step": 8300 + }, + { + "epoch": 1.4320710773742775, + "grad_norm": 0.62109375, + "learning_rate": 3.750824061451365e-06, + "loss": 1.3686, + "step": 8301 + }, + { + "epoch": 1.4322435952730097, + "grad_norm": 0.609375, + "learning_rate": 3.7486993473489654e-06, + "loss": 1.4269, + "step": 8302 + }, + { + "epoch": 1.4324161131717417, + "grad_norm": 1.0078125, + "learning_rate": 3.7465750963819802e-06, + "loss": 1.4262, + "step": 8303 + }, + { + "epoch": 1.4325886310704736, + "grad_norm": 0.58984375, + "learning_rate": 3.7444513087077793e-06, + "loss": 1.3747, + "step": 8304 + }, + { + "epoch": 1.4327611489692056, + "grad_norm": 0.59765625, + "learning_rate": 3.742327984483718e-06, + "loss": 1.4343, + "step": 8305 + }, + { + "epoch": 1.4329336668679375, + "grad_norm": 0.61328125, + "learning_rate": 3.740205123867097e-06, + "loss": 1.3822, + "step": 8306 + }, + { + "epoch": 1.4331061847666695, + "grad_norm": 0.57421875, + "learning_rate": 3.738082727015192e-06, + "loss": 1.4472, + "step": 8307 + }, + { + "epoch": 1.4332787026654015, + "grad_norm": 0.546875, + "learning_rate": 3.735960794085246e-06, + "loss": 1.405, + "step": 8308 + }, + { + "epoch": 1.4334512205641334, + "grad_norm": 0.5859375, + "learning_rate": 3.733839325234463e-06, + "loss": 1.4689, + "step": 8309 + }, + { + "epoch": 1.4336237384628654, + "grad_norm": 0.5703125, + "learning_rate": 3.731718320620017e-06, + "loss": 1.4762, + "step": 8310 + }, + { + "epoch": 1.4337962563615976, + "grad_norm": 0.6328125, + "learning_rate": 3.7295977803990426e-06, + "loss": 1.4476, + "step": 8311 + }, + { + "epoch": 1.4339687742603295, + "grad_norm": 0.58203125, + "learning_rate": 3.7274777047286435e-06, + "loss": 1.4185, + "step": 8312 + }, + { + "epoch": 1.4341412921590615, + "grad_norm": 0.55078125, + "learning_rate": 3.7253580937658896e-06, + "loss": 1.2961, + "step": 8313 + }, + { + "epoch": 1.4343138100577935, + "grad_norm": 0.5625, + "learning_rate": 3.723238947667813e-06, + "loss": 1.4259, + "step": 8314 + }, + { + "epoch": 1.4344863279565254, + "grad_norm": 0.6015625, + "learning_rate": 3.721120266591416e-06, + "loss": 1.5421, + "step": 8315 + }, + { + "epoch": 1.4346588458552576, + "grad_norm": 0.6796875, + "learning_rate": 3.719002050693663e-06, + "loss": 1.4687, + "step": 8316 + }, + { + "epoch": 1.4348313637539896, + "grad_norm": 0.61328125, + "learning_rate": 3.716884300131478e-06, + "loss": 1.4174, + "step": 8317 + }, + { + "epoch": 1.4350038816527215, + "grad_norm": 0.62109375, + "learning_rate": 3.7147670150617698e-06, + "loss": 1.4209, + "step": 8318 + }, + { + "epoch": 1.4351763995514535, + "grad_norm": 0.59375, + "learning_rate": 3.7126501956413898e-06, + "loss": 1.3623, + "step": 8319 + }, + { + "epoch": 1.4353489174501854, + "grad_norm": 0.6328125, + "learning_rate": 3.7105338420271673e-06, + "loss": 1.428, + "step": 8320 + }, + { + "epoch": 1.4355214353489174, + "grad_norm": 0.6015625, + "learning_rate": 3.7084179543758968e-06, + "loss": 1.4367, + "step": 8321 + }, + { + "epoch": 1.4356939532476494, + "grad_norm": 0.59375, + "learning_rate": 3.706302532844336e-06, + "loss": 1.4826, + "step": 8322 + }, + { + "epoch": 1.4358664711463813, + "grad_norm": 0.6015625, + "learning_rate": 3.7041875775892077e-06, + "loss": 1.4315, + "step": 8323 + }, + { + "epoch": 1.4360389890451135, + "grad_norm": 0.56640625, + "learning_rate": 3.7020730887672063e-06, + "loss": 1.4028, + "step": 8324 + }, + { + "epoch": 1.4362115069438455, + "grad_norm": 0.5859375, + "learning_rate": 3.699959066534974e-06, + "loss": 1.2957, + "step": 8325 + }, + { + "epoch": 1.4363840248425774, + "grad_norm": 0.6484375, + "learning_rate": 3.697845511049146e-06, + "loss": 1.3867, + "step": 8326 + }, + { + "epoch": 1.4365565427413094, + "grad_norm": 0.63671875, + "learning_rate": 3.695732422466296e-06, + "loss": 1.4988, + "step": 8327 + }, + { + "epoch": 1.4367290606400414, + "grad_norm": 0.55078125, + "learning_rate": 3.69361980094298e-06, + "loss": 1.384, + "step": 8328 + }, + { + "epoch": 1.4369015785387733, + "grad_norm": 0.609375, + "learning_rate": 3.6915076466357123e-06, + "loss": 1.3995, + "step": 8329 + }, + { + "epoch": 1.4370740964375055, + "grad_norm": 0.62109375, + "learning_rate": 3.6893959597009766e-06, + "loss": 1.3905, + "step": 8330 + }, + { + "epoch": 1.4372466143362375, + "grad_norm": 0.640625, + "learning_rate": 3.687284740295217e-06, + "loss": 1.4135, + "step": 8331 + }, + { + "epoch": 1.4374191322349694, + "grad_norm": 0.53125, + "learning_rate": 3.6851739885748495e-06, + "loss": 1.3253, + "step": 8332 + }, + { + "epoch": 1.4375916501337014, + "grad_norm": 0.546875, + "learning_rate": 3.683063704696248e-06, + "loss": 1.4941, + "step": 8333 + }, + { + "epoch": 1.4377641680324333, + "grad_norm": 0.61328125, + "learning_rate": 3.680953888815758e-06, + "loss": 1.4896, + "step": 8334 + }, + { + "epoch": 1.4379366859311653, + "grad_norm": 0.5625, + "learning_rate": 3.678844541089691e-06, + "loss": 1.4125, + "step": 8335 + }, + { + "epoch": 1.4381092038298973, + "grad_norm": 0.57421875, + "learning_rate": 3.6767356616743098e-06, + "loss": 1.3701, + "step": 8336 + }, + { + "epoch": 1.4382817217286292, + "grad_norm": 0.6015625, + "learning_rate": 3.674627250725866e-06, + "loss": 1.4929, + "step": 8337 + }, + { + "epoch": 1.4384542396273614, + "grad_norm": 0.6171875, + "learning_rate": 3.6725193084005527e-06, + "loss": 1.4682, + "step": 8338 + }, + { + "epoch": 1.4386267575260934, + "grad_norm": 0.609375, + "learning_rate": 3.6704118348545516e-06, + "loss": 1.3607, + "step": 8339 + }, + { + "epoch": 1.4387992754248253, + "grad_norm": 0.5859375, + "learning_rate": 3.668304830243987e-06, + "loss": 1.4724, + "step": 8340 + }, + { + "epoch": 1.4389717933235573, + "grad_norm": 0.56640625, + "learning_rate": 3.666198294724963e-06, + "loss": 1.4779, + "step": 8341 + }, + { + "epoch": 1.4391443112222893, + "grad_norm": 0.62890625, + "learning_rate": 3.664092228453545e-06, + "loss": 1.4672, + "step": 8342 + }, + { + "epoch": 1.4393168291210214, + "grad_norm": 0.65234375, + "learning_rate": 3.6619866315857632e-06, + "loss": 1.3252, + "step": 8343 + }, + { + "epoch": 1.4394893470197534, + "grad_norm": 0.5859375, + "learning_rate": 3.6598815042776135e-06, + "loss": 1.4026, + "step": 8344 + }, + { + "epoch": 1.4396618649184854, + "grad_norm": 0.58203125, + "learning_rate": 3.657776846685057e-06, + "loss": 1.5287, + "step": 8345 + }, + { + "epoch": 1.4398343828172173, + "grad_norm": 0.5703125, + "learning_rate": 3.655672658964019e-06, + "loss": 1.4182, + "step": 8346 + }, + { + "epoch": 1.4400069007159493, + "grad_norm": 0.58984375, + "learning_rate": 3.6535689412703923e-06, + "loss": 1.4253, + "step": 8347 + }, + { + "epoch": 1.4401794186146812, + "grad_norm": 0.57421875, + "learning_rate": 3.651465693760037e-06, + "loss": 1.4129, + "step": 8348 + }, + { + "epoch": 1.4403519365134132, + "grad_norm": 1.4765625, + "learning_rate": 3.6493629165887623e-06, + "loss": 1.3906, + "step": 8349 + }, + { + "epoch": 1.4405244544121452, + "grad_norm": 0.57421875, + "learning_rate": 3.647260609912371e-06, + "loss": 1.5282, + "step": 8350 + }, + { + "epoch": 1.4406969723108771, + "grad_norm": 0.6328125, + "learning_rate": 3.6451587738866035e-06, + "loss": 1.4269, + "step": 8351 + }, + { + "epoch": 1.4408694902096093, + "grad_norm": 0.59375, + "learning_rate": 3.643057408667181e-06, + "loss": 1.4873, + "step": 8352 + }, + { + "epoch": 1.4410420081083413, + "grad_norm": 0.61328125, + "learning_rate": 3.6409565144097856e-06, + "loss": 1.4226, + "step": 8353 + }, + { + "epoch": 1.4412145260070732, + "grad_norm": 0.58203125, + "learning_rate": 3.6388560912700642e-06, + "loss": 1.3989, + "step": 8354 + }, + { + "epoch": 1.4413870439058052, + "grad_norm": 0.62109375, + "learning_rate": 3.636756139403631e-06, + "loss": 1.4151, + "step": 8355 + }, + { + "epoch": 1.4415595618045371, + "grad_norm": 0.62109375, + "learning_rate": 3.634656658966066e-06, + "loss": 1.4418, + "step": 8356 + }, + { + "epoch": 1.4417320797032693, + "grad_norm": 0.609375, + "learning_rate": 3.6325576501129002e-06, + "loss": 1.3659, + "step": 8357 + }, + { + "epoch": 1.4419045976020013, + "grad_norm": 0.58984375, + "learning_rate": 3.630459112999657e-06, + "loss": 1.3226, + "step": 8358 + }, + { + "epoch": 1.4420771155007333, + "grad_norm": 0.59765625, + "learning_rate": 3.6283610477817975e-06, + "loss": 1.5103, + "step": 8359 + }, + { + "epoch": 1.4422496333994652, + "grad_norm": 0.66796875, + "learning_rate": 3.6262634546147635e-06, + "loss": 1.4522, + "step": 8360 + }, + { + "epoch": 1.4424221512981972, + "grad_norm": 0.5859375, + "learning_rate": 3.624166333653958e-06, + "loss": 1.4022, + "step": 8361 + }, + { + "epoch": 1.4425946691969291, + "grad_norm": 0.56640625, + "learning_rate": 3.622069685054749e-06, + "loss": 1.547, + "step": 8362 + }, + { + "epoch": 1.442767187095661, + "grad_norm": 0.6640625, + "learning_rate": 3.6199735089724676e-06, + "loss": 1.4204, + "step": 8363 + }, + { + "epoch": 1.442939704994393, + "grad_norm": 0.5703125, + "learning_rate": 3.617877805562413e-06, + "loss": 1.376, + "step": 8364 + }, + { + "epoch": 1.4431122228931252, + "grad_norm": 0.60546875, + "learning_rate": 3.6157825749798493e-06, + "loss": 1.4068, + "step": 8365 + }, + { + "epoch": 1.4432847407918572, + "grad_norm": 0.546875, + "learning_rate": 3.613687817380002e-06, + "loss": 1.3867, + "step": 8366 + }, + { + "epoch": 1.4434572586905892, + "grad_norm": 0.5546875, + "learning_rate": 3.611593532918064e-06, + "loss": 1.3909, + "step": 8367 + }, + { + "epoch": 1.4436297765893211, + "grad_norm": 0.66015625, + "learning_rate": 3.6094997217491944e-06, + "loss": 1.4072, + "step": 8368 + }, + { + "epoch": 1.443802294488053, + "grad_norm": 0.56640625, + "learning_rate": 3.607406384028519e-06, + "loss": 1.4459, + "step": 8369 + }, + { + "epoch": 1.4439748123867853, + "grad_norm": 0.87109375, + "learning_rate": 3.6053135199111144e-06, + "loss": 1.398, + "step": 8370 + }, + { + "epoch": 1.4441473302855172, + "grad_norm": 0.58984375, + "learning_rate": 3.603221129552047e-06, + "loss": 1.5272, + "step": 8371 + }, + { + "epoch": 1.4443198481842492, + "grad_norm": 0.6796875, + "learning_rate": 3.6011292131063227e-06, + "loss": 1.467, + "step": 8372 + }, + { + "epoch": 1.4444923660829811, + "grad_norm": 0.609375, + "learning_rate": 3.5990377707289292e-06, + "loss": 1.4917, + "step": 8373 + }, + { + "epoch": 1.444664883981713, + "grad_norm": 0.58203125, + "learning_rate": 3.5969468025748135e-06, + "loss": 1.4118, + "step": 8374 + }, + { + "epoch": 1.444837401880445, + "grad_norm": 0.625, + "learning_rate": 3.594856308798885e-06, + "loss": 1.3988, + "step": 8375 + }, + { + "epoch": 1.445009919779177, + "grad_norm": 0.6640625, + "learning_rate": 3.5927662895560235e-06, + "loss": 1.562, + "step": 8376 + }, + { + "epoch": 1.445182437677909, + "grad_norm": 0.65234375, + "learning_rate": 3.5906767450010694e-06, + "loss": 1.4244, + "step": 8377 + }, + { + "epoch": 1.445354955576641, + "grad_norm": 0.5703125, + "learning_rate": 3.5885876752888295e-06, + "loss": 1.3598, + "step": 8378 + }, + { + "epoch": 1.4455274734753731, + "grad_norm": 0.5703125, + "learning_rate": 3.586499080574074e-06, + "loss": 1.4699, + "step": 8379 + }, + { + "epoch": 1.445699991374105, + "grad_norm": 0.59765625, + "learning_rate": 3.584410961011544e-06, + "loss": 1.5031, + "step": 8380 + }, + { + "epoch": 1.445872509272837, + "grad_norm": 0.625, + "learning_rate": 3.58232331675593e-06, + "loss": 1.3617, + "step": 8381 + }, + { + "epoch": 1.446045027171569, + "grad_norm": 0.6015625, + "learning_rate": 3.580236147961911e-06, + "loss": 1.4913, + "step": 8382 + }, + { + "epoch": 1.446217545070301, + "grad_norm": 0.6484375, + "learning_rate": 3.5781494547841066e-06, + "loss": 1.4094, + "step": 8383 + }, + { + "epoch": 1.4463900629690332, + "grad_norm": 0.6328125, + "learning_rate": 3.5760632373771163e-06, + "loss": 1.3552, + "step": 8384 + }, + { + "epoch": 1.4465625808677651, + "grad_norm": 0.62109375, + "learning_rate": 3.573977495895501e-06, + "loss": 1.4264, + "step": 8385 + }, + { + "epoch": 1.446735098766497, + "grad_norm": 0.546875, + "learning_rate": 3.571892230493783e-06, + "loss": 1.4048, + "step": 8386 + }, + { + "epoch": 1.446907616665229, + "grad_norm": 0.59375, + "learning_rate": 3.569807441326454e-06, + "loss": 1.384, + "step": 8387 + }, + { + "epoch": 1.447080134563961, + "grad_norm": 0.609375, + "learning_rate": 3.567723128547971e-06, + "loss": 1.3813, + "step": 8388 + }, + { + "epoch": 1.447252652462693, + "grad_norm": 0.57421875, + "learning_rate": 3.5656392923127424e-06, + "loss": 1.3643, + "step": 8389 + }, + { + "epoch": 1.447425170361425, + "grad_norm": 0.57421875, + "learning_rate": 3.5635559327751655e-06, + "loss": 1.4991, + "step": 8390 + }, + { + "epoch": 1.447597688260157, + "grad_norm": 0.61328125, + "learning_rate": 3.561473050089579e-06, + "loss": 1.4303, + "step": 8391 + }, + { + "epoch": 1.4477702061588889, + "grad_norm": 0.62109375, + "learning_rate": 3.559390644410298e-06, + "loss": 1.4665, + "step": 8392 + }, + { + "epoch": 1.447942724057621, + "grad_norm": 0.57421875, + "learning_rate": 3.557308715891601e-06, + "loss": 1.4102, + "step": 8393 + }, + { + "epoch": 1.448115241956353, + "grad_norm": 0.63671875, + "learning_rate": 3.555227264687726e-06, + "loss": 1.5032, + "step": 8394 + }, + { + "epoch": 1.448287759855085, + "grad_norm": 0.58203125, + "learning_rate": 3.5531462909528925e-06, + "loss": 1.4227, + "step": 8395 + }, + { + "epoch": 1.448460277753817, + "grad_norm": 0.5703125, + "learning_rate": 3.5510657948412587e-06, + "loss": 1.3451, + "step": 8396 + }, + { + "epoch": 1.4486327956525489, + "grad_norm": 0.7265625, + "learning_rate": 3.548985776506967e-06, + "loss": 1.4781, + "step": 8397 + }, + { + "epoch": 1.448805313551281, + "grad_norm": 0.59765625, + "learning_rate": 3.5469062361041152e-06, + "loss": 1.4004, + "step": 8398 + }, + { + "epoch": 1.448977831450013, + "grad_norm": 0.6328125, + "learning_rate": 3.5448271737867713e-06, + "loss": 1.4395, + "step": 8399 + }, + { + "epoch": 1.449150349348745, + "grad_norm": 0.62109375, + "learning_rate": 3.542748589708963e-06, + "loss": 1.4146, + "step": 8400 + }, + { + "epoch": 1.449150349348745, + "eval_loss": 1.4072437286376953, + "eval_runtime": 10.8573, + "eval_samples_per_second": 94.315, + "eval_steps_per_second": 23.579, + "step": 8400 + }, + { + "epoch": 1.449322867247477, + "grad_norm": 0.54296875, + "learning_rate": 3.540670484024691e-06, + "loss": 1.2855, + "step": 8401 + }, + { + "epoch": 1.449495385146209, + "grad_norm": 0.578125, + "learning_rate": 3.5385928568879012e-06, + "loss": 1.3483, + "step": 8402 + }, + { + "epoch": 1.4496679030449409, + "grad_norm": 0.64453125, + "learning_rate": 3.5365157084525326e-06, + "loss": 1.48, + "step": 8403 + }, + { + "epoch": 1.4498404209436728, + "grad_norm": 0.58203125, + "learning_rate": 3.5344390388724625e-06, + "loss": 1.388, + "step": 8404 + }, + { + "epoch": 1.4500129388424048, + "grad_norm": 0.61328125, + "learning_rate": 3.532362848301547e-06, + "loss": 1.4408, + "step": 8405 + }, + { + "epoch": 1.450185456741137, + "grad_norm": 0.57421875, + "learning_rate": 3.5302871368936043e-06, + "loss": 1.3503, + "step": 8406 + }, + { + "epoch": 1.450357974639869, + "grad_norm": 0.59765625, + "learning_rate": 3.5282119048024146e-06, + "loss": 1.3585, + "step": 8407 + }, + { + "epoch": 1.450530492538601, + "grad_norm": 0.70703125, + "learning_rate": 3.5261371521817247e-06, + "loss": 1.4256, + "step": 8408 + }, + { + "epoch": 1.4507030104373329, + "grad_norm": 0.57421875, + "learning_rate": 3.524062879185247e-06, + "loss": 1.333, + "step": 8409 + }, + { + "epoch": 1.4508755283360648, + "grad_norm": 0.6953125, + "learning_rate": 3.5219890859666493e-06, + "loss": 1.5221, + "step": 8410 + }, + { + "epoch": 1.451048046234797, + "grad_norm": 0.59375, + "learning_rate": 3.519915772679581e-06, + "loss": 1.431, + "step": 8411 + }, + { + "epoch": 1.451220564133529, + "grad_norm": 0.875, + "learning_rate": 3.5178429394776436e-06, + "loss": 1.5443, + "step": 8412 + }, + { + "epoch": 1.451393082032261, + "grad_norm": 0.578125, + "learning_rate": 3.5157705865143964e-06, + "loss": 1.4704, + "step": 8413 + }, + { + "epoch": 1.4515655999309929, + "grad_norm": 0.8203125, + "learning_rate": 3.5136987139433874e-06, + "loss": 1.5263, + "step": 8414 + }, + { + "epoch": 1.4517381178297248, + "grad_norm": 0.60546875, + "learning_rate": 3.511627321918102e-06, + "loss": 1.4345, + "step": 8415 + }, + { + "epoch": 1.4519106357284568, + "grad_norm": 0.5703125, + "learning_rate": 3.5095564105920065e-06, + "loss": 1.4462, + "step": 8416 + }, + { + "epoch": 1.4520831536271888, + "grad_norm": 0.62890625, + "learning_rate": 3.5074859801185256e-06, + "loss": 1.4408, + "step": 8417 + }, + { + "epoch": 1.4522556715259207, + "grad_norm": 0.55078125, + "learning_rate": 3.505416030651051e-06, + "loss": 1.3899, + "step": 8418 + }, + { + "epoch": 1.4524281894246527, + "grad_norm": 0.55078125, + "learning_rate": 3.5033465623429362e-06, + "loss": 1.4151, + "step": 8419 + }, + { + "epoch": 1.4526007073233849, + "grad_norm": 0.59765625, + "learning_rate": 3.501277575347505e-06, + "loss": 1.4596, + "step": 8420 + }, + { + "epoch": 1.4527732252221168, + "grad_norm": 0.5859375, + "learning_rate": 3.4992090698180293e-06, + "loss": 1.3831, + "step": 8421 + }, + { + "epoch": 1.4529457431208488, + "grad_norm": 0.609375, + "learning_rate": 3.4971410459077716e-06, + "loss": 1.4109, + "step": 8422 + }, + { + "epoch": 1.4531182610195807, + "grad_norm": 0.578125, + "learning_rate": 3.4950735037699334e-06, + "loss": 1.3633, + "step": 8423 + }, + { + "epoch": 1.4532907789183127, + "grad_norm": 0.57421875, + "learning_rate": 3.493006443557696e-06, + "loss": 1.5085, + "step": 8424 + }, + { + "epoch": 1.453463296817045, + "grad_norm": 0.6328125, + "learning_rate": 3.490939865424198e-06, + "loss": 1.4088, + "step": 8425 + }, + { + "epoch": 1.4536358147157769, + "grad_norm": 0.625, + "learning_rate": 3.4888737695225416e-06, + "loss": 1.4707, + "step": 8426 + }, + { + "epoch": 1.4538083326145088, + "grad_norm": 0.60546875, + "learning_rate": 3.4868081560058066e-06, + "loss": 1.4511, + "step": 8427 + }, + { + "epoch": 1.4539808505132408, + "grad_norm": 0.5546875, + "learning_rate": 3.4847430250270165e-06, + "loss": 1.428, + "step": 8428 + }, + { + "epoch": 1.4541533684119727, + "grad_norm": 0.6171875, + "learning_rate": 3.4826783767391727e-06, + "loss": 1.5579, + "step": 8429 + }, + { + "epoch": 1.4543258863107047, + "grad_norm": 0.70703125, + "learning_rate": 3.4806142112952356e-06, + "loss": 1.5196, + "step": 8430 + }, + { + "epoch": 1.4544984042094367, + "grad_norm": 0.6484375, + "learning_rate": 3.478550528848134e-06, + "loss": 1.4335, + "step": 8431 + }, + { + "epoch": 1.4546709221081686, + "grad_norm": 0.578125, + "learning_rate": 3.4764873295507563e-06, + "loss": 1.4063, + "step": 8432 + }, + { + "epoch": 1.4548434400069006, + "grad_norm": 0.60546875, + "learning_rate": 3.4744246135559623e-06, + "loss": 1.3783, + "step": 8433 + }, + { + "epoch": 1.4550159579056328, + "grad_norm": 0.61328125, + "learning_rate": 3.4723623810165584e-06, + "loss": 1.4814, + "step": 8434 + }, + { + "epoch": 1.4551884758043647, + "grad_norm": 0.64453125, + "learning_rate": 3.470300632085344e-06, + "loss": 1.4284, + "step": 8435 + }, + { + "epoch": 1.4553609937030967, + "grad_norm": 0.6796875, + "learning_rate": 3.4682393669150547e-06, + "loss": 1.4607, + "step": 8436 + }, + { + "epoch": 1.4555335116018286, + "grad_norm": 0.71875, + "learning_rate": 3.466178585658405e-06, + "loss": 1.454, + "step": 8437 + }, + { + "epoch": 1.4557060295005606, + "grad_norm": 0.5703125, + "learning_rate": 3.464118288468071e-06, + "loss": 1.4989, + "step": 8438 + }, + { + "epoch": 1.4558785473992928, + "grad_norm": 0.57421875, + "learning_rate": 3.462058475496692e-06, + "loss": 1.4011, + "step": 8439 + }, + { + "epoch": 1.4560510652980247, + "grad_norm": 0.70703125, + "learning_rate": 3.459999146896873e-06, + "loss": 1.4045, + "step": 8440 + }, + { + "epoch": 1.4562235831967567, + "grad_norm": 0.5703125, + "learning_rate": 3.4579403028211835e-06, + "loss": 1.3958, + "step": 8441 + }, + { + "epoch": 1.4563961010954887, + "grad_norm": 0.6015625, + "learning_rate": 3.4558819434221456e-06, + "loss": 1.4219, + "step": 8442 + }, + { + "epoch": 1.4565686189942206, + "grad_norm": 0.5390625, + "learning_rate": 3.4538240688522684e-06, + "loss": 1.3098, + "step": 8443 + }, + { + "epoch": 1.4567411368929526, + "grad_norm": 0.58203125, + "learning_rate": 3.451766679264008e-06, + "loss": 1.4552, + "step": 8444 + }, + { + "epoch": 1.4569136547916846, + "grad_norm": 0.58984375, + "learning_rate": 3.449709774809782e-06, + "loss": 1.3745, + "step": 8445 + }, + { + "epoch": 1.4570861726904165, + "grad_norm": 0.63671875, + "learning_rate": 3.447653355641989e-06, + "loss": 1.4049, + "step": 8446 + }, + { + "epoch": 1.4572586905891487, + "grad_norm": 0.58203125, + "learning_rate": 3.445597421912974e-06, + "loss": 1.4325, + "step": 8447 + }, + { + "epoch": 1.4574312084878807, + "grad_norm": 0.63671875, + "learning_rate": 3.4435419737750566e-06, + "loss": 1.5556, + "step": 8448 + }, + { + "epoch": 1.4576037263866126, + "grad_norm": 0.58203125, + "learning_rate": 3.4414870113805156e-06, + "loss": 1.4107, + "step": 8449 + }, + { + "epoch": 1.4577762442853446, + "grad_norm": 0.61328125, + "learning_rate": 3.439432534881596e-06, + "loss": 1.4473, + "step": 8450 + }, + { + "epoch": 1.4579487621840765, + "grad_norm": 0.62109375, + "learning_rate": 3.437378544430506e-06, + "loss": 1.4035, + "step": 8451 + }, + { + "epoch": 1.4581212800828087, + "grad_norm": 0.5703125, + "learning_rate": 3.4353250401794238e-06, + "loss": 1.4244, + "step": 8452 + }, + { + "epoch": 1.4582937979815407, + "grad_norm": 0.578125, + "learning_rate": 3.433272022280472e-06, + "loss": 1.508, + "step": 8453 + }, + { + "epoch": 1.4584663158802726, + "grad_norm": 0.6015625, + "learning_rate": 3.431219490885768e-06, + "loss": 1.399, + "step": 8454 + }, + { + "epoch": 1.4586388337790046, + "grad_norm": 0.546875, + "learning_rate": 3.429167446147359e-06, + "loss": 1.473, + "step": 8455 + }, + { + "epoch": 1.4588113516777366, + "grad_norm": 0.7109375, + "learning_rate": 3.4271158882172904e-06, + "loss": 1.3995, + "step": 8456 + }, + { + "epoch": 1.4589838695764685, + "grad_norm": 0.55859375, + "learning_rate": 3.425064817247542e-06, + "loss": 1.4109, + "step": 8457 + }, + { + "epoch": 1.4591563874752005, + "grad_norm": 0.609375, + "learning_rate": 3.4230142333900705e-06, + "loss": 1.3566, + "step": 8458 + }, + { + "epoch": 1.4593289053739325, + "grad_norm": 0.6015625, + "learning_rate": 3.420964136796807e-06, + "loss": 1.4539, + "step": 8459 + }, + { + "epoch": 1.4595014232726644, + "grad_norm": 0.578125, + "learning_rate": 3.4189145276196244e-06, + "loss": 1.3497, + "step": 8460 + }, + { + "epoch": 1.4596739411713966, + "grad_norm": 0.59375, + "learning_rate": 3.4168654060103735e-06, + "loss": 1.5443, + "step": 8461 + }, + { + "epoch": 1.4598464590701286, + "grad_norm": 0.58984375, + "learning_rate": 3.414816772120867e-06, + "loss": 1.4199, + "step": 8462 + }, + { + "epoch": 1.4600189769688605, + "grad_norm": 0.66015625, + "learning_rate": 3.4127686261028803e-06, + "loss": 1.4814, + "step": 8463 + }, + { + "epoch": 1.4601914948675925, + "grad_norm": 0.59765625, + "learning_rate": 3.410720968108153e-06, + "loss": 1.5426, + "step": 8464 + }, + { + "epoch": 1.4603640127663244, + "grad_norm": 0.58984375, + "learning_rate": 3.408673798288391e-06, + "loss": 1.4995, + "step": 8465 + }, + { + "epoch": 1.4605365306650566, + "grad_norm": 0.60546875, + "learning_rate": 3.4066271167952524e-06, + "loss": 1.4793, + "step": 8466 + }, + { + "epoch": 1.4607090485637886, + "grad_norm": 0.60546875, + "learning_rate": 3.4045809237803794e-06, + "loss": 1.442, + "step": 8467 + }, + { + "epoch": 1.4608815664625205, + "grad_norm": 0.71484375, + "learning_rate": 3.40253521939536e-06, + "loss": 1.5188, + "step": 8468 + }, + { + "epoch": 1.4610540843612525, + "grad_norm": 0.55859375, + "learning_rate": 3.4004900037917545e-06, + "loss": 1.4222, + "step": 8469 + }, + { + "epoch": 1.4612266022599845, + "grad_norm": 0.5625, + "learning_rate": 3.398445277121084e-06, + "loss": 1.5028, + "step": 8470 + }, + { + "epoch": 1.4613991201587164, + "grad_norm": 0.6484375, + "learning_rate": 3.3964010395348355e-06, + "loss": 1.3889, + "step": 8471 + }, + { + "epoch": 1.4615716380574484, + "grad_norm": 0.58203125, + "learning_rate": 3.3943572911844603e-06, + "loss": 1.3595, + "step": 8472 + }, + { + "epoch": 1.4617441559561803, + "grad_norm": 0.7265625, + "learning_rate": 3.3923140322213744e-06, + "loss": 1.3079, + "step": 8473 + }, + { + "epoch": 1.4619166738549125, + "grad_norm": 0.625, + "learning_rate": 3.3902712627969446e-06, + "loss": 1.4147, + "step": 8474 + }, + { + "epoch": 1.4620891917536445, + "grad_norm": 0.5546875, + "learning_rate": 3.3882289830625227e-06, + "loss": 1.3372, + "step": 8475 + }, + { + "epoch": 1.4622617096523765, + "grad_norm": 0.65625, + "learning_rate": 3.3861871931694124e-06, + "loss": 1.4014, + "step": 8476 + }, + { + "epoch": 1.4624342275511084, + "grad_norm": 0.5625, + "learning_rate": 3.3841458932688744e-06, + "loss": 1.5151, + "step": 8477 + }, + { + "epoch": 1.4626067454498404, + "grad_norm": 0.625, + "learning_rate": 3.382105083512152e-06, + "loss": 1.5069, + "step": 8478 + }, + { + "epoch": 1.4627792633485723, + "grad_norm": 0.6015625, + "learning_rate": 3.3800647640504325e-06, + "loss": 1.3539, + "step": 8479 + }, + { + "epoch": 1.4629517812473045, + "grad_norm": 0.546875, + "learning_rate": 3.378024935034879e-06, + "loss": 1.3442, + "step": 8480 + }, + { + "epoch": 1.4631242991460365, + "grad_norm": 0.59765625, + "learning_rate": 3.3759855966166144e-06, + "loss": 1.4406, + "step": 8481 + }, + { + "epoch": 1.4632968170447684, + "grad_norm": 0.546875, + "learning_rate": 3.3739467489467247e-06, + "loss": 1.446, + "step": 8482 + }, + { + "epoch": 1.4634693349435004, + "grad_norm": 0.6640625, + "learning_rate": 3.371908392176262e-06, + "loss": 1.5256, + "step": 8483 + }, + { + "epoch": 1.4636418528422324, + "grad_norm": 0.63671875, + "learning_rate": 3.3698705264562426e-06, + "loss": 1.4524, + "step": 8484 + }, + { + "epoch": 1.4638143707409643, + "grad_norm": 0.546875, + "learning_rate": 3.367833151937636e-06, + "loss": 1.4549, + "step": 8485 + }, + { + "epoch": 1.4639868886396963, + "grad_norm": 0.62109375, + "learning_rate": 3.365796268771395e-06, + "loss": 1.4497, + "step": 8486 + }, + { + "epoch": 1.4641594065384282, + "grad_norm": 0.59375, + "learning_rate": 3.3637598771084125e-06, + "loss": 1.3537, + "step": 8487 + }, + { + "epoch": 1.4643319244371604, + "grad_norm": 0.62890625, + "learning_rate": 3.3617239770995704e-06, + "loss": 1.3475, + "step": 8488 + }, + { + "epoch": 1.4645044423358924, + "grad_norm": 0.61328125, + "learning_rate": 3.3596885688956893e-06, + "loss": 1.3748, + "step": 8489 + }, + { + "epoch": 1.4646769602346243, + "grad_norm": 0.59765625, + "learning_rate": 3.3576536526475678e-06, + "loss": 1.3505, + "step": 8490 + }, + { + "epoch": 1.4648494781333563, + "grad_norm": 0.625, + "learning_rate": 3.355619228505973e-06, + "loss": 1.308, + "step": 8491 + }, + { + "epoch": 1.4650219960320883, + "grad_norm": 0.55859375, + "learning_rate": 3.3535852966216176e-06, + "loss": 1.3639, + "step": 8492 + }, + { + "epoch": 1.4651945139308205, + "grad_norm": 0.6484375, + "learning_rate": 3.3515518571451933e-06, + "loss": 1.5198, + "step": 8493 + }, + { + "epoch": 1.4653670318295524, + "grad_norm": 0.5859375, + "learning_rate": 3.3495189102273484e-06, + "loss": 1.4561, + "step": 8494 + }, + { + "epoch": 1.4655395497282844, + "grad_norm": 0.56640625, + "learning_rate": 3.347486456018697e-06, + "loss": 1.3444, + "step": 8495 + }, + { + "epoch": 1.4657120676270163, + "grad_norm": 0.61328125, + "learning_rate": 3.3454544946698153e-06, + "loss": 1.4556, + "step": 8496 + }, + { + "epoch": 1.4658845855257483, + "grad_norm": 0.59375, + "learning_rate": 3.3434230263312483e-06, + "loss": 1.4214, + "step": 8497 + }, + { + "epoch": 1.4660571034244803, + "grad_norm": 0.65625, + "learning_rate": 3.3413920511534883e-06, + "loss": 1.4896, + "step": 8498 + }, + { + "epoch": 1.4662296213232122, + "grad_norm": 0.5859375, + "learning_rate": 3.3393615692870175e-06, + "loss": 1.3647, + "step": 8499 + }, + { + "epoch": 1.4664021392219442, + "grad_norm": 0.5859375, + "learning_rate": 3.3373315808822547e-06, + "loss": 1.4469, + "step": 8500 + }, + { + "epoch": 1.4664021392219442, + "eval_loss": 1.4072179794311523, + "eval_runtime": 10.9263, + "eval_samples_per_second": 93.719, + "eval_steps_per_second": 23.43, + "step": 8500 + }, + { + "epoch": 1.4665746571206761, + "grad_norm": 0.64453125, + "learning_rate": 3.3353020860896002e-06, + "loss": 1.4388, + "step": 8501 + }, + { + "epoch": 1.4667471750194083, + "grad_norm": 0.60546875, + "learning_rate": 3.3332730850594088e-06, + "loss": 1.5377, + "step": 8502 + }, + { + "epoch": 1.4669196929181403, + "grad_norm": 0.62890625, + "learning_rate": 3.3312445779420033e-06, + "loss": 1.3911, + "step": 8503 + }, + { + "epoch": 1.4670922108168722, + "grad_norm": 0.5703125, + "learning_rate": 3.3292165648876683e-06, + "loss": 1.4262, + "step": 8504 + }, + { + "epoch": 1.4672647287156042, + "grad_norm": 0.58203125, + "learning_rate": 3.3271890460466537e-06, + "loss": 1.417, + "step": 8505 + }, + { + "epoch": 1.4674372466143362, + "grad_norm": 0.68359375, + "learning_rate": 3.3251620215691614e-06, + "loss": 1.3963, + "step": 8506 + }, + { + "epoch": 1.4676097645130683, + "grad_norm": 0.640625, + "learning_rate": 3.323135491605377e-06, + "loss": 1.4469, + "step": 8507 + }, + { + "epoch": 1.4677822824118003, + "grad_norm": 0.578125, + "learning_rate": 3.321109456305438e-06, + "loss": 1.5031, + "step": 8508 + }, + { + "epoch": 1.4679548003105323, + "grad_norm": 0.55078125, + "learning_rate": 3.3190839158194344e-06, + "loss": 1.3735, + "step": 8509 + }, + { + "epoch": 1.4681273182092642, + "grad_norm": 0.58984375, + "learning_rate": 3.317058870297446e-06, + "loss": 1.4582, + "step": 8510 + }, + { + "epoch": 1.4682998361079962, + "grad_norm": 0.53515625, + "learning_rate": 3.3150343198894897e-06, + "loss": 1.3811, + "step": 8511 + }, + { + "epoch": 1.4684723540067282, + "grad_norm": 0.5625, + "learning_rate": 3.3130102647455608e-06, + "loss": 1.4364, + "step": 8512 + }, + { + "epoch": 1.4686448719054601, + "grad_norm": 0.640625, + "learning_rate": 3.310986705015613e-06, + "loss": 1.3751, + "step": 8513 + }, + { + "epoch": 1.468817389804192, + "grad_norm": 0.57421875, + "learning_rate": 3.3089636408495662e-06, + "loss": 1.504, + "step": 8514 + }, + { + "epoch": 1.4689899077029243, + "grad_norm": 0.65625, + "learning_rate": 3.3069410723972995e-06, + "loss": 1.5245, + "step": 8515 + }, + { + "epoch": 1.4691624256016562, + "grad_norm": 0.6015625, + "learning_rate": 3.3049189998086584e-06, + "loss": 1.4545, + "step": 8516 + }, + { + "epoch": 1.4693349435003882, + "grad_norm": 0.5859375, + "learning_rate": 3.302897423233451e-06, + "loss": 1.3767, + "step": 8517 + }, + { + "epoch": 1.4695074613991201, + "grad_norm": 0.80859375, + "learning_rate": 3.300876342821451e-06, + "loss": 1.3456, + "step": 8518 + }, + { + "epoch": 1.469679979297852, + "grad_norm": 0.63671875, + "learning_rate": 3.2988557587223837e-06, + "loss": 1.4042, + "step": 8519 + }, + { + "epoch": 1.469852497196584, + "grad_norm": 0.61328125, + "learning_rate": 3.296835671085957e-06, + "loss": 1.4078, + "step": 8520 + }, + { + "epoch": 1.4700250150953162, + "grad_norm": 0.6640625, + "learning_rate": 3.2948160800618254e-06, + "loss": 1.3702, + "step": 8521 + }, + { + "epoch": 1.4701975329940482, + "grad_norm": 0.5703125, + "learning_rate": 3.292796985799611e-06, + "loss": 1.4343, + "step": 8522 + }, + { + "epoch": 1.4703700508927802, + "grad_norm": 0.5859375, + "learning_rate": 3.290778388448912e-06, + "loss": 1.4191, + "step": 8523 + }, + { + "epoch": 1.4705425687915121, + "grad_norm": 0.55078125, + "learning_rate": 3.288760288159266e-06, + "loss": 1.3871, + "step": 8524 + }, + { + "epoch": 1.470715086690244, + "grad_norm": 0.57421875, + "learning_rate": 3.2867426850801932e-06, + "loss": 1.4247, + "step": 8525 + }, + { + "epoch": 1.470887604588976, + "grad_norm": 0.5703125, + "learning_rate": 3.2847255793611674e-06, + "loss": 1.3644, + "step": 8526 + }, + { + "epoch": 1.471060122487708, + "grad_norm": 0.58203125, + "learning_rate": 3.2827089711516303e-06, + "loss": 1.4524, + "step": 8527 + }, + { + "epoch": 1.47123264038644, + "grad_norm": 0.55859375, + "learning_rate": 3.2806928606009836e-06, + "loss": 1.4674, + "step": 8528 + }, + { + "epoch": 1.4714051582851722, + "grad_norm": 0.62109375, + "learning_rate": 3.278677247858598e-06, + "loss": 1.4114, + "step": 8529 + }, + { + "epoch": 1.4715776761839041, + "grad_norm": 0.58203125, + "learning_rate": 3.2766621330737923e-06, + "loss": 1.3431, + "step": 8530 + }, + { + "epoch": 1.471750194082636, + "grad_norm": 0.60546875, + "learning_rate": 3.2746475163958714e-06, + "loss": 1.4278, + "step": 8531 + }, + { + "epoch": 1.471922711981368, + "grad_norm": 0.62890625, + "learning_rate": 3.272633397974081e-06, + "loss": 1.4383, + "step": 8532 + }, + { + "epoch": 1.4720952298801, + "grad_norm": 0.6015625, + "learning_rate": 3.270619777957642e-06, + "loss": 1.4291, + "step": 8533 + }, + { + "epoch": 1.4722677477788322, + "grad_norm": 0.6171875, + "learning_rate": 3.2686066564957387e-06, + "loss": 1.3628, + "step": 8534 + }, + { + "epoch": 1.4724402656775641, + "grad_norm": 0.640625, + "learning_rate": 3.2665940337375125e-06, + "loss": 1.4406, + "step": 8535 + }, + { + "epoch": 1.472612783576296, + "grad_norm": 0.61328125, + "learning_rate": 3.264581909832073e-06, + "loss": 1.4555, + "step": 8536 + }, + { + "epoch": 1.472785301475028, + "grad_norm": 0.6953125, + "learning_rate": 3.2625702849284947e-06, + "loss": 1.4099, + "step": 8537 + }, + { + "epoch": 1.47295781937376, + "grad_norm": 0.90625, + "learning_rate": 3.2605591591758e-06, + "loss": 1.4453, + "step": 8538 + }, + { + "epoch": 1.473130337272492, + "grad_norm": 0.59375, + "learning_rate": 3.258548532722995e-06, + "loss": 1.4416, + "step": 8539 + }, + { + "epoch": 1.473302855171224, + "grad_norm": 0.66015625, + "learning_rate": 3.256538405719042e-06, + "loss": 1.3456, + "step": 8540 + }, + { + "epoch": 1.473475373069956, + "grad_norm": 0.609375, + "learning_rate": 3.254528778312852e-06, + "loss": 1.4016, + "step": 8541 + }, + { + "epoch": 1.4736478909686879, + "grad_norm": 0.6171875, + "learning_rate": 3.2525196506533243e-06, + "loss": 1.4421, + "step": 8542 + }, + { + "epoch": 1.47382040886742, + "grad_norm": 0.60546875, + "learning_rate": 3.250511022889293e-06, + "loss": 1.4036, + "step": 8543 + }, + { + "epoch": 1.473992926766152, + "grad_norm": 0.56640625, + "learning_rate": 3.248502895169586e-06, + "loss": 1.3537, + "step": 8544 + }, + { + "epoch": 1.474165444664884, + "grad_norm": 0.5703125, + "learning_rate": 3.246495267642967e-06, + "loss": 1.5216, + "step": 8545 + }, + { + "epoch": 1.474337962563616, + "grad_norm": 0.62109375, + "learning_rate": 3.244488140458175e-06, + "loss": 1.4198, + "step": 8546 + }, + { + "epoch": 1.474510480462348, + "grad_norm": 0.62890625, + "learning_rate": 3.2424815137639132e-06, + "loss": 1.3578, + "step": 8547 + }, + { + "epoch": 1.47468299836108, + "grad_norm": 0.62109375, + "learning_rate": 3.2404753877088437e-06, + "loss": 1.4625, + "step": 8548 + }, + { + "epoch": 1.474855516259812, + "grad_norm": 0.66015625, + "learning_rate": 3.2384697624415915e-06, + "loss": 1.4955, + "step": 8549 + }, + { + "epoch": 1.475028034158544, + "grad_norm": 0.55078125, + "learning_rate": 3.236464638110752e-06, + "loss": 1.3116, + "step": 8550 + }, + { + "epoch": 1.475200552057276, + "grad_norm": 0.68359375, + "learning_rate": 3.2344600148648652e-06, + "loss": 1.4343, + "step": 8551 + }, + { + "epoch": 1.475373069956008, + "grad_norm": 0.66015625, + "learning_rate": 3.232455892852461e-06, + "loss": 1.4746, + "step": 8552 + }, + { + "epoch": 1.4755455878547399, + "grad_norm": 0.73046875, + "learning_rate": 3.230452272222007e-06, + "loss": 1.3621, + "step": 8553 + }, + { + "epoch": 1.4757181057534718, + "grad_norm": 0.55078125, + "learning_rate": 3.228449153121942e-06, + "loss": 1.3907, + "step": 8554 + }, + { + "epoch": 1.4758906236522038, + "grad_norm": 0.6171875, + "learning_rate": 3.2264465357006814e-06, + "loss": 1.5396, + "step": 8555 + }, + { + "epoch": 1.476063141550936, + "grad_norm": 0.578125, + "learning_rate": 3.2244444201065815e-06, + "loss": 1.4031, + "step": 8556 + }, + { + "epoch": 1.476235659449668, + "grad_norm": 0.5703125, + "learning_rate": 3.2224428064879743e-06, + "loss": 1.3998, + "step": 8557 + }, + { + "epoch": 1.4764081773484, + "grad_norm": 0.578125, + "learning_rate": 3.2204416949931516e-06, + "loss": 1.4665, + "step": 8558 + }, + { + "epoch": 1.4765806952471319, + "grad_norm": 0.5625, + "learning_rate": 3.2184410857703695e-06, + "loss": 1.4619, + "step": 8559 + }, + { + "epoch": 1.4767532131458638, + "grad_norm": 1.0234375, + "learning_rate": 3.2164409789678453e-06, + "loss": 1.4198, + "step": 8560 + }, + { + "epoch": 1.476925731044596, + "grad_norm": 0.625, + "learning_rate": 3.2144413747337622e-06, + "loss": 1.4841, + "step": 8561 + }, + { + "epoch": 1.477098248943328, + "grad_norm": 0.59765625, + "learning_rate": 3.212442273216253e-06, + "loss": 1.3529, + "step": 8562 + }, + { + "epoch": 1.47727076684206, + "grad_norm": 0.59765625, + "learning_rate": 3.210443674563437e-06, + "loss": 1.4495, + "step": 8563 + }, + { + "epoch": 1.477443284740792, + "grad_norm": 0.55859375, + "learning_rate": 3.2084455789233748e-06, + "loss": 1.3776, + "step": 8564 + }, + { + "epoch": 1.4776158026395239, + "grad_norm": 0.57421875, + "learning_rate": 3.206447986444099e-06, + "loss": 1.4294, + "step": 8565 + }, + { + "epoch": 1.4777883205382558, + "grad_norm": 0.625, + "learning_rate": 3.2044508972736044e-06, + "loss": 1.4277, + "step": 8566 + }, + { + "epoch": 1.4779608384369878, + "grad_norm": 0.58984375, + "learning_rate": 3.202454311559847e-06, + "loss": 1.4677, + "step": 8567 + }, + { + "epoch": 1.4781333563357197, + "grad_norm": 0.640625, + "learning_rate": 3.200458229450749e-06, + "loss": 1.3658, + "step": 8568 + }, + { + "epoch": 1.4783058742344517, + "grad_norm": 0.5859375, + "learning_rate": 3.198462651094193e-06, + "loss": 1.3626, + "step": 8569 + }, + { + "epoch": 1.4784783921331839, + "grad_norm": 0.59375, + "learning_rate": 3.1964675766380148e-06, + "loss": 1.4493, + "step": 8570 + }, + { + "epoch": 1.4786509100319158, + "grad_norm": 0.58984375, + "learning_rate": 3.194473006230033e-06, + "loss": 1.4275, + "step": 8571 + }, + { + "epoch": 1.4788234279306478, + "grad_norm": 0.640625, + "learning_rate": 3.192478940018018e-06, + "loss": 1.4172, + "step": 8572 + }, + { + "epoch": 1.4789959458293798, + "grad_norm": 0.55859375, + "learning_rate": 3.19048537814969e-06, + "loss": 1.404, + "step": 8573 + }, + { + "epoch": 1.4791684637281117, + "grad_norm": 0.57421875, + "learning_rate": 3.18849232077276e-06, + "loss": 1.4071, + "step": 8574 + }, + { + "epoch": 1.479340981626844, + "grad_norm": 0.57421875, + "learning_rate": 3.186499768034874e-06, + "loss": 1.4131, + "step": 8575 + }, + { + "epoch": 1.4795134995255759, + "grad_norm": 0.57421875, + "learning_rate": 3.1845077200836638e-06, + "loss": 1.4327, + "step": 8576 + }, + { + "epoch": 1.4796860174243078, + "grad_norm": 0.58203125, + "learning_rate": 3.1825161770667023e-06, + "loss": 1.4534, + "step": 8577 + }, + { + "epoch": 1.4798585353230398, + "grad_norm": 0.56640625, + "learning_rate": 3.180525139131542e-06, + "loss": 1.432, + "step": 8578 + }, + { + "epoch": 1.4800310532217718, + "grad_norm": 0.67578125, + "learning_rate": 3.1785346064256884e-06, + "loss": 1.4916, + "step": 8579 + }, + { + "epoch": 1.4802035711205037, + "grad_norm": 0.70703125, + "learning_rate": 3.176544579096613e-06, + "loss": 1.4756, + "step": 8580 + }, + { + "epoch": 1.4803760890192357, + "grad_norm": 0.59375, + "learning_rate": 3.1745550572917503e-06, + "loss": 1.378, + "step": 8581 + }, + { + "epoch": 1.4805486069179676, + "grad_norm": 0.56640625, + "learning_rate": 3.1725660411585e-06, + "loss": 1.4055, + "step": 8582 + }, + { + "epoch": 1.4807211248166996, + "grad_norm": 0.74609375, + "learning_rate": 3.170577530844211e-06, + "loss": 1.437, + "step": 8583 + }, + { + "epoch": 1.4808936427154318, + "grad_norm": 0.62109375, + "learning_rate": 3.1685895264962173e-06, + "loss": 1.4833, + "step": 8584 + }, + { + "epoch": 1.4810661606141637, + "grad_norm": 0.60546875, + "learning_rate": 3.1666020282617925e-06, + "loss": 1.4276, + "step": 8585 + }, + { + "epoch": 1.4812386785128957, + "grad_norm": 0.58203125, + "learning_rate": 3.1646150362881835e-06, + "loss": 1.4181, + "step": 8586 + }, + { + "epoch": 1.4814111964116277, + "grad_norm": 0.62109375, + "learning_rate": 3.1626285507226072e-06, + "loss": 1.4481, + "step": 8587 + }, + { + "epoch": 1.4815837143103596, + "grad_norm": 0.59375, + "learning_rate": 3.160642571712228e-06, + "loss": 1.4209, + "step": 8588 + }, + { + "epoch": 1.4817562322090918, + "grad_norm": 0.58984375, + "learning_rate": 3.158657099404181e-06, + "loss": 1.4144, + "step": 8589 + }, + { + "epoch": 1.4819287501078238, + "grad_norm": 0.65625, + "learning_rate": 3.156672133945563e-06, + "loss": 1.4058, + "step": 8590 + }, + { + "epoch": 1.4821012680065557, + "grad_norm": 0.55078125, + "learning_rate": 3.154687675483432e-06, + "loss": 1.3381, + "step": 8591 + }, + { + "epoch": 1.4822737859052877, + "grad_norm": 0.55078125, + "learning_rate": 3.1527037241648096e-06, + "loss": 1.3612, + "step": 8592 + }, + { + "epoch": 1.4824463038040196, + "grad_norm": 0.7265625, + "learning_rate": 3.150720280136682e-06, + "loss": 1.3687, + "step": 8593 + }, + { + "epoch": 1.4826188217027516, + "grad_norm": 0.5625, + "learning_rate": 3.1487373435459865e-06, + "loss": 1.5162, + "step": 8594 + }, + { + "epoch": 1.4827913396014836, + "grad_norm": 0.609375, + "learning_rate": 3.1467549145396437e-06, + "loss": 1.2491, + "step": 8595 + }, + { + "epoch": 1.4829638575002155, + "grad_norm": 0.59375, + "learning_rate": 3.1447729932645145e-06, + "loss": 1.4794, + "step": 8596 + }, + { + "epoch": 1.4831363753989477, + "grad_norm": 0.578125, + "learning_rate": 3.142791579867435e-06, + "loss": 1.3974, + "step": 8597 + }, + { + "epoch": 1.4833088932976797, + "grad_norm": 0.5546875, + "learning_rate": 3.1408106744952018e-06, + "loss": 1.3553, + "step": 8598 + }, + { + "epoch": 1.4834814111964116, + "grad_norm": 0.78125, + "learning_rate": 3.1388302772945713e-06, + "loss": 1.5289, + "step": 8599 + }, + { + "epoch": 1.4836539290951436, + "grad_norm": 0.6015625, + "learning_rate": 3.136850388412265e-06, + "loss": 1.4299, + "step": 8600 + }, + { + "epoch": 1.4836539290951436, + "eval_loss": 1.4071738719940186, + "eval_runtime": 10.7878, + "eval_samples_per_second": 94.922, + "eval_steps_per_second": 23.731, + "step": 8600 + }, + { + "epoch": 1.4838264469938756, + "grad_norm": 0.56640625, + "learning_rate": 3.1348710079949675e-06, + "loss": 1.4985, + "step": 8601 + }, + { + "epoch": 1.4839989648926077, + "grad_norm": 0.609375, + "learning_rate": 3.1328921361893148e-06, + "loss": 1.4663, + "step": 8602 + }, + { + "epoch": 1.4841714827913397, + "grad_norm": 0.6171875, + "learning_rate": 3.1309137731419236e-06, + "loss": 1.3719, + "step": 8603 + }, + { + "epoch": 1.4843440006900717, + "grad_norm": 0.6171875, + "learning_rate": 3.1289359189993607e-06, + "loss": 1.5156, + "step": 8604 + }, + { + "epoch": 1.4845165185888036, + "grad_norm": 0.59765625, + "learning_rate": 3.1269585739081564e-06, + "loss": 1.373, + "step": 8605 + }, + { + "epoch": 1.4846890364875356, + "grad_norm": 0.5546875, + "learning_rate": 3.1249817380148097e-06, + "loss": 1.4311, + "step": 8606 + }, + { + "epoch": 1.4848615543862675, + "grad_norm": 0.67578125, + "learning_rate": 3.123005411465766e-06, + "loss": 1.4612, + "step": 8607 + }, + { + "epoch": 1.4850340722849995, + "grad_norm": 0.640625, + "learning_rate": 3.1210295944074574e-06, + "loss": 1.4437, + "step": 8608 + }, + { + "epoch": 1.4852065901837315, + "grad_norm": 0.6328125, + "learning_rate": 3.119054286986255e-06, + "loss": 1.4048, + "step": 8609 + }, + { + "epoch": 1.4853791080824634, + "grad_norm": 0.6640625, + "learning_rate": 3.1170794893485047e-06, + "loss": 1.403, + "step": 8610 + }, + { + "epoch": 1.4855516259811956, + "grad_norm": 0.578125, + "learning_rate": 3.1151052016405125e-06, + "loss": 1.3551, + "step": 8611 + }, + { + "epoch": 1.4857241438799276, + "grad_norm": 0.5859375, + "learning_rate": 3.1131314240085465e-06, + "loss": 1.4862, + "step": 8612 + }, + { + "epoch": 1.4858966617786595, + "grad_norm": 0.5390625, + "learning_rate": 3.111158156598836e-06, + "loss": 1.3378, + "step": 8613 + }, + { + "epoch": 1.4860691796773915, + "grad_norm": 0.59765625, + "learning_rate": 3.109185399557575e-06, + "loss": 1.413, + "step": 8614 + }, + { + "epoch": 1.4862416975761235, + "grad_norm": 0.5625, + "learning_rate": 3.107213153030909e-06, + "loss": 1.4823, + "step": 8615 + }, + { + "epoch": 1.4864142154748556, + "grad_norm": 0.60546875, + "learning_rate": 3.105241417164967e-06, + "loss": 1.3963, + "step": 8616 + }, + { + "epoch": 1.4865867333735876, + "grad_norm": 0.54296875, + "learning_rate": 3.1032701921058184e-06, + "loss": 1.3688, + "step": 8617 + }, + { + "epoch": 1.4867592512723196, + "grad_norm": 0.60546875, + "learning_rate": 3.1012994779995077e-06, + "loss": 1.3553, + "step": 8618 + }, + { + "epoch": 1.4869317691710515, + "grad_norm": 0.66796875, + "learning_rate": 3.0993292749920355e-06, + "loss": 1.4014, + "step": 8619 + }, + { + "epoch": 1.4871042870697835, + "grad_norm": 0.58984375, + "learning_rate": 3.097359583229368e-06, + "loss": 1.5023, + "step": 8620 + }, + { + "epoch": 1.4872768049685154, + "grad_norm": 0.56640625, + "learning_rate": 3.095390402857432e-06, + "loss": 1.2465, + "step": 8621 + }, + { + "epoch": 1.4874493228672474, + "grad_norm": 0.5859375, + "learning_rate": 3.0934217340221183e-06, + "loss": 1.4298, + "step": 8622 + }, + { + "epoch": 1.4876218407659794, + "grad_norm": 0.578125, + "learning_rate": 3.0914535768692753e-06, + "loss": 1.3638, + "step": 8623 + }, + { + "epoch": 1.4877943586647115, + "grad_norm": 0.5546875, + "learning_rate": 3.089485931544719e-06, + "loss": 1.4358, + "step": 8624 + }, + { + "epoch": 1.4879668765634435, + "grad_norm": 0.5546875, + "learning_rate": 3.0875187981942266e-06, + "loss": 1.3866, + "step": 8625 + }, + { + "epoch": 1.4881393944621755, + "grad_norm": 0.6484375, + "learning_rate": 3.0855521769635266e-06, + "loss": 1.4508, + "step": 8626 + }, + { + "epoch": 1.4883119123609074, + "grad_norm": 0.5625, + "learning_rate": 3.0835860679983308e-06, + "loss": 1.3201, + "step": 8627 + }, + { + "epoch": 1.4884844302596394, + "grad_norm": 0.56640625, + "learning_rate": 3.081620471444292e-06, + "loss": 1.4419, + "step": 8628 + }, + { + "epoch": 1.4886569481583714, + "grad_norm": 0.5703125, + "learning_rate": 3.079655387447037e-06, + "loss": 1.3916, + "step": 8629 + }, + { + "epoch": 1.4888294660571035, + "grad_norm": 0.578125, + "learning_rate": 3.077690816152151e-06, + "loss": 1.3884, + "step": 8630 + }, + { + "epoch": 1.4890019839558355, + "grad_norm": 0.66796875, + "learning_rate": 3.075726757705182e-06, + "loss": 1.4443, + "step": 8631 + }, + { + "epoch": 1.4891745018545675, + "grad_norm": 0.578125, + "learning_rate": 3.073763212251639e-06, + "loss": 1.3651, + "step": 8632 + }, + { + "epoch": 1.4893470197532994, + "grad_norm": 0.58984375, + "learning_rate": 3.071800179936998e-06, + "loss": 1.4278, + "step": 8633 + }, + { + "epoch": 1.4895195376520314, + "grad_norm": 0.6953125, + "learning_rate": 3.0698376609066828e-06, + "loss": 1.48, + "step": 8634 + }, + { + "epoch": 1.4896920555507633, + "grad_norm": 1.0078125, + "learning_rate": 3.0678756553060984e-06, + "loss": 1.3465, + "step": 8635 + }, + { + "epoch": 1.4898645734494953, + "grad_norm": 0.546875, + "learning_rate": 3.0659141632805987e-06, + "loss": 1.3635, + "step": 8636 + }, + { + "epoch": 1.4900370913482273, + "grad_norm": 0.62109375, + "learning_rate": 3.0639531849755044e-06, + "loss": 1.3396, + "step": 8637 + }, + { + "epoch": 1.4902096092469594, + "grad_norm": 0.62109375, + "learning_rate": 3.0619927205360998e-06, + "loss": 1.4531, + "step": 8638 + }, + { + "epoch": 1.4903821271456914, + "grad_norm": 0.6484375, + "learning_rate": 3.0600327701076193e-06, + "loss": 1.4887, + "step": 8639 + }, + { + "epoch": 1.4905546450444234, + "grad_norm": 0.5546875, + "learning_rate": 3.058073333835281e-06, + "loss": 1.4059, + "step": 8640 + }, + { + "epoch": 1.4907271629431553, + "grad_norm": 0.56640625, + "learning_rate": 3.0561144118642406e-06, + "loss": 1.3808, + "step": 8641 + }, + { + "epoch": 1.4908996808418873, + "grad_norm": 0.5625, + "learning_rate": 3.0541560043396322e-06, + "loss": 1.5228, + "step": 8642 + }, + { + "epoch": 1.4910721987406195, + "grad_norm": 0.60546875, + "learning_rate": 3.052198111406547e-06, + "loss": 1.3432, + "step": 8643 + }, + { + "epoch": 1.4912447166393514, + "grad_norm": 0.59765625, + "learning_rate": 3.0502407332100382e-06, + "loss": 1.4385, + "step": 8644 + }, + { + "epoch": 1.4914172345380834, + "grad_norm": 0.59765625, + "learning_rate": 3.048283869895119e-06, + "loss": 1.4864, + "step": 8645 + }, + { + "epoch": 1.4915897524368154, + "grad_norm": 0.5703125, + "learning_rate": 3.04632752160677e-06, + "loss": 1.3705, + "step": 8646 + }, + { + "epoch": 1.4917622703355473, + "grad_norm": 0.7109375, + "learning_rate": 3.044371688489921e-06, + "loss": 1.4323, + "step": 8647 + }, + { + "epoch": 1.4919347882342793, + "grad_norm": 0.578125, + "learning_rate": 3.042416370689485e-06, + "loss": 1.4554, + "step": 8648 + }, + { + "epoch": 1.4921073061330112, + "grad_norm": 0.6015625, + "learning_rate": 3.0404615683503136e-06, + "loss": 1.4668, + "step": 8649 + }, + { + "epoch": 1.4922798240317432, + "grad_norm": 0.65234375, + "learning_rate": 3.0385072816172344e-06, + "loss": 1.3353, + "step": 8650 + }, + { + "epoch": 1.4924523419304752, + "grad_norm": 0.57421875, + "learning_rate": 3.036553510635033e-06, + "loss": 1.3784, + "step": 8651 + }, + { + "epoch": 1.4926248598292073, + "grad_norm": 0.5703125, + "learning_rate": 3.0346002555484567e-06, + "loss": 1.469, + "step": 8652 + }, + { + "epoch": 1.4927973777279393, + "grad_norm": 0.59765625, + "learning_rate": 3.0326475165022164e-06, + "loss": 1.3914, + "step": 8653 + }, + { + "epoch": 1.4929698956266713, + "grad_norm": 0.58984375, + "learning_rate": 3.030695293640982e-06, + "loss": 1.3753, + "step": 8654 + }, + { + "epoch": 1.4931424135254032, + "grad_norm": 0.5703125, + "learning_rate": 3.028743587109385e-06, + "loss": 1.4428, + "step": 8655 + }, + { + "epoch": 1.4933149314241352, + "grad_norm": 0.59765625, + "learning_rate": 3.026792397052023e-06, + "loss": 1.4154, + "step": 8656 + }, + { + "epoch": 1.4934874493228674, + "grad_norm": 0.61328125, + "learning_rate": 3.024841723613453e-06, + "loss": 1.4908, + "step": 8657 + }, + { + "epoch": 1.4936599672215993, + "grad_norm": 0.66796875, + "learning_rate": 3.022891566938185e-06, + "loss": 1.2884, + "step": 8658 + }, + { + "epoch": 1.4938324851203313, + "grad_norm": 0.60546875, + "learning_rate": 3.0209419271707118e-06, + "loss": 1.4292, + "step": 8659 + }, + { + "epoch": 1.4940050030190632, + "grad_norm": 0.59765625, + "learning_rate": 3.018992804455464e-06, + "loss": 1.4774, + "step": 8660 + }, + { + "epoch": 1.4941775209177952, + "grad_norm": 0.578125, + "learning_rate": 3.017044198936848e-06, + "loss": 1.4197, + "step": 8661 + }, + { + "epoch": 1.4943500388165272, + "grad_norm": 0.58203125, + "learning_rate": 3.01509611075923e-06, + "loss": 1.3863, + "step": 8662 + }, + { + "epoch": 1.4945225567152591, + "grad_norm": 0.6171875, + "learning_rate": 3.013148540066936e-06, + "loss": 1.5052, + "step": 8663 + }, + { + "epoch": 1.494695074613991, + "grad_norm": 0.6171875, + "learning_rate": 3.011201487004254e-06, + "loss": 1.3792, + "step": 8664 + }, + { + "epoch": 1.4948675925127233, + "grad_norm": 0.671875, + "learning_rate": 3.0092549517154336e-06, + "loss": 1.4096, + "step": 8665 + }, + { + "epoch": 1.4950401104114552, + "grad_norm": 0.6015625, + "learning_rate": 3.007308934344686e-06, + "loss": 1.4694, + "step": 8666 + }, + { + "epoch": 1.4952126283101872, + "grad_norm": 0.5625, + "learning_rate": 3.005363435036186e-06, + "loss": 1.408, + "step": 8667 + }, + { + "epoch": 1.4953851462089192, + "grad_norm": 0.5625, + "learning_rate": 3.0034184539340663e-06, + "loss": 1.4432, + "step": 8668 + }, + { + "epoch": 1.4955576641076511, + "grad_norm": 0.57421875, + "learning_rate": 3.0014739911824244e-06, + "loss": 1.4187, + "step": 8669 + }, + { + "epoch": 1.495730182006383, + "grad_norm": 0.65234375, + "learning_rate": 2.999530046925322e-06, + "loss": 1.4553, + "step": 8670 + }, + { + "epoch": 1.4959026999051153, + "grad_norm": 0.61328125, + "learning_rate": 2.9975866213067673e-06, + "loss": 1.4767, + "step": 8671 + }, + { + "epoch": 1.4960752178038472, + "grad_norm": 0.55078125, + "learning_rate": 2.9956437144707552e-06, + "loss": 1.4951, + "step": 8672 + }, + { + "epoch": 1.4962477357025792, + "grad_norm": 0.58203125, + "learning_rate": 2.9937013265612192e-06, + "loss": 1.4536, + "step": 8673 + }, + { + "epoch": 1.4964202536013111, + "grad_norm": 0.70703125, + "learning_rate": 2.9917594577220665e-06, + "loss": 1.4156, + "step": 8674 + }, + { + "epoch": 1.496592771500043, + "grad_norm": 0.6328125, + "learning_rate": 2.989818108097162e-06, + "loss": 1.4029, + "step": 8675 + }, + { + "epoch": 1.496765289398775, + "grad_norm": 0.58203125, + "learning_rate": 2.9878772778303344e-06, + "loss": 1.4684, + "step": 8676 + }, + { + "epoch": 1.496937807297507, + "grad_norm": 0.61328125, + "learning_rate": 2.9859369670653715e-06, + "loss": 1.4403, + "step": 8677 + }, + { + "epoch": 1.497110325196239, + "grad_norm": 0.58984375, + "learning_rate": 2.983997175946027e-06, + "loss": 1.4305, + "step": 8678 + }, + { + "epoch": 1.4972828430949712, + "grad_norm": 0.62890625, + "learning_rate": 2.982057904616004e-06, + "loss": 1.4958, + "step": 8679 + }, + { + "epoch": 1.4974553609937031, + "grad_norm": 0.59375, + "learning_rate": 2.9801191532189876e-06, + "loss": 1.4899, + "step": 8680 + }, + { + "epoch": 1.497627878892435, + "grad_norm": 0.55078125, + "learning_rate": 2.9781809218986036e-06, + "loss": 1.4251, + "step": 8681 + }, + { + "epoch": 1.497800396791167, + "grad_norm": 0.640625, + "learning_rate": 2.9762432107984508e-06, + "loss": 1.3534, + "step": 8682 + }, + { + "epoch": 1.497972914689899, + "grad_norm": 0.6015625, + "learning_rate": 2.974306020062088e-06, + "loss": 1.3928, + "step": 8683 + }, + { + "epoch": 1.4981454325886312, + "grad_norm": 0.5703125, + "learning_rate": 2.972369349833033e-06, + "loss": 1.3605, + "step": 8684 + }, + { + "epoch": 1.4983179504873632, + "grad_norm": 0.5625, + "learning_rate": 2.9704332002547677e-06, + "loss": 1.3593, + "step": 8685 + }, + { + "epoch": 1.4984904683860951, + "grad_norm": 0.5390625, + "learning_rate": 2.9684975714707333e-06, + "loss": 1.3329, + "step": 8686 + }, + { + "epoch": 1.498662986284827, + "grad_norm": 0.6796875, + "learning_rate": 2.966562463624334e-06, + "loss": 1.468, + "step": 8687 + }, + { + "epoch": 1.498835504183559, + "grad_norm": 0.578125, + "learning_rate": 2.9646278768589345e-06, + "loss": 1.4801, + "step": 8688 + }, + { + "epoch": 1.499008022082291, + "grad_norm": 0.64453125, + "learning_rate": 2.962693811317863e-06, + "loss": 1.5002, + "step": 8689 + }, + { + "epoch": 1.499180539981023, + "grad_norm": 0.55078125, + "learning_rate": 2.9607602671443993e-06, + "loss": 1.3173, + "step": 8690 + }, + { + "epoch": 1.499353057879755, + "grad_norm": 0.5859375, + "learning_rate": 2.9588272444818056e-06, + "loss": 1.4184, + "step": 8691 + }, + { + "epoch": 1.4995255757784869, + "grad_norm": 0.6015625, + "learning_rate": 2.9568947434732777e-06, + "loss": 1.4755, + "step": 8692 + }, + { + "epoch": 1.499698093677219, + "grad_norm": 0.59375, + "learning_rate": 2.9549627642620005e-06, + "loss": 1.344, + "step": 8693 + }, + { + "epoch": 1.499870611575951, + "grad_norm": 0.609375, + "learning_rate": 2.9530313069910986e-06, + "loss": 1.4186, + "step": 8694 + }, + { + "epoch": 1.500043129474683, + "grad_norm": 0.609375, + "learning_rate": 2.951100371803669e-06, + "loss": 1.3953, + "step": 8695 + }, + { + "epoch": 1.500215647373415, + "grad_norm": 0.56640625, + "learning_rate": 2.949169958842767e-06, + "loss": 1.4259, + "step": 8696 + }, + { + "epoch": 1.5003881652721471, + "grad_norm": 1.03125, + "learning_rate": 2.9472400682514104e-06, + "loss": 1.4243, + "step": 8697 + }, + { + "epoch": 1.500560683170879, + "grad_norm": 0.57421875, + "learning_rate": 2.945310700172577e-06, + "loss": 1.4715, + "step": 8698 + }, + { + "epoch": 1.500733201069611, + "grad_norm": 0.61328125, + "learning_rate": 2.9433818547492067e-06, + "loss": 1.4727, + "step": 8699 + }, + { + "epoch": 1.500905718968343, + "grad_norm": 0.57421875, + "learning_rate": 2.941453532124201e-06, + "loss": 1.4355, + "step": 8700 + }, + { + "epoch": 1.500905718968343, + "eval_loss": 1.4072085618972778, + "eval_runtime": 10.8369, + "eval_samples_per_second": 94.492, + "eval_steps_per_second": 23.623, + "step": 8700 + }, + { + "epoch": 1.501078236867075, + "grad_norm": 0.58984375, + "learning_rate": 2.9395257324404204e-06, + "loss": 1.5278, + "step": 8701 + }, + { + "epoch": 1.501250754765807, + "grad_norm": 0.6171875, + "learning_rate": 2.9375984558406934e-06, + "loss": 1.4436, + "step": 8702 + }, + { + "epoch": 1.501423272664539, + "grad_norm": 0.5703125, + "learning_rate": 2.935671702467794e-06, + "loss": 1.4489, + "step": 8703 + }, + { + "epoch": 1.5015957905632709, + "grad_norm": 0.55859375, + "learning_rate": 2.933745472464481e-06, + "loss": 1.4356, + "step": 8704 + }, + { + "epoch": 1.5017683084620028, + "grad_norm": 0.58984375, + "learning_rate": 2.9318197659734527e-06, + "loss": 1.3582, + "step": 8705 + }, + { + "epoch": 1.5019408263607348, + "grad_norm": 0.5859375, + "learning_rate": 2.9298945831373803e-06, + "loss": 1.4326, + "step": 8706 + }, + { + "epoch": 1.502113344259467, + "grad_norm": 0.671875, + "learning_rate": 2.9279699240988936e-06, + "loss": 1.4332, + "step": 8707 + }, + { + "epoch": 1.502285862158199, + "grad_norm": 0.9453125, + "learning_rate": 2.9260457890005823e-06, + "loss": 1.3855, + "step": 8708 + }, + { + "epoch": 1.5024583800569309, + "grad_norm": 0.57421875, + "learning_rate": 2.924122177984998e-06, + "loss": 1.4103, + "step": 8709 + }, + { + "epoch": 1.5026308979556628, + "grad_norm": 0.5703125, + "learning_rate": 2.9221990911946595e-06, + "loss": 1.4533, + "step": 8710 + }, + { + "epoch": 1.502803415854395, + "grad_norm": 0.65234375, + "learning_rate": 2.92027652877203e-06, + "loss": 1.5407, + "step": 8711 + }, + { + "epoch": 1.502975933753127, + "grad_norm": 0.5859375, + "learning_rate": 2.9183544908595573e-06, + "loss": 1.4197, + "step": 8712 + }, + { + "epoch": 1.503148451651859, + "grad_norm": 0.60546875, + "learning_rate": 2.9164329775996293e-06, + "loss": 1.4315, + "step": 8713 + }, + { + "epoch": 1.503320969550591, + "grad_norm": 0.62109375, + "learning_rate": 2.9145119891346062e-06, + "loss": 1.4136, + "step": 8714 + }, + { + "epoch": 1.5034934874493229, + "grad_norm": 0.828125, + "learning_rate": 2.912591525606807e-06, + "loss": 1.3841, + "step": 8715 + }, + { + "epoch": 1.5036660053480548, + "grad_norm": 1.15625, + "learning_rate": 2.9106715871585124e-06, + "loss": 1.3314, + "step": 8716 + }, + { + "epoch": 1.5038385232467868, + "grad_norm": 1.5390625, + "learning_rate": 2.9087521739319624e-06, + "loss": 1.4136, + "step": 8717 + }, + { + "epoch": 1.5040110411455188, + "grad_norm": 0.5625, + "learning_rate": 2.9068332860693594e-06, + "loss": 1.4267, + "step": 8718 + }, + { + "epoch": 1.5041835590442507, + "grad_norm": 0.578125, + "learning_rate": 2.904914923712867e-06, + "loss": 1.3874, + "step": 8719 + }, + { + "epoch": 1.5043560769429827, + "grad_norm": 0.609375, + "learning_rate": 2.902997087004609e-06, + "loss": 1.415, + "step": 8720 + }, + { + "epoch": 1.5045285948417149, + "grad_norm": 0.62109375, + "learning_rate": 2.901079776086674e-06, + "loss": 1.3695, + "step": 8721 + }, + { + "epoch": 1.5047011127404468, + "grad_norm": 0.5546875, + "learning_rate": 2.8991629911011e-06, + "loss": 1.4224, + "step": 8722 + }, + { + "epoch": 1.5048736306391788, + "grad_norm": 0.59765625, + "learning_rate": 2.8972467321899045e-06, + "loss": 1.4014, + "step": 8723 + }, + { + "epoch": 1.505046148537911, + "grad_norm": 0.58203125, + "learning_rate": 2.895330999495045e-06, + "loss": 1.4829, + "step": 8724 + }, + { + "epoch": 1.505218666436643, + "grad_norm": 0.57421875, + "learning_rate": 2.893415793158464e-06, + "loss": 1.4623, + "step": 8725 + }, + { + "epoch": 1.5053911843353749, + "grad_norm": 0.55859375, + "learning_rate": 2.891501113322042e-06, + "loss": 1.5667, + "step": 8726 + }, + { + "epoch": 1.5055637022341068, + "grad_norm": 0.625, + "learning_rate": 2.8895869601276326e-06, + "loss": 1.453, + "step": 8727 + }, + { + "epoch": 1.5057362201328388, + "grad_norm": 0.70703125, + "learning_rate": 2.8876733337170503e-06, + "loss": 1.4474, + "step": 8728 + }, + { + "epoch": 1.5059087380315708, + "grad_norm": 0.5859375, + "learning_rate": 2.8857602342320666e-06, + "loss": 1.4552, + "step": 8729 + }, + { + "epoch": 1.5060812559303027, + "grad_norm": 0.578125, + "learning_rate": 2.883847661814416e-06, + "loss": 1.4481, + "step": 8730 + }, + { + "epoch": 1.5062537738290347, + "grad_norm": 0.578125, + "learning_rate": 2.8819356166057953e-06, + "loss": 1.419, + "step": 8731 + }, + { + "epoch": 1.5064262917277667, + "grad_norm": 0.62109375, + "learning_rate": 2.88002409874786e-06, + "loss": 1.38, + "step": 8732 + }, + { + "epoch": 1.5065988096264986, + "grad_norm": 0.59765625, + "learning_rate": 2.8781131083822267e-06, + "loss": 1.3751, + "step": 8733 + }, + { + "epoch": 1.5067713275252308, + "grad_norm": 0.64453125, + "learning_rate": 2.8762026456504767e-06, + "loss": 1.4008, + "step": 8734 + }, + { + "epoch": 1.5069438454239628, + "grad_norm": 1.1484375, + "learning_rate": 2.874292710694141e-06, + "loss": 1.4236, + "step": 8735 + }, + { + "epoch": 1.5071163633226947, + "grad_norm": 0.6484375, + "learning_rate": 2.87238330365473e-06, + "loss": 1.3983, + "step": 8736 + }, + { + "epoch": 1.5072888812214267, + "grad_norm": 0.56640625, + "learning_rate": 2.8704744246736972e-06, + "loss": 1.4195, + "step": 8737 + }, + { + "epoch": 1.5074613991201589, + "grad_norm": 0.65234375, + "learning_rate": 2.868566073892465e-06, + "loss": 1.5079, + "step": 8738 + }, + { + "epoch": 1.5076339170188908, + "grad_norm": 0.6328125, + "learning_rate": 2.8666582514524175e-06, + "loss": 1.4868, + "step": 8739 + }, + { + "epoch": 1.5078064349176228, + "grad_norm": 0.59765625, + "learning_rate": 2.8647509574948997e-06, + "loss": 1.4152, + "step": 8740 + }, + { + "epoch": 1.5079789528163547, + "grad_norm": 0.61328125, + "learning_rate": 2.8628441921612117e-06, + "loss": 1.4388, + "step": 8741 + }, + { + "epoch": 1.5081514707150867, + "grad_norm": 0.5625, + "learning_rate": 2.8609379555926255e-06, + "loss": 1.4466, + "step": 8742 + }, + { + "epoch": 1.5083239886138187, + "grad_norm": 0.6015625, + "learning_rate": 2.8590322479303554e-06, + "loss": 1.3899, + "step": 8743 + }, + { + "epoch": 1.5084965065125506, + "grad_norm": 0.59765625, + "learning_rate": 2.8571270693156028e-06, + "loss": 1.5198, + "step": 8744 + }, + { + "epoch": 1.5086690244112826, + "grad_norm": 0.5859375, + "learning_rate": 2.855222419889503e-06, + "loss": 1.4558, + "step": 8745 + }, + { + "epoch": 1.5088415423100145, + "grad_norm": 0.57421875, + "learning_rate": 2.8533182997931695e-06, + "loss": 1.4012, + "step": 8746 + }, + { + "epoch": 1.5090140602087465, + "grad_norm": 0.55859375, + "learning_rate": 2.8514147091676713e-06, + "loss": 1.4706, + "step": 8747 + }, + { + "epoch": 1.5091865781074787, + "grad_norm": 0.578125, + "learning_rate": 2.8495116481540375e-06, + "loss": 1.4089, + "step": 8748 + }, + { + "epoch": 1.5093590960062107, + "grad_norm": 0.671875, + "learning_rate": 2.8476091168932594e-06, + "loss": 1.4138, + "step": 8749 + }, + { + "epoch": 1.5095316139049426, + "grad_norm": 0.62109375, + "learning_rate": 2.8457071155262885e-06, + "loss": 1.3905, + "step": 8750 + }, + { + "epoch": 1.5097041318036746, + "grad_norm": 0.56640625, + "learning_rate": 2.8438056441940353e-06, + "loss": 1.4285, + "step": 8751 + }, + { + "epoch": 1.5098766497024068, + "grad_norm": 0.578125, + "learning_rate": 2.8419047030373746e-06, + "loss": 1.4192, + "step": 8752 + }, + { + "epoch": 1.5100491676011387, + "grad_norm": 0.63671875, + "learning_rate": 2.8400042921971395e-06, + "loss": 1.3904, + "step": 8753 + }, + { + "epoch": 1.5102216854998707, + "grad_norm": 0.62890625, + "learning_rate": 2.8381044118141244e-06, + "loss": 1.5384, + "step": 8754 + }, + { + "epoch": 1.5103942033986026, + "grad_norm": 0.56640625, + "learning_rate": 2.836205062029086e-06, + "loss": 1.3682, + "step": 8755 + }, + { + "epoch": 1.5105667212973346, + "grad_norm": 0.59765625, + "learning_rate": 2.834306242982733e-06, + "loss": 1.4479, + "step": 8756 + }, + { + "epoch": 1.5107392391960666, + "grad_norm": 0.5546875, + "learning_rate": 2.8324079548157525e-06, + "loss": 1.5293, + "step": 8757 + }, + { + "epoch": 1.5109117570947985, + "grad_norm": 0.57421875, + "learning_rate": 2.830510197668773e-06, + "loss": 1.4863, + "step": 8758 + }, + { + "epoch": 1.5110842749935305, + "grad_norm": 0.62890625, + "learning_rate": 2.828612971682395e-06, + "loss": 1.3653, + "step": 8759 + }, + { + "epoch": 1.5112567928922624, + "grad_norm": 0.5703125, + "learning_rate": 2.826716276997177e-06, + "loss": 1.3778, + "step": 8760 + }, + { + "epoch": 1.5114293107909944, + "grad_norm": 0.609375, + "learning_rate": 2.824820113753638e-06, + "loss": 1.4826, + "step": 8761 + }, + { + "epoch": 1.5116018286897266, + "grad_norm": 0.68359375, + "learning_rate": 2.8229244820922573e-06, + "loss": 1.4547, + "step": 8762 + }, + { + "epoch": 1.5117743465884586, + "grad_norm": 0.6015625, + "learning_rate": 2.8210293821534763e-06, + "loss": 1.4667, + "step": 8763 + }, + { + "epoch": 1.5119468644871905, + "grad_norm": 0.5546875, + "learning_rate": 2.8191348140776942e-06, + "loss": 1.5144, + "step": 8764 + }, + { + "epoch": 1.5121193823859227, + "grad_norm": 0.59765625, + "learning_rate": 2.8172407780052726e-06, + "loss": 1.3551, + "step": 8765 + }, + { + "epoch": 1.5122919002846547, + "grad_norm": 0.59375, + "learning_rate": 2.8153472740765385e-06, + "loss": 1.44, + "step": 8766 + }, + { + "epoch": 1.5124644181833866, + "grad_norm": 0.60546875, + "learning_rate": 2.813454302431763e-06, + "loss": 1.4299, + "step": 8767 + }, + { + "epoch": 1.5126369360821186, + "grad_norm": 0.61328125, + "learning_rate": 2.8115618632112027e-06, + "loss": 1.45, + "step": 8768 + }, + { + "epoch": 1.5128094539808505, + "grad_norm": 0.68359375, + "learning_rate": 2.8096699565550525e-06, + "loss": 1.4528, + "step": 8769 + }, + { + "epoch": 1.5129819718795825, + "grad_norm": 0.61328125, + "learning_rate": 2.807778582603479e-06, + "loss": 1.4855, + "step": 8770 + }, + { + "epoch": 1.5131544897783145, + "grad_norm": 0.55859375, + "learning_rate": 2.805887741496607e-06, + "loss": 1.4791, + "step": 8771 + }, + { + "epoch": 1.5133270076770464, + "grad_norm": 0.59765625, + "learning_rate": 2.803997433374521e-06, + "loss": 1.4079, + "step": 8772 + }, + { + "epoch": 1.5134995255757784, + "grad_norm": 0.796875, + "learning_rate": 2.80210765837727e-06, + "loss": 1.428, + "step": 8773 + }, + { + "epoch": 1.5136720434745103, + "grad_norm": 0.65625, + "learning_rate": 2.80021841664486e-06, + "loss": 1.3966, + "step": 8774 + }, + { + "epoch": 1.5138445613732425, + "grad_norm": 0.6484375, + "learning_rate": 2.7983297083172487e-06, + "loss": 1.4717, + "step": 8775 + }, + { + "epoch": 1.5140170792719745, + "grad_norm": 0.58203125, + "learning_rate": 2.7964415335343785e-06, + "loss": 1.4614, + "step": 8776 + }, + { + "epoch": 1.5141895971707064, + "grad_norm": 0.640625, + "learning_rate": 2.7945538924361258e-06, + "loss": 1.4601, + "step": 8777 + }, + { + "epoch": 1.5143621150694384, + "grad_norm": 0.65234375, + "learning_rate": 2.7926667851623437e-06, + "loss": 1.4284, + "step": 8778 + }, + { + "epoch": 1.5145346329681706, + "grad_norm": 0.5703125, + "learning_rate": 2.790780211852838e-06, + "loss": 1.459, + "step": 8779 + }, + { + "epoch": 1.5147071508669026, + "grad_norm": 0.609375, + "learning_rate": 2.7888941726473773e-06, + "loss": 1.413, + "step": 8780 + }, + { + "epoch": 1.5148796687656345, + "grad_norm": 0.6015625, + "learning_rate": 2.787008667685699e-06, + "loss": 1.3772, + "step": 8781 + }, + { + "epoch": 1.5150521866643665, + "grad_norm": 0.578125, + "learning_rate": 2.7851236971074848e-06, + "loss": 1.5823, + "step": 8782 + }, + { + "epoch": 1.5152247045630984, + "grad_norm": 0.609375, + "learning_rate": 2.783239261052387e-06, + "loss": 1.4732, + "step": 8783 + }, + { + "epoch": 1.5153972224618304, + "grad_norm": 0.578125, + "learning_rate": 2.7813553596600175e-06, + "loss": 1.4055, + "step": 8784 + }, + { + "epoch": 1.5155697403605624, + "grad_norm": 0.58984375, + "learning_rate": 2.7794719930699477e-06, + "loss": 1.3886, + "step": 8785 + }, + { + "epoch": 1.5157422582592943, + "grad_norm": 0.58203125, + "learning_rate": 2.7775891614217075e-06, + "loss": 1.3907, + "step": 8786 + }, + { + "epoch": 1.5159147761580263, + "grad_norm": 0.61328125, + "learning_rate": 2.7757068648547938e-06, + "loss": 1.3861, + "step": 8787 + }, + { + "epoch": 1.5160872940567582, + "grad_norm": 0.58203125, + "learning_rate": 2.7738251035086485e-06, + "loss": 1.4257, + "step": 8788 + }, + { + "epoch": 1.5162598119554904, + "grad_norm": 0.6484375, + "learning_rate": 2.771943877522697e-06, + "loss": 1.4905, + "step": 8789 + }, + { + "epoch": 1.5164323298542224, + "grad_norm": 0.578125, + "learning_rate": 2.7700631870363038e-06, + "loss": 1.3708, + "step": 8790 + }, + { + "epoch": 1.5166048477529543, + "grad_norm": 0.6015625, + "learning_rate": 2.7681830321888035e-06, + "loss": 1.5362, + "step": 8791 + }, + { + "epoch": 1.5167773656516865, + "grad_norm": 0.578125, + "learning_rate": 2.766303413119491e-06, + "loss": 1.3839, + "step": 8792 + }, + { + "epoch": 1.5169498835504185, + "grad_norm": 0.5390625, + "learning_rate": 2.7644243299676197e-06, + "loss": 1.405, + "step": 8793 + }, + { + "epoch": 1.5171224014491504, + "grad_norm": 0.62109375, + "learning_rate": 2.7625457828724034e-06, + "loss": 1.3364, + "step": 8794 + }, + { + "epoch": 1.5172949193478824, + "grad_norm": 0.63671875, + "learning_rate": 2.760667771973018e-06, + "loss": 1.4023, + "step": 8795 + }, + { + "epoch": 1.5174674372466144, + "grad_norm": 0.59375, + "learning_rate": 2.7587902974085977e-06, + "loss": 1.3987, + "step": 8796 + }, + { + "epoch": 1.5176399551453463, + "grad_norm": 0.6015625, + "learning_rate": 2.756913359318237e-06, + "loss": 1.4989, + "step": 8797 + }, + { + "epoch": 1.5178124730440783, + "grad_norm": 0.6171875, + "learning_rate": 2.755036957840994e-06, + "loss": 1.349, + "step": 8798 + }, + { + "epoch": 1.5179849909428103, + "grad_norm": 0.56640625, + "learning_rate": 2.753161093115877e-06, + "loss": 1.3201, + "step": 8799 + }, + { + "epoch": 1.5181575088415422, + "grad_norm": 0.5625, + "learning_rate": 2.7512857652818716e-06, + "loss": 1.517, + "step": 8800 + }, + { + "epoch": 1.5181575088415422, + "eval_loss": 1.407160758972168, + "eval_runtime": 10.9315, + "eval_samples_per_second": 93.674, + "eval_steps_per_second": 23.419, + "step": 8800 + }, + { + "epoch": 1.5183300267402742, + "grad_norm": 0.59765625, + "learning_rate": 2.7494109744779062e-06, + "loss": 1.3835, + "step": 8801 + }, + { + "epoch": 1.5185025446390061, + "grad_norm": 0.68359375, + "learning_rate": 2.7475367208428793e-06, + "loss": 1.5125, + "step": 8802 + }, + { + "epoch": 1.5186750625377383, + "grad_norm": 0.58984375, + "learning_rate": 2.7456630045156473e-06, + "loss": 1.4015, + "step": 8803 + }, + { + "epoch": 1.5188475804364703, + "grad_norm": 0.63671875, + "learning_rate": 2.7437898256350284e-06, + "loss": 1.3302, + "step": 8804 + }, + { + "epoch": 1.5190200983352022, + "grad_norm": 0.65234375, + "learning_rate": 2.7419171843397975e-06, + "loss": 1.4035, + "step": 8805 + }, + { + "epoch": 1.5191926162339344, + "grad_norm": 0.56640625, + "learning_rate": 2.740045080768694e-06, + "loss": 1.5455, + "step": 8806 + }, + { + "epoch": 1.5193651341326664, + "grad_norm": 0.56640625, + "learning_rate": 2.7381735150604083e-06, + "loss": 1.3257, + "step": 8807 + }, + { + "epoch": 1.5195376520313983, + "grad_norm": 0.578125, + "learning_rate": 2.7363024873536093e-06, + "loss": 1.4662, + "step": 8808 + }, + { + "epoch": 1.5197101699301303, + "grad_norm": 0.91796875, + "learning_rate": 2.7344319977869037e-06, + "loss": 1.4842, + "step": 8809 + }, + { + "epoch": 1.5198826878288623, + "grad_norm": 0.58984375, + "learning_rate": 2.7325620464988733e-06, + "loss": 1.4691, + "step": 8810 + }, + { + "epoch": 1.5200552057275942, + "grad_norm": 0.59765625, + "learning_rate": 2.730692633628055e-06, + "loss": 1.3634, + "step": 8811 + }, + { + "epoch": 1.5202277236263262, + "grad_norm": 0.58203125, + "learning_rate": 2.728823759312944e-06, + "loss": 1.5168, + "step": 8812 + }, + { + "epoch": 1.5204002415250581, + "grad_norm": 0.59765625, + "learning_rate": 2.726955423692008e-06, + "loss": 1.4439, + "step": 8813 + }, + { + "epoch": 1.52057275942379, + "grad_norm": 0.7578125, + "learning_rate": 2.7250876269036564e-06, + "loss": 1.3487, + "step": 8814 + }, + { + "epoch": 1.520745277322522, + "grad_norm": 0.5859375, + "learning_rate": 2.723220369086267e-06, + "loss": 1.4754, + "step": 8815 + }, + { + "epoch": 1.5209177952212543, + "grad_norm": 0.55859375, + "learning_rate": 2.7213536503781813e-06, + "loss": 1.4661, + "step": 8816 + }, + { + "epoch": 1.5210903131199862, + "grad_norm": 0.609375, + "learning_rate": 2.7194874709176967e-06, + "loss": 1.4603, + "step": 8817 + }, + { + "epoch": 1.5212628310187182, + "grad_norm": 0.5625, + "learning_rate": 2.7176218308430703e-06, + "loss": 1.4373, + "step": 8818 + }, + { + "epoch": 1.5214353489174501, + "grad_norm": 0.58984375, + "learning_rate": 2.7157567302925235e-06, + "loss": 1.4028, + "step": 8819 + }, + { + "epoch": 1.5216078668161823, + "grad_norm": 0.58984375, + "learning_rate": 2.7138921694042264e-06, + "loss": 1.3868, + "step": 8820 + }, + { + "epoch": 1.5217803847149143, + "grad_norm": 0.57421875, + "learning_rate": 2.71202814831633e-06, + "loss": 1.5207, + "step": 8821 + }, + { + "epoch": 1.5219529026136462, + "grad_norm": 0.55859375, + "learning_rate": 2.7101646671669224e-06, + "loss": 1.3747, + "step": 8822 + }, + { + "epoch": 1.5221254205123782, + "grad_norm": 0.5703125, + "learning_rate": 2.708301726094065e-06, + "loss": 1.3591, + "step": 8823 + }, + { + "epoch": 1.5222979384111102, + "grad_norm": 0.62890625, + "learning_rate": 2.7064393252357757e-06, + "loss": 1.4247, + "step": 8824 + }, + { + "epoch": 1.5224704563098421, + "grad_norm": 0.640625, + "learning_rate": 2.7045774647300347e-06, + "loss": 1.4324, + "step": 8825 + }, + { + "epoch": 1.522642974208574, + "grad_norm": 0.71875, + "learning_rate": 2.7027161447147786e-06, + "loss": 1.4449, + "step": 8826 + }, + { + "epoch": 1.522815492107306, + "grad_norm": 0.56640625, + "learning_rate": 2.700855365327909e-06, + "loss": 1.4112, + "step": 8827 + }, + { + "epoch": 1.522988010006038, + "grad_norm": 0.5625, + "learning_rate": 2.6989951267072744e-06, + "loss": 1.3578, + "step": 8828 + }, + { + "epoch": 1.52316052790477, + "grad_norm": 0.59375, + "learning_rate": 2.697135428990704e-06, + "loss": 1.4005, + "step": 8829 + }, + { + "epoch": 1.5233330458035022, + "grad_norm": 0.57421875, + "learning_rate": 2.695276272315973e-06, + "loss": 1.4191, + "step": 8830 + }, + { + "epoch": 1.5235055637022341, + "grad_norm": 0.63671875, + "learning_rate": 2.6934176568208126e-06, + "loss": 1.4618, + "step": 8831 + }, + { + "epoch": 1.523678081600966, + "grad_norm": 0.59375, + "learning_rate": 2.691559582642932e-06, + "loss": 1.2921, + "step": 8832 + }, + { + "epoch": 1.5238505994996983, + "grad_norm": 0.5703125, + "learning_rate": 2.689702049919979e-06, + "loss": 1.4131, + "step": 8833 + }, + { + "epoch": 1.5240231173984302, + "grad_norm": 0.58984375, + "learning_rate": 2.6878450587895754e-06, + "loss": 1.3017, + "step": 8834 + }, + { + "epoch": 1.5241956352971622, + "grad_norm": 0.55859375, + "learning_rate": 2.6859886093892983e-06, + "loss": 1.5027, + "step": 8835 + }, + { + "epoch": 1.5243681531958941, + "grad_norm": 0.59375, + "learning_rate": 2.6841327018566842e-06, + "loss": 1.4793, + "step": 8836 + }, + { + "epoch": 1.524540671094626, + "grad_norm": 0.57421875, + "learning_rate": 2.6822773363292333e-06, + "loss": 1.3827, + "step": 8837 + }, + { + "epoch": 1.524713188993358, + "grad_norm": 0.60546875, + "learning_rate": 2.6804225129444016e-06, + "loss": 1.4242, + "step": 8838 + }, + { + "epoch": 1.52488570689209, + "grad_norm": 0.625, + "learning_rate": 2.6785682318396e-06, + "loss": 1.4618, + "step": 8839 + }, + { + "epoch": 1.525058224790822, + "grad_norm": 0.5703125, + "learning_rate": 2.676714493152216e-06, + "loss": 1.3629, + "step": 8840 + }, + { + "epoch": 1.525230742689554, + "grad_norm": 0.56640625, + "learning_rate": 2.674861297019574e-06, + "loss": 1.3417, + "step": 8841 + }, + { + "epoch": 1.525403260588286, + "grad_norm": 0.59375, + "learning_rate": 2.6730086435789828e-06, + "loss": 1.5157, + "step": 8842 + }, + { + "epoch": 1.5255757784870179, + "grad_norm": 0.671875, + "learning_rate": 2.671156532967689e-06, + "loss": 1.4606, + "step": 8843 + }, + { + "epoch": 1.52574829638575, + "grad_norm": 0.6328125, + "learning_rate": 2.66930496532291e-06, + "loss": 1.4213, + "step": 8844 + }, + { + "epoch": 1.525920814284482, + "grad_norm": 0.5859375, + "learning_rate": 2.667453940781829e-06, + "loss": 1.4048, + "step": 8845 + }, + { + "epoch": 1.526093332183214, + "grad_norm": 0.59765625, + "learning_rate": 2.665603459481573e-06, + "loss": 1.4262, + "step": 8846 + }, + { + "epoch": 1.5262658500819462, + "grad_norm": 0.60546875, + "learning_rate": 2.6637535215592405e-06, + "loss": 1.4697, + "step": 8847 + }, + { + "epoch": 1.5264383679806781, + "grad_norm": 0.66015625, + "learning_rate": 2.6619041271518854e-06, + "loss": 1.5062, + "step": 8848 + }, + { + "epoch": 1.52661088587941, + "grad_norm": 0.6640625, + "learning_rate": 2.6600552763965238e-06, + "loss": 1.349, + "step": 8849 + }, + { + "epoch": 1.526783403778142, + "grad_norm": 0.5859375, + "learning_rate": 2.6582069694301284e-06, + "loss": 1.4759, + "step": 8850 + }, + { + "epoch": 1.526955921676874, + "grad_norm": 0.58984375, + "learning_rate": 2.6563592063896383e-06, + "loss": 1.413, + "step": 8851 + }, + { + "epoch": 1.527128439575606, + "grad_norm": 0.61328125, + "learning_rate": 2.6545119874119364e-06, + "loss": 1.3788, + "step": 8852 + }, + { + "epoch": 1.527300957474338, + "grad_norm": 0.609375, + "learning_rate": 2.65266531263389e-06, + "loss": 1.4434, + "step": 8853 + }, + { + "epoch": 1.5274734753730699, + "grad_norm": 0.6015625, + "learning_rate": 2.6508191821923023e-06, + "loss": 1.4395, + "step": 8854 + }, + { + "epoch": 1.5276459932718018, + "grad_norm": 0.69921875, + "learning_rate": 2.6489735962239495e-06, + "loss": 1.4694, + "step": 8855 + }, + { + "epoch": 1.5278185111705338, + "grad_norm": 0.60546875, + "learning_rate": 2.6471285548655644e-06, + "loss": 1.4755, + "step": 8856 + }, + { + "epoch": 1.527991029069266, + "grad_norm": 0.6015625, + "learning_rate": 2.6452840582538387e-06, + "loss": 1.331, + "step": 8857 + }, + { + "epoch": 1.528163546967998, + "grad_norm": 0.55859375, + "learning_rate": 2.6434401065254234e-06, + "loss": 1.4054, + "step": 8858 + }, + { + "epoch": 1.52833606486673, + "grad_norm": 0.5625, + "learning_rate": 2.641596699816935e-06, + "loss": 1.4518, + "step": 8859 + }, + { + "epoch": 1.5285085827654619, + "grad_norm": 0.5546875, + "learning_rate": 2.6397538382649334e-06, + "loss": 1.4237, + "step": 8860 + }, + { + "epoch": 1.528681100664194, + "grad_norm": 0.59765625, + "learning_rate": 2.6379115220059604e-06, + "loss": 1.395, + "step": 8861 + }, + { + "epoch": 1.528853618562926, + "grad_norm": 0.56640625, + "learning_rate": 2.6360697511765053e-06, + "loss": 1.429, + "step": 8862 + }, + { + "epoch": 1.529026136461658, + "grad_norm": 0.58203125, + "learning_rate": 2.6342285259130085e-06, + "loss": 1.3681, + "step": 8863 + }, + { + "epoch": 1.52919865436039, + "grad_norm": 0.640625, + "learning_rate": 2.6323878463518915e-06, + "loss": 1.4454, + "step": 8864 + }, + { + "epoch": 1.529371172259122, + "grad_norm": 0.62890625, + "learning_rate": 2.6305477126295152e-06, + "loss": 1.443, + "step": 8865 + }, + { + "epoch": 1.5295436901578539, + "grad_norm": 0.67578125, + "learning_rate": 2.628708124882212e-06, + "loss": 1.5633, + "step": 8866 + }, + { + "epoch": 1.5297162080565858, + "grad_norm": 0.65625, + "learning_rate": 2.626869083246267e-06, + "loss": 1.4025, + "step": 8867 + }, + { + "epoch": 1.5298887259553178, + "grad_norm": 0.5859375, + "learning_rate": 2.625030587857931e-06, + "loss": 1.512, + "step": 8868 + }, + { + "epoch": 1.5300612438540497, + "grad_norm": 0.5859375, + "learning_rate": 2.62319263885341e-06, + "loss": 1.4404, + "step": 8869 + }, + { + "epoch": 1.5302337617527817, + "grad_norm": 0.59375, + "learning_rate": 2.6213552363688734e-06, + "loss": 1.5006, + "step": 8870 + }, + { + "epoch": 1.5304062796515139, + "grad_norm": 0.64453125, + "learning_rate": 2.619518380540439e-06, + "loss": 1.456, + "step": 8871 + }, + { + "epoch": 1.5305787975502458, + "grad_norm": 0.59375, + "learning_rate": 2.617682071504204e-06, + "loss": 1.4089, + "step": 8872 + }, + { + "epoch": 1.5307513154489778, + "grad_norm": 0.60546875, + "learning_rate": 2.6158463093962015e-06, + "loss": 1.423, + "step": 8873 + }, + { + "epoch": 1.53092383334771, + "grad_norm": 0.61328125, + "learning_rate": 2.61401109435245e-06, + "loss": 1.5205, + "step": 8874 + }, + { + "epoch": 1.531096351246442, + "grad_norm": 0.546875, + "learning_rate": 2.612176426508902e-06, + "loss": 1.408, + "step": 8875 + }, + { + "epoch": 1.531268869145174, + "grad_norm": 0.59765625, + "learning_rate": 2.610342306001484e-06, + "loss": 1.4899, + "step": 8876 + }, + { + "epoch": 1.5314413870439059, + "grad_norm": 0.66796875, + "learning_rate": 2.6085087329660864e-06, + "loss": 1.4048, + "step": 8877 + }, + { + "epoch": 1.5316139049426378, + "grad_norm": 0.5703125, + "learning_rate": 2.606675707538542e-06, + "loss": 1.4059, + "step": 8878 + }, + { + "epoch": 1.5317864228413698, + "grad_norm": 0.58203125, + "learning_rate": 2.6048432298546576e-06, + "loss": 1.3969, + "step": 8879 + }, + { + "epoch": 1.5319589407401017, + "grad_norm": 0.59375, + "learning_rate": 2.6030113000501933e-06, + "loss": 1.3853, + "step": 8880 + }, + { + "epoch": 1.5321314586388337, + "grad_norm": 0.58203125, + "learning_rate": 2.601179918260871e-06, + "loss": 1.4777, + "step": 8881 + }, + { + "epoch": 1.5323039765375657, + "grad_norm": 0.56640625, + "learning_rate": 2.59934908462237e-06, + "loss": 1.321, + "step": 8882 + }, + { + "epoch": 1.5324764944362976, + "grad_norm": 0.640625, + "learning_rate": 2.5975187992703333e-06, + "loss": 1.528, + "step": 8883 + }, + { + "epoch": 1.5326490123350298, + "grad_norm": 0.5703125, + "learning_rate": 2.595689062340351e-06, + "loss": 1.4284, + "step": 8884 + }, + { + "epoch": 1.5328215302337618, + "grad_norm": 0.58203125, + "learning_rate": 2.5938598739679934e-06, + "loss": 1.3888, + "step": 8885 + }, + { + "epoch": 1.5329940481324937, + "grad_norm": 0.58984375, + "learning_rate": 2.5920312342887687e-06, + "loss": 1.4692, + "step": 8886 + }, + { + "epoch": 1.5331665660312257, + "grad_norm": 0.58984375, + "learning_rate": 2.590203143438157e-06, + "loss": 1.4418, + "step": 8887 + }, + { + "epoch": 1.5333390839299579, + "grad_norm": 0.59375, + "learning_rate": 2.588375601551595e-06, + "loss": 1.4874, + "step": 8888 + }, + { + "epoch": 1.5335116018286898, + "grad_norm": 0.578125, + "learning_rate": 2.586548608764479e-06, + "loss": 1.4726, + "step": 8889 + }, + { + "epoch": 1.5336841197274218, + "grad_norm": 0.6015625, + "learning_rate": 2.5847221652121634e-06, + "loss": 1.3969, + "step": 8890 + }, + { + "epoch": 1.5338566376261538, + "grad_norm": 0.64453125, + "learning_rate": 2.5828962710299655e-06, + "loss": 1.3807, + "step": 8891 + }, + { + "epoch": 1.5340291555248857, + "grad_norm": 0.8828125, + "learning_rate": 2.5810709263531496e-06, + "loss": 1.4172, + "step": 8892 + }, + { + "epoch": 1.5342016734236177, + "grad_norm": 0.58984375, + "learning_rate": 2.579246131316958e-06, + "loss": 1.5168, + "step": 8893 + }, + { + "epoch": 1.5343741913223496, + "grad_norm": 0.59375, + "learning_rate": 2.5774218860565836e-06, + "loss": 1.4052, + "step": 8894 + }, + { + "epoch": 1.5345467092210816, + "grad_norm": 0.578125, + "learning_rate": 2.5755981907071683e-06, + "loss": 1.393, + "step": 8895 + }, + { + "epoch": 1.5347192271198136, + "grad_norm": 0.59765625, + "learning_rate": 2.5737750454038346e-06, + "loss": 1.5, + "step": 8896 + }, + { + "epoch": 1.5348917450185455, + "grad_norm": 0.55078125, + "learning_rate": 2.571952450281643e-06, + "loss": 1.4573, + "step": 8897 + }, + { + "epoch": 1.5350642629172777, + "grad_norm": 0.61328125, + "learning_rate": 2.570130405475627e-06, + "loss": 1.4203, + "step": 8898 + }, + { + "epoch": 1.5352367808160097, + "grad_norm": 0.578125, + "learning_rate": 2.568308911120775e-06, + "loss": 1.3794, + "step": 8899 + }, + { + "epoch": 1.5354092987147416, + "grad_norm": 0.56640625, + "learning_rate": 2.5664879673520337e-06, + "loss": 1.4801, + "step": 8900 + }, + { + "epoch": 1.5354092987147416, + "eval_loss": 1.4071331024169922, + "eval_runtime": 10.9461, + "eval_samples_per_second": 93.549, + "eval_steps_per_second": 23.387, + "step": 8900 + }, + { + "epoch": 1.5355818166134736, + "grad_norm": 0.58203125, + "learning_rate": 2.5646675743043115e-06, + "loss": 1.5236, + "step": 8901 + }, + { + "epoch": 1.5357543345122058, + "grad_norm": 0.61328125, + "learning_rate": 2.562847732112472e-06, + "loss": 1.4504, + "step": 8902 + }, + { + "epoch": 1.5359268524109377, + "grad_norm": 0.578125, + "learning_rate": 2.561028440911343e-06, + "loss": 1.3734, + "step": 8903 + }, + { + "epoch": 1.5360993703096697, + "grad_norm": 0.828125, + "learning_rate": 2.5592097008357107e-06, + "loss": 1.4178, + "step": 8904 + }, + { + "epoch": 1.5362718882084017, + "grad_norm": 0.59375, + "learning_rate": 2.5573915120203097e-06, + "loss": 1.447, + "step": 8905 + }, + { + "epoch": 1.5364444061071336, + "grad_norm": 0.55859375, + "learning_rate": 2.555573874599856e-06, + "loss": 1.3607, + "step": 8906 + }, + { + "epoch": 1.5366169240058656, + "grad_norm": 0.6484375, + "learning_rate": 2.553756788709001e-06, + "loss": 1.4766, + "step": 8907 + }, + { + "epoch": 1.5367894419045975, + "grad_norm": 0.578125, + "learning_rate": 2.5519402544823656e-06, + "loss": 1.4166, + "step": 8908 + }, + { + "epoch": 1.5369619598033295, + "grad_norm": 0.6171875, + "learning_rate": 2.5501242720545404e-06, + "loss": 1.4674, + "step": 8909 + }, + { + "epoch": 1.5371344777020615, + "grad_norm": 1.2734375, + "learning_rate": 2.5483088415600552e-06, + "loss": 1.5013, + "step": 8910 + }, + { + "epoch": 1.5373069956007934, + "grad_norm": 0.5625, + "learning_rate": 2.5464939631334106e-06, + "loss": 1.4255, + "step": 8911 + }, + { + "epoch": 1.5374795134995256, + "grad_norm": 0.546875, + "learning_rate": 2.5446796369090665e-06, + "loss": 1.3638, + "step": 8912 + }, + { + "epoch": 1.5376520313982576, + "grad_norm": 0.64453125, + "learning_rate": 2.5428658630214365e-06, + "loss": 1.4622, + "step": 8913 + }, + { + "epoch": 1.5378245492969895, + "grad_norm": 0.58203125, + "learning_rate": 2.5410526416048986e-06, + "loss": 1.3916, + "step": 8914 + }, + { + "epoch": 1.5379970671957217, + "grad_norm": 0.5859375, + "learning_rate": 2.539239972793789e-06, + "loss": 1.4569, + "step": 8915 + }, + { + "epoch": 1.5381695850944537, + "grad_norm": 0.55859375, + "learning_rate": 2.537427856722393e-06, + "loss": 1.4265, + "step": 8916 + }, + { + "epoch": 1.5383421029931856, + "grad_norm": 0.5625, + "learning_rate": 2.5356162935249762e-06, + "loss": 1.4431, + "step": 8917 + }, + { + "epoch": 1.5385146208919176, + "grad_norm": 0.56640625, + "learning_rate": 2.5338052833357406e-06, + "loss": 1.4271, + "step": 8918 + }, + { + "epoch": 1.5386871387906496, + "grad_norm": 0.59375, + "learning_rate": 2.5319948262888604e-06, + "loss": 1.3865, + "step": 8919 + }, + { + "epoch": 1.5388596566893815, + "grad_norm": 0.59375, + "learning_rate": 2.5301849225184673e-06, + "loss": 1.497, + "step": 8920 + }, + { + "epoch": 1.5390321745881135, + "grad_norm": 0.62109375, + "learning_rate": 2.528375572158647e-06, + "loss": 1.3021, + "step": 8921 + }, + { + "epoch": 1.5392046924868454, + "grad_norm": 0.55859375, + "learning_rate": 2.5265667753434498e-06, + "loss": 1.3756, + "step": 8922 + }, + { + "epoch": 1.5393772103855774, + "grad_norm": 0.6015625, + "learning_rate": 2.5247585322068847e-06, + "loss": 1.4439, + "step": 8923 + }, + { + "epoch": 1.5395497282843094, + "grad_norm": 0.59765625, + "learning_rate": 2.52295084288291e-06, + "loss": 1.4724, + "step": 8924 + }, + { + "epoch": 1.5397222461830415, + "grad_norm": 0.62109375, + "learning_rate": 2.521143707505457e-06, + "loss": 1.3917, + "step": 8925 + }, + { + "epoch": 1.5398947640817735, + "grad_norm": 0.58984375, + "learning_rate": 2.519337126208412e-06, + "loss": 1.3374, + "step": 8926 + }, + { + "epoch": 1.5400672819805055, + "grad_norm": 0.58203125, + "learning_rate": 2.5175310991256085e-06, + "loss": 1.4126, + "step": 8927 + }, + { + "epoch": 1.5402397998792374, + "grad_norm": 0.61328125, + "learning_rate": 2.515725626390859e-06, + "loss": 1.4605, + "step": 8928 + }, + { + "epoch": 1.5404123177779696, + "grad_norm": 0.5859375, + "learning_rate": 2.5139207081379134e-06, + "loss": 1.4168, + "step": 8929 + }, + { + "epoch": 1.5405848356767016, + "grad_norm": 0.5703125, + "learning_rate": 2.5121163445005025e-06, + "loss": 1.5422, + "step": 8930 + }, + { + "epoch": 1.5407573535754335, + "grad_norm": 0.6015625, + "learning_rate": 2.510312535612297e-06, + "loss": 1.3716, + "step": 8931 + }, + { + "epoch": 1.5409298714741655, + "grad_norm": 0.57421875, + "learning_rate": 2.5085092816069367e-06, + "loss": 1.4441, + "step": 8932 + }, + { + "epoch": 1.5411023893728975, + "grad_norm": 0.5859375, + "learning_rate": 2.506706582618017e-06, + "loss": 1.5556, + "step": 8933 + }, + { + "epoch": 1.5412749072716294, + "grad_norm": 0.60546875, + "learning_rate": 2.5049044387790943e-06, + "loss": 1.3855, + "step": 8934 + }, + { + "epoch": 1.5414474251703614, + "grad_norm": 0.58984375, + "learning_rate": 2.503102850223682e-06, + "loss": 1.4681, + "step": 8935 + }, + { + "epoch": 1.5416199430690933, + "grad_norm": 0.6328125, + "learning_rate": 2.5013018170852566e-06, + "loss": 1.4861, + "step": 8936 + }, + { + "epoch": 1.5417924609678253, + "grad_norm": 0.59765625, + "learning_rate": 2.499501339497241e-06, + "loss": 1.3168, + "step": 8937 + }, + { + "epoch": 1.5419649788665573, + "grad_norm": 0.62890625, + "learning_rate": 2.4977014175930368e-06, + "loss": 1.5329, + "step": 8938 + }, + { + "epoch": 1.5421374967652894, + "grad_norm": 0.578125, + "learning_rate": 2.495902051505986e-06, + "loss": 1.4227, + "step": 8939 + }, + { + "epoch": 1.5423100146640214, + "grad_norm": 0.55078125, + "learning_rate": 2.4941032413693955e-06, + "loss": 1.4139, + "step": 8940 + }, + { + "epoch": 1.5424825325627534, + "grad_norm": 0.57421875, + "learning_rate": 2.4923049873165415e-06, + "loss": 1.3561, + "step": 8941 + }, + { + "epoch": 1.5426550504614855, + "grad_norm": 0.5703125, + "learning_rate": 2.4905072894806414e-06, + "loss": 1.3619, + "step": 8942 + }, + { + "epoch": 1.5428275683602175, + "grad_norm": 0.62109375, + "learning_rate": 2.4887101479948826e-06, + "loss": 1.438, + "step": 8943 + }, + { + "epoch": 1.5430000862589495, + "grad_norm": 0.5703125, + "learning_rate": 2.486913562992409e-06, + "loss": 1.3082, + "step": 8944 + }, + { + "epoch": 1.5431726041576814, + "grad_norm": 0.56640625, + "learning_rate": 2.4851175346063227e-06, + "loss": 1.3638, + "step": 8945 + }, + { + "epoch": 1.5433451220564134, + "grad_norm": 0.56640625, + "learning_rate": 2.4833220629696852e-06, + "loss": 1.459, + "step": 8946 + }, + { + "epoch": 1.5435176399551453, + "grad_norm": 0.578125, + "learning_rate": 2.481527148215518e-06, + "loss": 1.5179, + "step": 8947 + }, + { + "epoch": 1.5436901578538773, + "grad_norm": 0.5859375, + "learning_rate": 2.479732790476791e-06, + "loss": 1.3455, + "step": 8948 + }, + { + "epoch": 1.5438626757526093, + "grad_norm": 0.6171875, + "learning_rate": 2.4779389898864538e-06, + "loss": 1.5416, + "step": 8949 + }, + { + "epoch": 1.5440351936513412, + "grad_norm": 0.60546875, + "learning_rate": 2.476145746577394e-06, + "loss": 1.5077, + "step": 8950 + }, + { + "epoch": 1.5442077115500732, + "grad_norm": 0.62890625, + "learning_rate": 2.474353060682467e-06, + "loss": 1.4062, + "step": 8951 + }, + { + "epoch": 1.5443802294488052, + "grad_norm": 0.58203125, + "learning_rate": 2.472560932334489e-06, + "loss": 1.4813, + "step": 8952 + }, + { + "epoch": 1.5445527473475373, + "grad_norm": 0.6328125, + "learning_rate": 2.470769361666231e-06, + "loss": 1.407, + "step": 8953 + }, + { + "epoch": 1.5447252652462693, + "grad_norm": 0.6171875, + "learning_rate": 2.4689783488104223e-06, + "loss": 1.4856, + "step": 8954 + }, + { + "epoch": 1.5448977831450013, + "grad_norm": 0.65234375, + "learning_rate": 2.4671878938997572e-06, + "loss": 1.4617, + "step": 8955 + }, + { + "epoch": 1.5450703010437334, + "grad_norm": 0.58984375, + "learning_rate": 2.465397997066874e-06, + "loss": 1.4065, + "step": 8956 + }, + { + "epoch": 1.5452428189424654, + "grad_norm": 0.60546875, + "learning_rate": 2.4636086584443885e-06, + "loss": 1.4875, + "step": 8957 + }, + { + "epoch": 1.5454153368411974, + "grad_norm": 0.66796875, + "learning_rate": 2.4618198781648663e-06, + "loss": 1.3696, + "step": 8958 + }, + { + "epoch": 1.5455878547399293, + "grad_norm": 0.57421875, + "learning_rate": 2.460031656360822e-06, + "loss": 1.3794, + "step": 8959 + }, + { + "epoch": 1.5457603726386613, + "grad_norm": 0.67578125, + "learning_rate": 2.4582439931647507e-06, + "loss": 1.4181, + "step": 8960 + }, + { + "epoch": 1.5459328905373932, + "grad_norm": 0.62109375, + "learning_rate": 2.4564568887090814e-06, + "loss": 1.54, + "step": 8961 + }, + { + "epoch": 1.5461054084361252, + "grad_norm": 0.59765625, + "learning_rate": 2.4546703431262253e-06, + "loss": 1.4814, + "step": 8962 + }, + { + "epoch": 1.5462779263348572, + "grad_norm": 0.609375, + "learning_rate": 2.452884356548533e-06, + "loss": 1.3949, + "step": 8963 + }, + { + "epoch": 1.5464504442335891, + "grad_norm": 0.62109375, + "learning_rate": 2.4510989291083245e-06, + "loss": 1.3333, + "step": 8964 + }, + { + "epoch": 1.546622962132321, + "grad_norm": 0.5625, + "learning_rate": 2.4493140609378753e-06, + "loss": 1.3779, + "step": 8965 + }, + { + "epoch": 1.5467954800310533, + "grad_norm": 0.5703125, + "learning_rate": 2.4475297521694187e-06, + "loss": 1.4457, + "step": 8966 + }, + { + "epoch": 1.5469679979297852, + "grad_norm": 0.5703125, + "learning_rate": 2.4457460029351476e-06, + "loss": 1.4055, + "step": 8967 + }, + { + "epoch": 1.5471405158285172, + "grad_norm": 0.55859375, + "learning_rate": 2.443962813367218e-06, + "loss": 1.4068, + "step": 8968 + }, + { + "epoch": 1.5473130337272492, + "grad_norm": 0.83984375, + "learning_rate": 2.4421801835977286e-06, + "loss": 1.521, + "step": 8969 + }, + { + "epoch": 1.5474855516259813, + "grad_norm": 0.58203125, + "learning_rate": 2.440398113758761e-06, + "loss": 1.4514, + "step": 8970 + }, + { + "epoch": 1.5476580695247133, + "grad_norm": 0.640625, + "learning_rate": 2.438616603982332e-06, + "loss": 1.5065, + "step": 8971 + }, + { + "epoch": 1.5478305874234453, + "grad_norm": 0.6171875, + "learning_rate": 2.436835654400429e-06, + "loss": 1.4306, + "step": 8972 + }, + { + "epoch": 1.5480031053221772, + "grad_norm": 0.66796875, + "learning_rate": 2.435055265145003e-06, + "loss": 1.4412, + "step": 8973 + }, + { + "epoch": 1.5481756232209092, + "grad_norm": 0.59765625, + "learning_rate": 2.4332754363479483e-06, + "loss": 1.4578, + "step": 8974 + }, + { + "epoch": 1.5483481411196411, + "grad_norm": 0.5546875, + "learning_rate": 2.4314961681411276e-06, + "loss": 1.4251, + "step": 8975 + }, + { + "epoch": 1.548520659018373, + "grad_norm": 0.6640625, + "learning_rate": 2.429717460656361e-06, + "loss": 1.5216, + "step": 8976 + }, + { + "epoch": 1.548693176917105, + "grad_norm": 0.58203125, + "learning_rate": 2.427939314025427e-06, + "loss": 1.4381, + "step": 8977 + }, + { + "epoch": 1.548865694815837, + "grad_norm": 0.546875, + "learning_rate": 2.4261617283800597e-06, + "loss": 1.4402, + "step": 8978 + }, + { + "epoch": 1.549038212714569, + "grad_norm": 0.625, + "learning_rate": 2.42438470385196e-06, + "loss": 1.3946, + "step": 8979 + }, + { + "epoch": 1.5492107306133012, + "grad_norm": 0.58984375, + "learning_rate": 2.422608240572768e-06, + "loss": 1.4762, + "step": 8980 + }, + { + "epoch": 1.5493832485120331, + "grad_norm": 0.6640625, + "learning_rate": 2.4208323386741107e-06, + "loss": 1.3965, + "step": 8981 + }, + { + "epoch": 1.549555766410765, + "grad_norm": 0.62890625, + "learning_rate": 2.419056998287547e-06, + "loss": 1.4151, + "step": 8982 + }, + { + "epoch": 1.5497282843094973, + "grad_norm": 0.88671875, + "learning_rate": 2.417282219544609e-06, + "loss": 1.538, + "step": 8983 + }, + { + "epoch": 1.5499008022082292, + "grad_norm": 0.59765625, + "learning_rate": 2.415508002576783e-06, + "loss": 1.3951, + "step": 8984 + }, + { + "epoch": 1.5500733201069612, + "grad_norm": 0.69921875, + "learning_rate": 2.413734347515514e-06, + "loss": 1.3653, + "step": 8985 + }, + { + "epoch": 1.5502458380056932, + "grad_norm": 0.5859375, + "learning_rate": 2.411961254492207e-06, + "loss": 1.5114, + "step": 8986 + }, + { + "epoch": 1.5504183559044251, + "grad_norm": 0.61328125, + "learning_rate": 2.4101887236382237e-06, + "loss": 1.4353, + "step": 8987 + }, + { + "epoch": 1.550590873803157, + "grad_norm": 0.58984375, + "learning_rate": 2.408416755084878e-06, + "loss": 1.3855, + "step": 8988 + }, + { + "epoch": 1.550763391701889, + "grad_norm": 0.625, + "learning_rate": 2.4066453489634565e-06, + "loss": 1.4579, + "step": 8989 + }, + { + "epoch": 1.550935909600621, + "grad_norm": 0.55078125, + "learning_rate": 2.4048745054051924e-06, + "loss": 1.3656, + "step": 8990 + }, + { + "epoch": 1.551108427499353, + "grad_norm": 0.80078125, + "learning_rate": 2.403104224541283e-06, + "loss": 1.4416, + "step": 8991 + }, + { + "epoch": 1.551280945398085, + "grad_norm": 0.57421875, + "learning_rate": 2.4013345065028816e-06, + "loss": 1.3904, + "step": 8992 + }, + { + "epoch": 1.5514534632968169, + "grad_norm": 0.59375, + "learning_rate": 2.3995653514210936e-06, + "loss": 1.4777, + "step": 8993 + }, + { + "epoch": 1.551625981195549, + "grad_norm": 0.6484375, + "learning_rate": 2.3977967594270003e-06, + "loss": 1.5117, + "step": 8994 + }, + { + "epoch": 1.551798499094281, + "grad_norm": 0.5703125, + "learning_rate": 2.3960287306516193e-06, + "loss": 1.4209, + "step": 8995 + }, + { + "epoch": 1.551971016993013, + "grad_norm": 0.578125, + "learning_rate": 2.3942612652259436e-06, + "loss": 1.4499, + "step": 8996 + }, + { + "epoch": 1.5521435348917452, + "grad_norm": 0.58203125, + "learning_rate": 2.392494363280915e-06, + "loss": 1.3548, + "step": 8997 + }, + { + "epoch": 1.5523160527904771, + "grad_norm": 0.625, + "learning_rate": 2.3907280249474384e-06, + "loss": 1.4644, + "step": 8998 + }, + { + "epoch": 1.552488570689209, + "grad_norm": 0.58203125, + "learning_rate": 2.3889622503563734e-06, + "loss": 1.52, + "step": 8999 + }, + { + "epoch": 1.552661088587941, + "grad_norm": 0.5625, + "learning_rate": 2.3871970396385457e-06, + "loss": 1.4037, + "step": 9000 + }, + { + "epoch": 1.552661088587941, + "eval_loss": 1.4071366786956787, + "eval_runtime": 10.8655, + "eval_samples_per_second": 94.243, + "eval_steps_per_second": 23.561, + "step": 9000 + }, + { + "epoch": 1.552833606486673, + "grad_norm": 0.5546875, + "learning_rate": 2.3854323929247214e-06, + "loss": 1.36, + "step": 9001 + }, + { + "epoch": 1.553006124385405, + "grad_norm": 0.609375, + "learning_rate": 2.3836683103456493e-06, + "loss": 1.5039, + "step": 9002 + }, + { + "epoch": 1.553178642284137, + "grad_norm": 0.578125, + "learning_rate": 2.381904792032015e-06, + "loss": 1.3965, + "step": 9003 + }, + { + "epoch": 1.553351160182869, + "grad_norm": 0.59375, + "learning_rate": 2.3801418381144712e-06, + "loss": 1.4204, + "step": 9004 + }, + { + "epoch": 1.5535236780816009, + "grad_norm": 8.9375, + "learning_rate": 2.3783794487236367e-06, + "loss": 1.4602, + "step": 9005 + }, + { + "epoch": 1.5536961959803328, + "grad_norm": 0.59765625, + "learning_rate": 2.3766176239900717e-06, + "loss": 1.4365, + "step": 9006 + }, + { + "epoch": 1.553868713879065, + "grad_norm": 0.6875, + "learning_rate": 2.3748563640443066e-06, + "loss": 1.3844, + "step": 9007 + }, + { + "epoch": 1.554041231777797, + "grad_norm": 0.59375, + "learning_rate": 2.373095669016825e-06, + "loss": 1.4673, + "step": 9008 + }, + { + "epoch": 1.554213749676529, + "grad_norm": 0.65234375, + "learning_rate": 2.371335539038073e-06, + "loss": 1.3874, + "step": 9009 + }, + { + "epoch": 1.5543862675752609, + "grad_norm": 0.6171875, + "learning_rate": 2.3695759742384495e-06, + "loss": 1.485, + "step": 9010 + }, + { + "epoch": 1.554558785473993, + "grad_norm": 0.703125, + "learning_rate": 2.367816974748317e-06, + "loss": 1.4039, + "step": 9011 + }, + { + "epoch": 1.554731303372725, + "grad_norm": 0.58984375, + "learning_rate": 2.3660585406979865e-06, + "loss": 1.3879, + "step": 9012 + }, + { + "epoch": 1.554903821271457, + "grad_norm": 0.59765625, + "learning_rate": 2.364300672217744e-06, + "loss": 1.3803, + "step": 9013 + }, + { + "epoch": 1.555076339170189, + "grad_norm": 0.578125, + "learning_rate": 2.3625433694378143e-06, + "loss": 1.3202, + "step": 9014 + }, + { + "epoch": 1.555248857068921, + "grad_norm": 0.59375, + "learning_rate": 2.3607866324883934e-06, + "loss": 1.4688, + "step": 9015 + }, + { + "epoch": 1.5554213749676529, + "grad_norm": 0.671875, + "learning_rate": 2.3590304614996305e-06, + "loss": 1.4, + "step": 9016 + }, + { + "epoch": 1.5555938928663848, + "grad_norm": 0.55859375, + "learning_rate": 2.3572748566016345e-06, + "loss": 1.4108, + "step": 9017 + }, + { + "epoch": 1.5557664107651168, + "grad_norm": 0.5703125, + "learning_rate": 2.3555198179244707e-06, + "loss": 1.3687, + "step": 9018 + }, + { + "epoch": 1.5559389286638488, + "grad_norm": 0.68359375, + "learning_rate": 2.3537653455981655e-06, + "loss": 1.4195, + "step": 9019 + }, + { + "epoch": 1.5561114465625807, + "grad_norm": 0.7734375, + "learning_rate": 2.352011439752695e-06, + "loss": 1.3848, + "step": 9020 + }, + { + "epoch": 1.556283964461313, + "grad_norm": 0.55859375, + "learning_rate": 2.350258100518007e-06, + "loss": 1.4125, + "step": 9021 + }, + { + "epoch": 1.5564564823600449, + "grad_norm": 0.6015625, + "learning_rate": 2.3485053280239955e-06, + "loss": 1.4128, + "step": 9022 + }, + { + "epoch": 1.5566290002587768, + "grad_norm": 0.58203125, + "learning_rate": 2.3467531224005192e-06, + "loss": 1.4206, + "step": 9023 + }, + { + "epoch": 1.556801518157509, + "grad_norm": 0.5546875, + "learning_rate": 2.3450014837773937e-06, + "loss": 1.4219, + "step": 9024 + }, + { + "epoch": 1.556974036056241, + "grad_norm": 0.5703125, + "learning_rate": 2.3432504122843826e-06, + "loss": 1.4805, + "step": 9025 + }, + { + "epoch": 1.557146553954973, + "grad_norm": 0.703125, + "learning_rate": 2.341499908051229e-06, + "loss": 1.3797, + "step": 9026 + }, + { + "epoch": 1.5573190718537049, + "grad_norm": 0.56640625, + "learning_rate": 2.3397499712076124e-06, + "loss": 1.3865, + "step": 9027 + }, + { + "epoch": 1.5574915897524368, + "grad_norm": 0.62890625, + "learning_rate": 2.3380006018831804e-06, + "loss": 1.3843, + "step": 9028 + }, + { + "epoch": 1.5576641076511688, + "grad_norm": 0.58203125, + "learning_rate": 2.3362518002075375e-06, + "loss": 1.3683, + "step": 9029 + }, + { + "epoch": 1.5578366255499008, + "grad_norm": 0.734375, + "learning_rate": 2.3345035663102477e-06, + "loss": 1.5048, + "step": 9030 + }, + { + "epoch": 1.5580091434486327, + "grad_norm": 0.60546875, + "learning_rate": 2.33275590032083e-06, + "loss": 1.4577, + "step": 9031 + }, + { + "epoch": 1.5581816613473647, + "grad_norm": 0.59375, + "learning_rate": 2.331008802368765e-06, + "loss": 1.3813, + "step": 9032 + }, + { + "epoch": 1.5583541792460966, + "grad_norm": 0.56640625, + "learning_rate": 2.32926227258348e-06, + "loss": 1.4236, + "step": 9033 + }, + { + "epoch": 1.5585266971448286, + "grad_norm": 0.59375, + "learning_rate": 2.3275163110943812e-06, + "loss": 1.3342, + "step": 9034 + }, + { + "epoch": 1.5586992150435608, + "grad_norm": 0.609375, + "learning_rate": 2.325770918030811e-06, + "loss": 1.4575, + "step": 9035 + }, + { + "epoch": 1.5588717329422928, + "grad_norm": 0.60546875, + "learning_rate": 2.3240260935220804e-06, + "loss": 1.4458, + "step": 9036 + }, + { + "epoch": 1.5590442508410247, + "grad_norm": 0.59375, + "learning_rate": 2.32228183769746e-06, + "loss": 1.4072, + "step": 9037 + }, + { + "epoch": 1.559216768739757, + "grad_norm": 0.5703125, + "learning_rate": 2.3205381506861735e-06, + "loss": 1.3684, + "step": 9038 + }, + { + "epoch": 1.5593892866384889, + "grad_norm": 0.65234375, + "learning_rate": 2.3187950326174048e-06, + "loss": 1.4303, + "step": 9039 + }, + { + "epoch": 1.5595618045372208, + "grad_norm": 0.58203125, + "learning_rate": 2.3170524836202936e-06, + "loss": 1.4655, + "step": 9040 + }, + { + "epoch": 1.5597343224359528, + "grad_norm": 0.60546875, + "learning_rate": 2.31531050382394e-06, + "loss": 1.3948, + "step": 9041 + }, + { + "epoch": 1.5599068403346847, + "grad_norm": 0.58203125, + "learning_rate": 2.3135690933574007e-06, + "loss": 1.4346, + "step": 9042 + }, + { + "epoch": 1.5600793582334167, + "grad_norm": 0.640625, + "learning_rate": 2.3118282523496917e-06, + "loss": 1.4503, + "step": 9043 + }, + { + "epoch": 1.5602518761321487, + "grad_norm": 0.5859375, + "learning_rate": 2.3100879809297793e-06, + "loss": 1.5228, + "step": 9044 + }, + { + "epoch": 1.5604243940308806, + "grad_norm": 0.5625, + "learning_rate": 2.3083482792266032e-06, + "loss": 1.4708, + "step": 9045 + }, + { + "epoch": 1.5605969119296126, + "grad_norm": 0.5546875, + "learning_rate": 2.3066091473690433e-06, + "loss": 1.3849, + "step": 9046 + }, + { + "epoch": 1.5607694298283445, + "grad_norm": 0.625, + "learning_rate": 2.304870585485949e-06, + "loss": 1.4502, + "step": 9047 + }, + { + "epoch": 1.5609419477270767, + "grad_norm": 0.640625, + "learning_rate": 2.3031325937061222e-06, + "loss": 1.3883, + "step": 9048 + }, + { + "epoch": 1.5611144656258087, + "grad_norm": 0.55859375, + "learning_rate": 2.301395172158325e-06, + "loss": 1.4215, + "step": 9049 + }, + { + "epoch": 1.5612869835245407, + "grad_norm": 0.578125, + "learning_rate": 2.2996583209712776e-06, + "loss": 1.4415, + "step": 9050 + }, + { + "epoch": 1.5614595014232726, + "grad_norm": 0.57421875, + "learning_rate": 2.2979220402736547e-06, + "loss": 1.4267, + "step": 9051 + }, + { + "epoch": 1.5616320193220048, + "grad_norm": 0.55859375, + "learning_rate": 2.2961863301940924e-06, + "loss": 1.4367, + "step": 9052 + }, + { + "epoch": 1.5618045372207368, + "grad_norm": 0.6171875, + "learning_rate": 2.294451190861182e-06, + "loss": 1.4087, + "step": 9053 + }, + { + "epoch": 1.5619770551194687, + "grad_norm": 0.640625, + "learning_rate": 2.292716622403475e-06, + "loss": 1.4869, + "step": 9054 + }, + { + "epoch": 1.5621495730182007, + "grad_norm": 0.6171875, + "learning_rate": 2.290982624949477e-06, + "loss": 1.4822, + "step": 9055 + }, + { + "epoch": 1.5623220909169326, + "grad_norm": 0.5625, + "learning_rate": 2.2892491986276578e-06, + "loss": 1.437, + "step": 9056 + }, + { + "epoch": 1.5624946088156646, + "grad_norm": 0.62109375, + "learning_rate": 2.2875163435664306e-06, + "loss": 1.4063, + "step": 9057 + }, + { + "epoch": 1.5626671267143966, + "grad_norm": 0.58203125, + "learning_rate": 2.285784059894188e-06, + "loss": 1.4217, + "step": 9058 + }, + { + "epoch": 1.5628396446131285, + "grad_norm": 0.6015625, + "learning_rate": 2.2840523477392606e-06, + "loss": 1.3661, + "step": 9059 + }, + { + "epoch": 1.5630121625118605, + "grad_norm": 0.625, + "learning_rate": 2.2823212072299463e-06, + "loss": 1.4059, + "step": 9060 + }, + { + "epoch": 1.5631846804105924, + "grad_norm": 0.56640625, + "learning_rate": 2.2805906384945e-06, + "loss": 1.4256, + "step": 9061 + }, + { + "epoch": 1.5633571983093246, + "grad_norm": 0.578125, + "learning_rate": 2.2788606416611314e-06, + "loss": 1.514, + "step": 9062 + }, + { + "epoch": 1.5635297162080566, + "grad_norm": 0.58984375, + "learning_rate": 2.277131216858011e-06, + "loss": 1.5115, + "step": 9063 + }, + { + "epoch": 1.5637022341067885, + "grad_norm": 0.5859375, + "learning_rate": 2.275402364213267e-06, + "loss": 1.4034, + "step": 9064 + }, + { + "epoch": 1.5638747520055207, + "grad_norm": 0.5546875, + "learning_rate": 2.2736740838549765e-06, + "loss": 1.3902, + "step": 9065 + }, + { + "epoch": 1.5640472699042527, + "grad_norm": 0.58203125, + "learning_rate": 2.2719463759111914e-06, + "loss": 1.5219, + "step": 9066 + }, + { + "epoch": 1.5642197878029847, + "grad_norm": 0.55859375, + "learning_rate": 2.2702192405099035e-06, + "loss": 1.414, + "step": 9067 + }, + { + "epoch": 1.5643923057017166, + "grad_norm": 0.5703125, + "learning_rate": 2.2684926777790717e-06, + "loss": 1.5138, + "step": 9068 + }, + { + "epoch": 1.5645648236004486, + "grad_norm": 0.57421875, + "learning_rate": 2.266766687846611e-06, + "loss": 1.4112, + "step": 9069 + }, + { + "epoch": 1.5647373414991805, + "grad_norm": 0.578125, + "learning_rate": 2.2650412708403934e-06, + "loss": 1.4174, + "step": 9070 + }, + { + "epoch": 1.5649098593979125, + "grad_norm": 0.609375, + "learning_rate": 2.2633164268882492e-06, + "loss": 1.4252, + "step": 9071 + }, + { + "epoch": 1.5650823772966445, + "grad_norm": 0.5625, + "learning_rate": 2.261592156117964e-06, + "loss": 1.4543, + "step": 9072 + }, + { + "epoch": 1.5652548951953764, + "grad_norm": 0.62109375, + "learning_rate": 2.259868458657285e-06, + "loss": 1.4375, + "step": 9073 + }, + { + "epoch": 1.5654274130941084, + "grad_norm": 0.61328125, + "learning_rate": 2.258145334633912e-06, + "loss": 1.4361, + "step": 9074 + }, + { + "epoch": 1.5655999309928406, + "grad_norm": 0.578125, + "learning_rate": 2.2564227841755105e-06, + "loss": 1.3977, + "step": 9075 + }, + { + "epoch": 1.5657724488915725, + "grad_norm": 0.5625, + "learning_rate": 2.2547008074096864e-06, + "loss": 1.37, + "step": 9076 + }, + { + "epoch": 1.5659449667903045, + "grad_norm": 0.5703125, + "learning_rate": 2.252979404464027e-06, + "loss": 1.3545, + "step": 9077 + }, + { + "epoch": 1.5661174846890364, + "grad_norm": 0.625, + "learning_rate": 2.2512585754660533e-06, + "loss": 1.4416, + "step": 9078 + }, + { + "epoch": 1.5662900025877686, + "grad_norm": 0.59765625, + "learning_rate": 2.2495383205432665e-06, + "loss": 1.4559, + "step": 9079 + }, + { + "epoch": 1.5664625204865006, + "grad_norm": 0.59375, + "learning_rate": 2.247818639823105e-06, + "loss": 1.4815, + "step": 9080 + }, + { + "epoch": 1.5666350383852325, + "grad_norm": 0.61328125, + "learning_rate": 2.2460995334329773e-06, + "loss": 1.4407, + "step": 9081 + }, + { + "epoch": 1.5668075562839645, + "grad_norm": 0.56640625, + "learning_rate": 2.244381001500244e-06, + "loss": 1.3868, + "step": 9082 + }, + { + "epoch": 1.5669800741826965, + "grad_norm": 0.578125, + "learning_rate": 2.2426630441522257e-06, + "loss": 1.458, + "step": 9083 + }, + { + "epoch": 1.5671525920814284, + "grad_norm": 0.55859375, + "learning_rate": 2.2409456615161996e-06, + "loss": 1.406, + "step": 9084 + }, + { + "epoch": 1.5673251099801604, + "grad_norm": 0.5859375, + "learning_rate": 2.2392288537193987e-06, + "loss": 1.4267, + "step": 9085 + }, + { + "epoch": 1.5674976278788924, + "grad_norm": 0.64453125, + "learning_rate": 2.2375126208890164e-06, + "loss": 1.4312, + "step": 9086 + }, + { + "epoch": 1.5676701457776243, + "grad_norm": 0.58984375, + "learning_rate": 2.2357969631522016e-06, + "loss": 1.3354, + "step": 9087 + }, + { + "epoch": 1.5678426636763563, + "grad_norm": 0.56640625, + "learning_rate": 2.234081880636063e-06, + "loss": 1.4659, + "step": 9088 + }, + { + "epoch": 1.5680151815750885, + "grad_norm": 0.57421875, + "learning_rate": 2.2323673734676576e-06, + "loss": 1.3875, + "step": 9089 + }, + { + "epoch": 1.5681876994738204, + "grad_norm": 0.56640625, + "learning_rate": 2.2306534417740167e-06, + "loss": 1.5242, + "step": 9090 + }, + { + "epoch": 1.5683602173725524, + "grad_norm": 0.625, + "learning_rate": 2.228940085682111e-06, + "loss": 1.4999, + "step": 9091 + }, + { + "epoch": 1.5685327352712846, + "grad_norm": 0.59375, + "learning_rate": 2.22722730531888e-06, + "loss": 1.3251, + "step": 9092 + }, + { + "epoch": 1.5687052531700165, + "grad_norm": 0.5859375, + "learning_rate": 2.2255151008112164e-06, + "loss": 1.4632, + "step": 9093 + }, + { + "epoch": 1.5688777710687485, + "grad_norm": 0.5859375, + "learning_rate": 2.2238034722859715e-06, + "loss": 1.4337, + "step": 9094 + }, + { + "epoch": 1.5690502889674804, + "grad_norm": 0.5625, + "learning_rate": 2.2220924198699532e-06, + "loss": 1.511, + "step": 9095 + }, + { + "epoch": 1.5692228068662124, + "grad_norm": 0.625, + "learning_rate": 2.2203819436899297e-06, + "loss": 1.4009, + "step": 9096 + }, + { + "epoch": 1.5693953247649444, + "grad_norm": 0.5859375, + "learning_rate": 2.2186720438726163e-06, + "loss": 1.5326, + "step": 9097 + }, + { + "epoch": 1.5695678426636763, + "grad_norm": 0.59375, + "learning_rate": 2.216962720544703e-06, + "loss": 1.3456, + "step": 9098 + }, + { + "epoch": 1.5697403605624083, + "grad_norm": 0.671875, + "learning_rate": 2.2152539738328185e-06, + "loss": 1.533, + "step": 9099 + }, + { + "epoch": 1.5699128784611402, + "grad_norm": 0.71875, + "learning_rate": 2.2135458038635614e-06, + "loss": 1.4004, + "step": 9100 + }, + { + "epoch": 1.5699128784611402, + "eval_loss": 1.4070823192596436, + "eval_runtime": 10.9402, + "eval_samples_per_second": 93.6, + "eval_steps_per_second": 23.4, + "step": 9100 + }, + { + "epoch": 1.5700853963598722, + "grad_norm": 0.68359375, + "learning_rate": 2.211838210763484e-06, + "loss": 1.4252, + "step": 9101 + }, + { + "epoch": 1.5702579142586042, + "grad_norm": 0.625, + "learning_rate": 2.2101311946590942e-06, + "loss": 1.4406, + "step": 9102 + }, + { + "epoch": 1.5704304321573364, + "grad_norm": 0.578125, + "learning_rate": 2.2084247556768577e-06, + "loss": 1.4362, + "step": 9103 + }, + { + "epoch": 1.5706029500560683, + "grad_norm": 0.60546875, + "learning_rate": 2.2067188939432006e-06, + "loss": 1.4133, + "step": 9104 + }, + { + "epoch": 1.5707754679548003, + "grad_norm": 0.58984375, + "learning_rate": 2.205013609584502e-06, + "loss": 1.4302, + "step": 9105 + }, + { + "epoch": 1.5709479858535325, + "grad_norm": 0.65625, + "learning_rate": 2.2033089027271003e-06, + "loss": 1.4432, + "step": 9106 + }, + { + "epoch": 1.5711205037522644, + "grad_norm": 0.57421875, + "learning_rate": 2.2016047734972923e-06, + "loss": 1.3627, + "step": 9107 + }, + { + "epoch": 1.5712930216509964, + "grad_norm": 0.62109375, + "learning_rate": 2.1999012220213277e-06, + "loss": 1.4397, + "step": 9108 + }, + { + "epoch": 1.5714655395497283, + "grad_norm": 0.58203125, + "learning_rate": 2.1981982484254214e-06, + "loss": 1.4597, + "step": 9109 + }, + { + "epoch": 1.5716380574484603, + "grad_norm": 0.58203125, + "learning_rate": 2.1964958528357317e-06, + "loss": 1.3625, + "step": 9110 + }, + { + "epoch": 1.5718105753471923, + "grad_norm": 0.5859375, + "learning_rate": 2.1947940353783927e-06, + "loss": 1.4335, + "step": 9111 + }, + { + "epoch": 1.5719830932459242, + "grad_norm": 0.921875, + "learning_rate": 2.1930927961794777e-06, + "loss": 1.4665, + "step": 9112 + }, + { + "epoch": 1.5721556111446562, + "grad_norm": 0.6796875, + "learning_rate": 2.191392135365029e-06, + "loss": 1.3722, + "step": 9113 + }, + { + "epoch": 1.5723281290433881, + "grad_norm": 0.62890625, + "learning_rate": 2.189692053061041e-06, + "loss": 1.5526, + "step": 9114 + }, + { + "epoch": 1.57250064694212, + "grad_norm": 0.5703125, + "learning_rate": 2.1879925493934663e-06, + "loss": 1.4509, + "step": 9115 + }, + { + "epoch": 1.5726731648408523, + "grad_norm": 0.55078125, + "learning_rate": 2.186293624488216e-06, + "loss": 1.4767, + "step": 9116 + }, + { + "epoch": 1.5728456827395842, + "grad_norm": 0.64453125, + "learning_rate": 2.1845952784711555e-06, + "loss": 1.4349, + "step": 9117 + }, + { + "epoch": 1.5730182006383162, + "grad_norm": 0.625, + "learning_rate": 2.1828975114681097e-06, + "loss": 1.4376, + "step": 9118 + }, + { + "epoch": 1.5731907185370482, + "grad_norm": 0.6484375, + "learning_rate": 2.1812003236048607e-06, + "loss": 1.4136, + "step": 9119 + }, + { + "epoch": 1.5733632364357804, + "grad_norm": 0.5703125, + "learning_rate": 2.1795037150071473e-06, + "loss": 1.4266, + "step": 9120 + }, + { + "epoch": 1.5735357543345123, + "grad_norm": 0.59375, + "learning_rate": 2.177807685800659e-06, + "loss": 1.4399, + "step": 9121 + }, + { + "epoch": 1.5737082722332443, + "grad_norm": 0.57421875, + "learning_rate": 2.1761122361110576e-06, + "loss": 1.4447, + "step": 9122 + }, + { + "epoch": 1.5738807901319762, + "grad_norm": 0.66796875, + "learning_rate": 2.1744173660639446e-06, + "loss": 1.4029, + "step": 9123 + }, + { + "epoch": 1.5740533080307082, + "grad_norm": 0.54296875, + "learning_rate": 2.1727230757848903e-06, + "loss": 1.3948, + "step": 9124 + }, + { + "epoch": 1.5742258259294402, + "grad_norm": 0.578125, + "learning_rate": 2.1710293653994173e-06, + "loss": 1.4587, + "step": 9125 + }, + { + "epoch": 1.5743983438281721, + "grad_norm": 0.60546875, + "learning_rate": 2.169336235033007e-06, + "loss": 1.4837, + "step": 9126 + }, + { + "epoch": 1.574570861726904, + "grad_norm": 0.5703125, + "learning_rate": 2.167643684811096e-06, + "loss": 1.4112, + "step": 9127 + }, + { + "epoch": 1.574743379625636, + "grad_norm": 0.58203125, + "learning_rate": 2.1659517148590825e-06, + "loss": 1.4641, + "step": 9128 + }, + { + "epoch": 1.574915897524368, + "grad_norm": 0.58984375, + "learning_rate": 2.164260325302311e-06, + "loss": 1.4535, + "step": 9129 + }, + { + "epoch": 1.5750884154231002, + "grad_norm": 0.5546875, + "learning_rate": 2.1625695162660986e-06, + "loss": 1.2898, + "step": 9130 + }, + { + "epoch": 1.5752609333218321, + "grad_norm": 0.546875, + "learning_rate": 2.1608792878757044e-06, + "loss": 1.4519, + "step": 9131 + }, + { + "epoch": 1.575433451220564, + "grad_norm": 0.6015625, + "learning_rate": 2.159189640256354e-06, + "loss": 1.5392, + "step": 9132 + }, + { + "epoch": 1.5756059691192963, + "grad_norm": 0.609375, + "learning_rate": 2.1575005735332267e-06, + "loss": 1.3497, + "step": 9133 + }, + { + "epoch": 1.5757784870180283, + "grad_norm": 0.58203125, + "learning_rate": 2.155812087831457e-06, + "loss": 1.4987, + "step": 9134 + }, + { + "epoch": 1.5759510049167602, + "grad_norm": 0.578125, + "learning_rate": 2.154124183276145e-06, + "loss": 1.4392, + "step": 9135 + }, + { + "epoch": 1.5761235228154922, + "grad_norm": 0.578125, + "learning_rate": 2.152436859992335e-06, + "loss": 1.3764, + "step": 9136 + }, + { + "epoch": 1.5762960407142241, + "grad_norm": 0.58203125, + "learning_rate": 2.150750118105035e-06, + "loss": 1.37, + "step": 9137 + }, + { + "epoch": 1.576468558612956, + "grad_norm": 0.578125, + "learning_rate": 2.1490639577392115e-06, + "loss": 1.3533, + "step": 9138 + }, + { + "epoch": 1.576641076511688, + "grad_norm": 0.56640625, + "learning_rate": 2.1473783790197854e-06, + "loss": 1.4118, + "step": 9139 + }, + { + "epoch": 1.57681359441042, + "grad_norm": 0.82421875, + "learning_rate": 2.145693382071633e-06, + "loss": 1.3812, + "step": 9140 + }, + { + "epoch": 1.576986112309152, + "grad_norm": 0.58984375, + "learning_rate": 2.144008967019595e-06, + "loss": 1.5229, + "step": 9141 + }, + { + "epoch": 1.577158630207884, + "grad_norm": 0.56640625, + "learning_rate": 2.1423251339884534e-06, + "loss": 1.4583, + "step": 9142 + }, + { + "epoch": 1.577331148106616, + "grad_norm": 0.6171875, + "learning_rate": 2.1406418831029673e-06, + "loss": 1.4862, + "step": 9143 + }, + { + "epoch": 1.577503666005348, + "grad_norm": 0.59765625, + "learning_rate": 2.138959214487837e-06, + "loss": 1.4565, + "step": 9144 + }, + { + "epoch": 1.57767618390408, + "grad_norm": 0.60546875, + "learning_rate": 2.137277128267725e-06, + "loss": 1.4422, + "step": 9145 + }, + { + "epoch": 1.577848701802812, + "grad_norm": 0.5859375, + "learning_rate": 2.1355956245672518e-06, + "loss": 1.388, + "step": 9146 + }, + { + "epoch": 1.5780212197015442, + "grad_norm": 0.546875, + "learning_rate": 2.1339147035109943e-06, + "loss": 1.26, + "step": 9147 + }, + { + "epoch": 1.5781937376002761, + "grad_norm": 0.58984375, + "learning_rate": 2.1322343652234855e-06, + "loss": 1.5043, + "step": 9148 + }, + { + "epoch": 1.578366255499008, + "grad_norm": 0.59765625, + "learning_rate": 2.130554609829214e-06, + "loss": 1.4859, + "step": 9149 + }, + { + "epoch": 1.57853877339774, + "grad_norm": 0.578125, + "learning_rate": 2.1288754374526275e-06, + "loss": 1.3498, + "step": 9150 + }, + { + "epoch": 1.578711291296472, + "grad_norm": 0.58984375, + "learning_rate": 2.1271968482181306e-06, + "loss": 1.4313, + "step": 9151 + }, + { + "epoch": 1.578883809195204, + "grad_norm": 0.5546875, + "learning_rate": 2.125518842250085e-06, + "loss": 1.4392, + "step": 9152 + }, + { + "epoch": 1.579056327093936, + "grad_norm": 0.58984375, + "learning_rate": 2.1238414196728007e-06, + "loss": 1.3872, + "step": 9153 + }, + { + "epoch": 1.579228844992668, + "grad_norm": 0.5703125, + "learning_rate": 2.1221645806105617e-06, + "loss": 1.4653, + "step": 9154 + }, + { + "epoch": 1.5794013628913999, + "grad_norm": 0.57421875, + "learning_rate": 2.120488325187591e-06, + "loss": 1.437, + "step": 9155 + }, + { + "epoch": 1.5795738807901318, + "grad_norm": 0.59375, + "learning_rate": 2.118812653528077e-06, + "loss": 1.3615, + "step": 9156 + }, + { + "epoch": 1.579746398688864, + "grad_norm": 0.5703125, + "learning_rate": 2.1171375657561677e-06, + "loss": 1.5104, + "step": 9157 + }, + { + "epoch": 1.579918916587596, + "grad_norm": 0.57421875, + "learning_rate": 2.1154630619959605e-06, + "loss": 1.4759, + "step": 9158 + }, + { + "epoch": 1.580091434486328, + "grad_norm": 0.5546875, + "learning_rate": 2.113789142371515e-06, + "loss": 1.3506, + "step": 9159 + }, + { + "epoch": 1.58026395238506, + "grad_norm": 0.64453125, + "learning_rate": 2.112115807006848e-06, + "loss": 1.3673, + "step": 9160 + }, + { + "epoch": 1.580436470283792, + "grad_norm": 0.578125, + "learning_rate": 2.1104430560259227e-06, + "loss": 1.4195, + "step": 9161 + }, + { + "epoch": 1.580608988182524, + "grad_norm": 0.58203125, + "learning_rate": 2.108770889552677e-06, + "loss": 1.5068, + "step": 9162 + }, + { + "epoch": 1.580781506081256, + "grad_norm": 0.5703125, + "learning_rate": 2.107099307710988e-06, + "loss": 1.3579, + "step": 9163 + }, + { + "epoch": 1.580954023979988, + "grad_norm": 0.5703125, + "learning_rate": 2.105428310624699e-06, + "loss": 1.4347, + "step": 9164 + }, + { + "epoch": 1.58112654187872, + "grad_norm": 0.65625, + "learning_rate": 2.103757898417609e-06, + "loss": 1.316, + "step": 9165 + }, + { + "epoch": 1.5812990597774519, + "grad_norm": 0.578125, + "learning_rate": 2.102088071213468e-06, + "loss": 1.3706, + "step": 9166 + }, + { + "epoch": 1.5814715776761838, + "grad_norm": 0.58984375, + "learning_rate": 2.1004188291359973e-06, + "loss": 1.4241, + "step": 9167 + }, + { + "epoch": 1.5816440955749158, + "grad_norm": 0.6015625, + "learning_rate": 2.0987501723088564e-06, + "loss": 1.4717, + "step": 9168 + }, + { + "epoch": 1.5818166134736478, + "grad_norm": 0.5625, + "learning_rate": 2.0970821008556706e-06, + "loss": 1.489, + "step": 9169 + }, + { + "epoch": 1.5819891313723797, + "grad_norm": 0.55859375, + "learning_rate": 2.0954146149000243e-06, + "loss": 1.3503, + "step": 9170 + }, + { + "epoch": 1.582161649271112, + "grad_norm": 0.62890625, + "learning_rate": 2.093747714565453e-06, + "loss": 1.4744, + "step": 9171 + }, + { + "epoch": 1.5823341671698439, + "grad_norm": 0.671875, + "learning_rate": 2.092081399975451e-06, + "loss": 1.4649, + "step": 9172 + }, + { + "epoch": 1.5825066850685758, + "grad_norm": 0.57421875, + "learning_rate": 2.0904156712534718e-06, + "loss": 1.3834, + "step": 9173 + }, + { + "epoch": 1.582679202967308, + "grad_norm": 0.59765625, + "learning_rate": 2.088750528522917e-06, + "loss": 1.5699, + "step": 9174 + }, + { + "epoch": 1.58285172086604, + "grad_norm": 0.578125, + "learning_rate": 2.087085971907159e-06, + "loss": 1.438, + "step": 9175 + }, + { + "epoch": 1.583024238764772, + "grad_norm": 0.56640625, + "learning_rate": 2.0854220015295125e-06, + "loss": 1.3781, + "step": 9176 + }, + { + "epoch": 1.583196756663504, + "grad_norm": 0.57421875, + "learning_rate": 2.083758617513256e-06, + "loss": 1.39, + "step": 9177 + }, + { + "epoch": 1.5833692745622359, + "grad_norm": 0.57421875, + "learning_rate": 2.0820958199816253e-06, + "loss": 1.3723, + "step": 9178 + }, + { + "epoch": 1.5835417924609678, + "grad_norm": 0.55859375, + "learning_rate": 2.0804336090578094e-06, + "loss": 1.4139, + "step": 9179 + }, + { + "epoch": 1.5837143103596998, + "grad_norm": 0.60546875, + "learning_rate": 2.0787719848649544e-06, + "loss": 1.3735, + "step": 9180 + }, + { + "epoch": 1.5838868282584317, + "grad_norm": 0.59375, + "learning_rate": 2.0771109475261654e-06, + "loss": 1.4363, + "step": 9181 + }, + { + "epoch": 1.5840593461571637, + "grad_norm": 0.5859375, + "learning_rate": 2.075450497164503e-06, + "loss": 1.4841, + "step": 9182 + }, + { + "epoch": 1.5842318640558957, + "grad_norm": 0.60546875, + "learning_rate": 2.0737906339029813e-06, + "loss": 1.4085, + "step": 9183 + }, + { + "epoch": 1.5844043819546276, + "grad_norm": 0.5859375, + "learning_rate": 2.0721313578645773e-06, + "loss": 1.3854, + "step": 9184 + }, + { + "epoch": 1.5845768998533598, + "grad_norm": 0.58203125, + "learning_rate": 2.0704726691722134e-06, + "loss": 1.4014, + "step": 9185 + }, + { + "epoch": 1.5847494177520918, + "grad_norm": 0.56640625, + "learning_rate": 2.068814567948785e-06, + "loss": 1.4236, + "step": 9186 + }, + { + "epoch": 1.5849219356508237, + "grad_norm": 0.61328125, + "learning_rate": 2.067157054317127e-06, + "loss": 1.4418, + "step": 9187 + }, + { + "epoch": 1.585094453549556, + "grad_norm": 0.57421875, + "learning_rate": 2.065500128400041e-06, + "loss": 1.546, + "step": 9188 + }, + { + "epoch": 1.5852669714482879, + "grad_norm": 0.6875, + "learning_rate": 2.0638437903202825e-06, + "loss": 1.336, + "step": 9189 + }, + { + "epoch": 1.5854394893470198, + "grad_norm": 0.6640625, + "learning_rate": 2.0621880402005644e-06, + "loss": 1.4443, + "step": 9190 + }, + { + "epoch": 1.5856120072457518, + "grad_norm": 0.61328125, + "learning_rate": 2.0605328781635524e-06, + "loss": 1.5043, + "step": 9191 + }, + { + "epoch": 1.5857845251444838, + "grad_norm": 0.609375, + "learning_rate": 2.058878304331877e-06, + "loss": 1.3014, + "step": 9192 + }, + { + "epoch": 1.5859570430432157, + "grad_norm": 0.64453125, + "learning_rate": 2.0572243188281093e-06, + "loss": 1.3639, + "step": 9193 + }, + { + "epoch": 1.5861295609419477, + "grad_norm": 0.5625, + "learning_rate": 2.055570921774799e-06, + "loss": 1.2815, + "step": 9194 + }, + { + "epoch": 1.5863020788406796, + "grad_norm": 0.6015625, + "learning_rate": 2.0539181132944273e-06, + "loss": 1.4472, + "step": 9195 + }, + { + "epoch": 1.5864745967394116, + "grad_norm": 0.6328125, + "learning_rate": 2.0522658935094565e-06, + "loss": 1.3883, + "step": 9196 + }, + { + "epoch": 1.5866471146381436, + "grad_norm": 0.671875, + "learning_rate": 2.0506142625422863e-06, + "loss": 1.3905, + "step": 9197 + }, + { + "epoch": 1.5868196325368757, + "grad_norm": 0.6171875, + "learning_rate": 2.0489632205152787e-06, + "loss": 1.5331, + "step": 9198 + }, + { + "epoch": 1.5869921504356077, + "grad_norm": 0.62890625, + "learning_rate": 2.047312767550761e-06, + "loss": 1.4573, + "step": 9199 + }, + { + "epoch": 1.5871646683343397, + "grad_norm": 0.5859375, + "learning_rate": 2.045662903771002e-06, + "loss": 1.4495, + "step": 9200 + }, + { + "epoch": 1.5871646683343397, + "eval_loss": 1.407097578048706, + "eval_runtime": 10.8771, + "eval_samples_per_second": 94.143, + "eval_steps_per_second": 23.536, + "step": 9200 + }, + { + "epoch": 1.5873371862330716, + "grad_norm": 0.60546875, + "learning_rate": 2.044013629298235e-06, + "loss": 1.4389, + "step": 9201 + }, + { + "epoch": 1.5875097041318038, + "grad_norm": 0.76171875, + "learning_rate": 2.042364944254651e-06, + "loss": 1.4526, + "step": 9202 + }, + { + "epoch": 1.5876822220305358, + "grad_norm": 0.5703125, + "learning_rate": 2.040716848762393e-06, + "loss": 1.3507, + "step": 9203 + }, + { + "epoch": 1.5878547399292677, + "grad_norm": 0.58203125, + "learning_rate": 2.0390693429435626e-06, + "loss": 1.4062, + "step": 9204 + }, + { + "epoch": 1.5880272578279997, + "grad_norm": 0.58984375, + "learning_rate": 2.0374224269202204e-06, + "loss": 1.4531, + "step": 9205 + }, + { + "epoch": 1.5881997757267317, + "grad_norm": 0.56640625, + "learning_rate": 2.0357761008143715e-06, + "loss": 1.451, + "step": 9206 + }, + { + "epoch": 1.5883722936254636, + "grad_norm": 0.5546875, + "learning_rate": 2.034130364747997e-06, + "loss": 1.4734, + "step": 9207 + }, + { + "epoch": 1.5885448115241956, + "grad_norm": 0.58984375, + "learning_rate": 2.0324852188430167e-06, + "loss": 1.339, + "step": 9208 + }, + { + "epoch": 1.5887173294229275, + "grad_norm": 0.62109375, + "learning_rate": 2.030840663221313e-06, + "loss": 1.4443, + "step": 9209 + }, + { + "epoch": 1.5888898473216595, + "grad_norm": 0.60546875, + "learning_rate": 2.0291966980047282e-06, + "loss": 1.4756, + "step": 9210 + }, + { + "epoch": 1.5890623652203915, + "grad_norm": 0.59375, + "learning_rate": 2.027553323315055e-06, + "loss": 1.4799, + "step": 9211 + }, + { + "epoch": 1.5892348831191236, + "grad_norm": 0.578125, + "learning_rate": 2.0259105392740462e-06, + "loss": 1.3639, + "step": 9212 + }, + { + "epoch": 1.5894074010178556, + "grad_norm": 0.56640625, + "learning_rate": 2.0242683460034084e-06, + "loss": 1.5483, + "step": 9213 + }, + { + "epoch": 1.5895799189165876, + "grad_norm": 0.55078125, + "learning_rate": 2.022626743624807e-06, + "loss": 1.5209, + "step": 9214 + }, + { + "epoch": 1.5897524368153197, + "grad_norm": 0.5625, + "learning_rate": 2.02098573225986e-06, + "loss": 1.3927, + "step": 9215 + }, + { + "epoch": 1.5899249547140517, + "grad_norm": 0.57421875, + "learning_rate": 2.0193453120301496e-06, + "loss": 1.3861, + "step": 9216 + }, + { + "epoch": 1.5900974726127837, + "grad_norm": 0.5703125, + "learning_rate": 2.0177054830571974e-06, + "loss": 1.3874, + "step": 9217 + }, + { + "epoch": 1.5902699905115156, + "grad_norm": 0.58203125, + "learning_rate": 2.0160662454625045e-06, + "loss": 1.3126, + "step": 9218 + }, + { + "epoch": 1.5904425084102476, + "grad_norm": 0.57421875, + "learning_rate": 2.0144275993675077e-06, + "loss": 1.359, + "step": 9219 + }, + { + "epoch": 1.5906150263089796, + "grad_norm": 0.6015625, + "learning_rate": 2.01278954489361e-06, + "loss": 1.5007, + "step": 9220 + }, + { + "epoch": 1.5907875442077115, + "grad_norm": 0.65625, + "learning_rate": 2.0111520821621686e-06, + "loss": 1.4547, + "step": 9221 + }, + { + "epoch": 1.5909600621064435, + "grad_norm": 0.58203125, + "learning_rate": 2.0095152112944983e-06, + "loss": 1.4354, + "step": 9222 + }, + { + "epoch": 1.5911325800051754, + "grad_norm": 0.609375, + "learning_rate": 2.0078789324118686e-06, + "loss": 1.3445, + "step": 9223 + }, + { + "epoch": 1.5913050979039074, + "grad_norm": 0.5703125, + "learning_rate": 2.0062432456355064e-06, + "loss": 1.3494, + "step": 9224 + }, + { + "epoch": 1.5914776158026396, + "grad_norm": 0.57421875, + "learning_rate": 2.0046081510865865e-06, + "loss": 1.4539, + "step": 9225 + }, + { + "epoch": 1.5916501337013715, + "grad_norm": 0.5703125, + "learning_rate": 2.0029736488862585e-06, + "loss": 1.4198, + "step": 9226 + }, + { + "epoch": 1.5918226516001035, + "grad_norm": 0.6015625, + "learning_rate": 2.0013397391556042e-06, + "loss": 1.4261, + "step": 9227 + }, + { + "epoch": 1.5919951694988355, + "grad_norm": 0.62890625, + "learning_rate": 1.9997064220156857e-06, + "loss": 1.3818, + "step": 9228 + }, + { + "epoch": 1.5921676873975676, + "grad_norm": 0.5859375, + "learning_rate": 1.9980736975875005e-06, + "loss": 1.3872, + "step": 9229 + }, + { + "epoch": 1.5923402052962996, + "grad_norm": 0.6328125, + "learning_rate": 1.9964415659920123e-06, + "loss": 1.4362, + "step": 9230 + }, + { + "epoch": 1.5925127231950316, + "grad_norm": 0.5390625, + "learning_rate": 1.9948100273501456e-06, + "loss": 1.3703, + "step": 9231 + }, + { + "epoch": 1.5926852410937635, + "grad_norm": 0.58203125, + "learning_rate": 1.993179081782769e-06, + "loss": 1.3841, + "step": 9232 + }, + { + "epoch": 1.5928577589924955, + "grad_norm": 0.5859375, + "learning_rate": 1.991548729410715e-06, + "loss": 1.4737, + "step": 9233 + }, + { + "epoch": 1.5930302768912274, + "grad_norm": 0.64453125, + "learning_rate": 1.9899189703547694e-06, + "loss": 1.4497, + "step": 9234 + }, + { + "epoch": 1.5932027947899594, + "grad_norm": 0.5625, + "learning_rate": 1.988289804735677e-06, + "loss": 1.337, + "step": 9235 + }, + { + "epoch": 1.5933753126886914, + "grad_norm": 0.58203125, + "learning_rate": 1.986661232674134e-06, + "loss": 1.4287, + "step": 9236 + }, + { + "epoch": 1.5935478305874233, + "grad_norm": 0.60546875, + "learning_rate": 1.9850332542908e-06, + "loss": 1.4264, + "step": 9237 + }, + { + "epoch": 1.5937203484861553, + "grad_norm": 0.84375, + "learning_rate": 1.9834058697062776e-06, + "loss": 1.5309, + "step": 9238 + }, + { + "epoch": 1.5938928663848875, + "grad_norm": 0.625, + "learning_rate": 1.9817790790411428e-06, + "loss": 1.3928, + "step": 9239 + }, + { + "epoch": 1.5940653842836194, + "grad_norm": 0.58984375, + "learning_rate": 1.980152882415911e-06, + "loss": 1.342, + "step": 9240 + }, + { + "epoch": 1.5942379021823514, + "grad_norm": 0.68359375, + "learning_rate": 1.978527279951065e-06, + "loss": 1.3418, + "step": 9241 + }, + { + "epoch": 1.5944104200810834, + "grad_norm": 0.6171875, + "learning_rate": 1.976902271767037e-06, + "loss": 1.4355, + "step": 9242 + }, + { + "epoch": 1.5945829379798155, + "grad_norm": 0.59765625, + "learning_rate": 1.9752778579842214e-06, + "loss": 1.3677, + "step": 9243 + }, + { + "epoch": 1.5947554558785475, + "grad_norm": 0.58203125, + "learning_rate": 1.973654038722962e-06, + "loss": 1.45, + "step": 9244 + }, + { + "epoch": 1.5949279737772795, + "grad_norm": 0.5625, + "learning_rate": 1.9720308141035647e-06, + "loss": 1.407, + "step": 9245 + }, + { + "epoch": 1.5951004916760114, + "grad_norm": 0.5546875, + "learning_rate": 1.9704081842462806e-06, + "loss": 1.3835, + "step": 9246 + }, + { + "epoch": 1.5952730095747434, + "grad_norm": 0.56640625, + "learning_rate": 1.9687861492713323e-06, + "loss": 1.2955, + "step": 9247 + }, + { + "epoch": 1.5954455274734753, + "grad_norm": 0.6015625, + "learning_rate": 1.967164709298889e-06, + "loss": 1.3696, + "step": 9248 + }, + { + "epoch": 1.5956180453722073, + "grad_norm": 0.63671875, + "learning_rate": 1.965543864449071e-06, + "loss": 1.3885, + "step": 9249 + }, + { + "epoch": 1.5957905632709393, + "grad_norm": 0.62890625, + "learning_rate": 1.9639236148419705e-06, + "loss": 1.4505, + "step": 9250 + }, + { + "epoch": 1.5959630811696712, + "grad_norm": 0.58203125, + "learning_rate": 1.962303960597618e-06, + "loss": 1.4014, + "step": 9251 + }, + { + "epoch": 1.5961355990684032, + "grad_norm": 0.5546875, + "learning_rate": 1.9606849018360096e-06, + "loss": 1.4158, + "step": 9252 + }, + { + "epoch": 1.5963081169671354, + "grad_norm": 0.5703125, + "learning_rate": 1.9590664386770953e-06, + "loss": 1.4112, + "step": 9253 + }, + { + "epoch": 1.5964806348658673, + "grad_norm": 0.6484375, + "learning_rate": 1.957448571240782e-06, + "loss": 1.3941, + "step": 9254 + }, + { + "epoch": 1.5966531527645993, + "grad_norm": 0.62890625, + "learning_rate": 1.9558312996469296e-06, + "loss": 1.4583, + "step": 9255 + }, + { + "epoch": 1.5968256706633315, + "grad_norm": 0.59375, + "learning_rate": 1.954214624015358e-06, + "loss": 1.4152, + "step": 9256 + }, + { + "epoch": 1.5969981885620634, + "grad_norm": 1.0390625, + "learning_rate": 1.9525985444658393e-06, + "loss": 1.4532, + "step": 9257 + }, + { + "epoch": 1.5971707064607954, + "grad_norm": 0.88671875, + "learning_rate": 1.9509830611181047e-06, + "loss": 1.3964, + "step": 9258 + }, + { + "epoch": 1.5973432243595274, + "grad_norm": 0.57421875, + "learning_rate": 1.9493681740918335e-06, + "loss": 1.4545, + "step": 9259 + }, + { + "epoch": 1.5975157422582593, + "grad_norm": 0.6015625, + "learning_rate": 1.9477538835066747e-06, + "loss": 1.3786, + "step": 9260 + }, + { + "epoch": 1.5976882601569913, + "grad_norm": 0.57421875, + "learning_rate": 1.9461401894822187e-06, + "loss": 1.336, + "step": 9261 + }, + { + "epoch": 1.5978607780557232, + "grad_norm": 0.5859375, + "learning_rate": 1.944527092138018e-06, + "loss": 1.431, + "step": 9262 + }, + { + "epoch": 1.5980332959544552, + "grad_norm": 0.578125, + "learning_rate": 1.9429145915935886e-06, + "loss": 1.4712, + "step": 9263 + }, + { + "epoch": 1.5982058138531872, + "grad_norm": 0.58203125, + "learning_rate": 1.941302687968386e-06, + "loss": 1.3915, + "step": 9264 + }, + { + "epoch": 1.5983783317519191, + "grad_norm": 0.546875, + "learning_rate": 1.939691381381834e-06, + "loss": 1.3599, + "step": 9265 + }, + { + "epoch": 1.5985508496506513, + "grad_norm": 0.62109375, + "learning_rate": 1.938080671953307e-06, + "loss": 1.294, + "step": 9266 + }, + { + "epoch": 1.5987233675493833, + "grad_norm": 0.58203125, + "learning_rate": 1.9364705598021373e-06, + "loss": 1.407, + "step": 9267 + }, + { + "epoch": 1.5988958854481152, + "grad_norm": 0.59765625, + "learning_rate": 1.9348610450476112e-06, + "loss": 1.3775, + "step": 9268 + }, + { + "epoch": 1.5990684033468472, + "grad_norm": 0.58203125, + "learning_rate": 1.9332521278089756e-06, + "loss": 1.473, + "step": 9269 + }, + { + "epoch": 1.5992409212455794, + "grad_norm": 0.58984375, + "learning_rate": 1.9316438082054203e-06, + "loss": 1.4518, + "step": 9270 + }, + { + "epoch": 1.5994134391443113, + "grad_norm": 0.57421875, + "learning_rate": 1.930036086356111e-06, + "loss": 1.4212, + "step": 9271 + }, + { + "epoch": 1.5995859570430433, + "grad_norm": 0.546875, + "learning_rate": 1.928428962380148e-06, + "loss": 1.4948, + "step": 9272 + }, + { + "epoch": 1.5997584749417753, + "grad_norm": 0.5859375, + "learning_rate": 1.9268224363966016e-06, + "loss": 1.3819, + "step": 9273 + }, + { + "epoch": 1.5999309928405072, + "grad_norm": 0.6484375, + "learning_rate": 1.925216508524492e-06, + "loss": 1.4281, + "step": 9274 + }, + { + "epoch": 1.6001035107392392, + "grad_norm": 0.5703125, + "learning_rate": 1.9236111788827983e-06, + "loss": 1.4988, + "step": 9275 + }, + { + "epoch": 1.6002760286379711, + "grad_norm": 0.703125, + "learning_rate": 1.922006447590451e-06, + "loss": 1.4607, + "step": 9276 + }, + { + "epoch": 1.600448546536703, + "grad_norm": 0.55859375, + "learning_rate": 1.920402314766343e-06, + "loss": 1.3998, + "step": 9277 + }, + { + "epoch": 1.600621064435435, + "grad_norm": 0.625, + "learning_rate": 1.9187987805293096e-06, + "loss": 1.4442, + "step": 9278 + }, + { + "epoch": 1.600793582334167, + "grad_norm": 0.5859375, + "learning_rate": 1.9171958449981587e-06, + "loss": 1.3879, + "step": 9279 + }, + { + "epoch": 1.6009661002328992, + "grad_norm": 0.5625, + "learning_rate": 1.9155935082916467e-06, + "loss": 1.3548, + "step": 9280 + }, + { + "epoch": 1.6011386181316312, + "grad_norm": 0.5703125, + "learning_rate": 1.913991770528475e-06, + "loss": 1.5053, + "step": 9281 + }, + { + "epoch": 1.6013111360303631, + "grad_norm": 0.7578125, + "learning_rate": 1.9123906318273234e-06, + "loss": 1.4098, + "step": 9282 + }, + { + "epoch": 1.6014836539290953, + "grad_norm": 0.703125, + "learning_rate": 1.9107900923068013e-06, + "loss": 1.4627, + "step": 9283 + }, + { + "epoch": 1.6016561718278273, + "grad_norm": 0.58984375, + "learning_rate": 1.909190152085497e-06, + "loss": 1.4288, + "step": 9284 + }, + { + "epoch": 1.6018286897265592, + "grad_norm": 0.76171875, + "learning_rate": 1.9075908112819387e-06, + "loss": 1.376, + "step": 9285 + }, + { + "epoch": 1.6020012076252912, + "grad_norm": 0.57421875, + "learning_rate": 1.9059920700146163e-06, + "loss": 1.3245, + "step": 9286 + }, + { + "epoch": 1.6021737255240232, + "grad_norm": 0.58203125, + "learning_rate": 1.9043939284019753e-06, + "loss": 1.4066, + "step": 9287 + }, + { + "epoch": 1.6023462434227551, + "grad_norm": 0.9375, + "learning_rate": 1.9027963865624166e-06, + "loss": 1.3476, + "step": 9288 + }, + { + "epoch": 1.602518761321487, + "grad_norm": 0.6015625, + "learning_rate": 1.9011994446142935e-06, + "loss": 1.3944, + "step": 9289 + }, + { + "epoch": 1.602691279220219, + "grad_norm": 0.58203125, + "learning_rate": 1.8996031026759232e-06, + "loss": 1.4003, + "step": 9290 + }, + { + "epoch": 1.602863797118951, + "grad_norm": 0.609375, + "learning_rate": 1.8980073608655625e-06, + "loss": 1.4291, + "step": 9291 + }, + { + "epoch": 1.603036315017683, + "grad_norm": 0.609375, + "learning_rate": 1.8964122193014456e-06, + "loss": 1.4436, + "step": 9292 + }, + { + "epoch": 1.603208832916415, + "grad_norm": 0.5546875, + "learning_rate": 1.8948176781017435e-06, + "loss": 1.3513, + "step": 9293 + }, + { + "epoch": 1.603381350815147, + "grad_norm": 0.6484375, + "learning_rate": 1.8932237373845874e-06, + "loss": 1.4303, + "step": 9294 + }, + { + "epoch": 1.603553868713879, + "grad_norm": 0.58984375, + "learning_rate": 1.891630397268076e-06, + "loss": 1.287, + "step": 9295 + }, + { + "epoch": 1.603726386612611, + "grad_norm": 0.578125, + "learning_rate": 1.890037657870245e-06, + "loss": 1.4175, + "step": 9296 + }, + { + "epoch": 1.6038989045113432, + "grad_norm": 0.5703125, + "learning_rate": 1.8884455193090989e-06, + "loss": 1.4304, + "step": 9297 + }, + { + "epoch": 1.6040714224100752, + "grad_norm": 0.58984375, + "learning_rate": 1.886853981702591e-06, + "loss": 1.4971, + "step": 9298 + }, + { + "epoch": 1.6042439403088071, + "grad_norm": 0.58203125, + "learning_rate": 1.8852630451686337e-06, + "loss": 1.4024, + "step": 9299 + }, + { + "epoch": 1.604416458207539, + "grad_norm": 0.59375, + "learning_rate": 1.883672709825094e-06, + "loss": 1.4162, + "step": 9300 + }, + { + "epoch": 1.604416458207539, + "eval_loss": 1.4071283340454102, + "eval_runtime": 10.92, + "eval_samples_per_second": 93.773, + "eval_steps_per_second": 23.443, + "step": 9300 + }, + { + "epoch": 1.604588976106271, + "grad_norm": 0.59375, + "learning_rate": 1.8820829757897952e-06, + "loss": 1.3499, + "step": 9301 + }, + { + "epoch": 1.604761494005003, + "grad_norm": 0.65625, + "learning_rate": 1.8804938431805064e-06, + "loss": 1.4917, + "step": 9302 + }, + { + "epoch": 1.604934011903735, + "grad_norm": 0.63671875, + "learning_rate": 1.8789053121149736e-06, + "loss": 1.3938, + "step": 9303 + }, + { + "epoch": 1.605106529802467, + "grad_norm": 0.609375, + "learning_rate": 1.877317382710875e-06, + "loss": 1.4025, + "step": 9304 + }, + { + "epoch": 1.605279047701199, + "grad_norm": 0.6640625, + "learning_rate": 1.8757300550858571e-06, + "loss": 1.3692, + "step": 9305 + }, + { + "epoch": 1.6054515655999309, + "grad_norm": 0.60546875, + "learning_rate": 1.8741433293575196e-06, + "loss": 1.5194, + "step": 9306 + }, + { + "epoch": 1.605624083498663, + "grad_norm": 0.7265625, + "learning_rate": 1.8725572056434172e-06, + "loss": 1.4504, + "step": 9307 + }, + { + "epoch": 1.605796601397395, + "grad_norm": 0.609375, + "learning_rate": 1.8709716840610592e-06, + "loss": 1.4591, + "step": 9308 + }, + { + "epoch": 1.605969119296127, + "grad_norm": 0.9921875, + "learning_rate": 1.869386764727914e-06, + "loss": 1.4517, + "step": 9309 + }, + { + "epoch": 1.606141637194859, + "grad_norm": 0.609375, + "learning_rate": 1.8678024477613954e-06, + "loss": 1.4905, + "step": 9310 + }, + { + "epoch": 1.606314155093591, + "grad_norm": 0.55859375, + "learning_rate": 1.8662187332788861e-06, + "loss": 1.4565, + "step": 9311 + }, + { + "epoch": 1.606486672992323, + "grad_norm": 0.57421875, + "learning_rate": 1.8646356213977167e-06, + "loss": 1.3746, + "step": 9312 + }, + { + "epoch": 1.606659190891055, + "grad_norm": 0.58984375, + "learning_rate": 1.863053112235168e-06, + "loss": 1.4626, + "step": 9313 + }, + { + "epoch": 1.606831708789787, + "grad_norm": 0.60546875, + "learning_rate": 1.8614712059084927e-06, + "loss": 1.4173, + "step": 9314 + }, + { + "epoch": 1.607004226688519, + "grad_norm": 0.58984375, + "learning_rate": 1.8598899025348771e-06, + "loss": 1.3984, + "step": 9315 + }, + { + "epoch": 1.607176744587251, + "grad_norm": 0.5703125, + "learning_rate": 1.858309202231483e-06, + "loss": 1.4292, + "step": 9316 + }, + { + "epoch": 1.6073492624859829, + "grad_norm": 0.55078125, + "learning_rate": 1.8567291051154135e-06, + "loss": 1.4097, + "step": 9317 + }, + { + "epoch": 1.6075217803847148, + "grad_norm": 0.6171875, + "learning_rate": 1.8551496113037336e-06, + "loss": 1.4493, + "step": 9318 + }, + { + "epoch": 1.6076942982834468, + "grad_norm": 0.5859375, + "learning_rate": 1.8535707209134613e-06, + "loss": 1.2968, + "step": 9319 + }, + { + "epoch": 1.6078668161821787, + "grad_norm": 0.578125, + "learning_rate": 1.8519924340615713e-06, + "loss": 1.4715, + "step": 9320 + }, + { + "epoch": 1.608039334080911, + "grad_norm": 0.5625, + "learning_rate": 1.8504147508649928e-06, + "loss": 1.3962, + "step": 9321 + }, + { + "epoch": 1.608211851979643, + "grad_norm": 0.6015625, + "learning_rate": 1.8488376714406131e-06, + "loss": 1.4307, + "step": 9322 + }, + { + "epoch": 1.6083843698783749, + "grad_norm": 0.5390625, + "learning_rate": 1.8472611959052644e-06, + "loss": 1.4533, + "step": 9323 + }, + { + "epoch": 1.608556887777107, + "grad_norm": 0.58984375, + "learning_rate": 1.8456853243757522e-06, + "loss": 1.4285, + "step": 9324 + }, + { + "epoch": 1.608729405675839, + "grad_norm": 0.59765625, + "learning_rate": 1.8441100569688186e-06, + "loss": 1.4613, + "step": 9325 + }, + { + "epoch": 1.608901923574571, + "grad_norm": 0.65234375, + "learning_rate": 1.8425353938011702e-06, + "loss": 1.4717, + "step": 9326 + }, + { + "epoch": 1.609074441473303, + "grad_norm": 0.5703125, + "learning_rate": 1.8409613349894761e-06, + "loss": 1.4131, + "step": 9327 + }, + { + "epoch": 1.6092469593720349, + "grad_norm": 0.59375, + "learning_rate": 1.839387880650343e-06, + "loss": 1.4449, + "step": 9328 + }, + { + "epoch": 1.6094194772707668, + "grad_norm": 0.609375, + "learning_rate": 1.837815030900345e-06, + "loss": 1.3643, + "step": 9329 + }, + { + "epoch": 1.6095919951694988, + "grad_norm": 1.15625, + "learning_rate": 1.8362427858560094e-06, + "loss": 1.379, + "step": 9330 + }, + { + "epoch": 1.6097645130682308, + "grad_norm": 0.63671875, + "learning_rate": 1.8346711456338185e-06, + "loss": 1.4487, + "step": 9331 + }, + { + "epoch": 1.6099370309669627, + "grad_norm": 0.5546875, + "learning_rate": 1.8331001103502077e-06, + "loss": 1.411, + "step": 9332 + }, + { + "epoch": 1.6101095488656947, + "grad_norm": 0.578125, + "learning_rate": 1.8315296801215721e-06, + "loss": 1.3986, + "step": 9333 + }, + { + "epoch": 1.6102820667644266, + "grad_norm": 0.6015625, + "learning_rate": 1.8299598550642528e-06, + "loss": 1.5243, + "step": 9334 + }, + { + "epoch": 1.6104545846631588, + "grad_norm": 0.5859375, + "learning_rate": 1.82839063529456e-06, + "loss": 1.4422, + "step": 9335 + }, + { + "epoch": 1.6106271025618908, + "grad_norm": 0.578125, + "learning_rate": 1.8268220209287457e-06, + "loss": 1.3994, + "step": 9336 + }, + { + "epoch": 1.6107996204606227, + "grad_norm": 0.55859375, + "learning_rate": 1.8252540120830253e-06, + "loss": 1.3916, + "step": 9337 + }, + { + "epoch": 1.610972138359355, + "grad_norm": 0.59765625, + "learning_rate": 1.823686608873565e-06, + "loss": 1.4315, + "step": 9338 + }, + { + "epoch": 1.611144656258087, + "grad_norm": 0.59765625, + "learning_rate": 1.8221198114164885e-06, + "loss": 1.3636, + "step": 9339 + }, + { + "epoch": 1.6113171741568189, + "grad_norm": 0.5625, + "learning_rate": 1.8205536198278739e-06, + "loss": 1.4466, + "step": 9340 + }, + { + "epoch": 1.6114896920555508, + "grad_norm": 0.5625, + "learning_rate": 1.8189880342237576e-06, + "loss": 1.4002, + "step": 9341 + }, + { + "epoch": 1.6116622099542828, + "grad_norm": 0.67578125, + "learning_rate": 1.81742305472012e-06, + "loss": 1.5221, + "step": 9342 + }, + { + "epoch": 1.6118347278530147, + "grad_norm": 0.5859375, + "learning_rate": 1.8158586814329126e-06, + "loss": 1.3755, + "step": 9343 + }, + { + "epoch": 1.6120072457517467, + "grad_norm": 0.55859375, + "learning_rate": 1.8142949144780297e-06, + "loss": 1.4148, + "step": 9344 + }, + { + "epoch": 1.6121797636504787, + "grad_norm": 0.58203125, + "learning_rate": 1.8127317539713273e-06, + "loss": 1.4302, + "step": 9345 + }, + { + "epoch": 1.6123522815492106, + "grad_norm": 0.59765625, + "learning_rate": 1.811169200028615e-06, + "loss": 1.4514, + "step": 9346 + }, + { + "epoch": 1.6125247994479426, + "grad_norm": 0.55859375, + "learning_rate": 1.8096072527656505e-06, + "loss": 1.4674, + "step": 9347 + }, + { + "epoch": 1.6126973173466748, + "grad_norm": 0.59765625, + "learning_rate": 1.8080459122981609e-06, + "loss": 1.5049, + "step": 9348 + }, + { + "epoch": 1.6128698352454067, + "grad_norm": 0.55859375, + "learning_rate": 1.8064851787418126e-06, + "loss": 1.4882, + "step": 9349 + }, + { + "epoch": 1.6130423531441387, + "grad_norm": 0.54296875, + "learning_rate": 1.8049250522122386e-06, + "loss": 1.4113, + "step": 9350 + }, + { + "epoch": 1.6132148710428706, + "grad_norm": 0.59765625, + "learning_rate": 1.8033655328250222e-06, + "loss": 1.5452, + "step": 9351 + }, + { + "epoch": 1.6133873889416028, + "grad_norm": 0.5859375, + "learning_rate": 1.8018066206957007e-06, + "loss": 1.5255, + "step": 9352 + }, + { + "epoch": 1.6135599068403348, + "grad_norm": 0.6015625, + "learning_rate": 1.800248315939771e-06, + "loss": 1.4721, + "step": 9353 + }, + { + "epoch": 1.6137324247390668, + "grad_norm": 0.61328125, + "learning_rate": 1.7986906186726815e-06, + "loss": 1.3949, + "step": 9354 + }, + { + "epoch": 1.6139049426377987, + "grad_norm": 0.5625, + "learning_rate": 1.7971335290098314e-06, + "loss": 1.4231, + "step": 9355 + }, + { + "epoch": 1.6140774605365307, + "grad_norm": 0.578125, + "learning_rate": 1.7955770470665879e-06, + "loss": 1.441, + "step": 9356 + }, + { + "epoch": 1.6142499784352626, + "grad_norm": 0.55859375, + "learning_rate": 1.794021172958258e-06, + "loss": 1.4214, + "step": 9357 + }, + { + "epoch": 1.6144224963339946, + "grad_norm": 0.62890625, + "learning_rate": 1.7924659068001094e-06, + "loss": 1.402, + "step": 9358 + }, + { + "epoch": 1.6145950142327266, + "grad_norm": 0.546875, + "learning_rate": 1.7909112487073754e-06, + "loss": 1.4344, + "step": 9359 + }, + { + "epoch": 1.6147675321314585, + "grad_norm": 0.59375, + "learning_rate": 1.7893571987952262e-06, + "loss": 1.5812, + "step": 9360 + }, + { + "epoch": 1.6149400500301905, + "grad_norm": 0.625, + "learning_rate": 1.7878037571787987e-06, + "loss": 1.4667, + "step": 9361 + }, + { + "epoch": 1.6151125679289227, + "grad_norm": 0.6171875, + "learning_rate": 1.7862509239731806e-06, + "loss": 1.4051, + "step": 9362 + }, + { + "epoch": 1.6152850858276546, + "grad_norm": 0.59765625, + "learning_rate": 1.7846986992934157e-06, + "loss": 1.3622, + "step": 9363 + }, + { + "epoch": 1.6154576037263866, + "grad_norm": 0.5703125, + "learning_rate": 1.7831470832545028e-06, + "loss": 1.4268, + "step": 9364 + }, + { + "epoch": 1.6156301216251188, + "grad_norm": 0.640625, + "learning_rate": 1.781596075971398e-06, + "loss": 1.3411, + "step": 9365 + }, + { + "epoch": 1.6158026395238507, + "grad_norm": 0.59375, + "learning_rate": 1.7800456775590014e-06, + "loss": 1.4521, + "step": 9366 + }, + { + "epoch": 1.6159751574225827, + "grad_norm": 0.56640625, + "learning_rate": 1.7784958881321868e-06, + "loss": 1.4027, + "step": 9367 + }, + { + "epoch": 1.6161476753213146, + "grad_norm": 0.56640625, + "learning_rate": 1.7769467078057644e-06, + "loss": 1.4094, + "step": 9368 + }, + { + "epoch": 1.6163201932200466, + "grad_norm": 0.61328125, + "learning_rate": 1.7753981366945105e-06, + "loss": 1.5092, + "step": 9369 + }, + { + "epoch": 1.6164927111187786, + "grad_norm": 0.5859375, + "learning_rate": 1.773850174913152e-06, + "loss": 1.3982, + "step": 9370 + }, + { + "epoch": 1.6166652290175105, + "grad_norm": 0.6484375, + "learning_rate": 1.7723028225763727e-06, + "loss": 1.3922, + "step": 9371 + }, + { + "epoch": 1.6168377469162425, + "grad_norm": 0.5625, + "learning_rate": 1.770756079798809e-06, + "loss": 1.4047, + "step": 9372 + }, + { + "epoch": 1.6170102648149745, + "grad_norm": 0.58203125, + "learning_rate": 1.7692099466950551e-06, + "loss": 1.4964, + "step": 9373 + }, + { + "epoch": 1.6171827827137064, + "grad_norm": 0.61328125, + "learning_rate": 1.7676644233796525e-06, + "loss": 1.4487, + "step": 9374 + }, + { + "epoch": 1.6173553006124386, + "grad_norm": 0.58203125, + "learning_rate": 1.766119509967109e-06, + "loss": 1.5349, + "step": 9375 + }, + { + "epoch": 1.6175278185111706, + "grad_norm": 0.5625, + "learning_rate": 1.7645752065718814e-06, + "loss": 1.4116, + "step": 9376 + }, + { + "epoch": 1.6177003364099025, + "grad_norm": 0.56640625, + "learning_rate": 1.7630315133083786e-06, + "loss": 1.3519, + "step": 9377 + }, + { + "epoch": 1.6178728543086345, + "grad_norm": 0.546875, + "learning_rate": 1.7614884302909719e-06, + "loss": 1.3943, + "step": 9378 + }, + { + "epoch": 1.6180453722073667, + "grad_norm": 0.6015625, + "learning_rate": 1.7599459576339729e-06, + "loss": 1.5073, + "step": 9379 + }, + { + "epoch": 1.6182178901060986, + "grad_norm": 0.578125, + "learning_rate": 1.758404095451669e-06, + "loss": 1.402, + "step": 9380 + }, + { + "epoch": 1.6183904080048306, + "grad_norm": 0.6171875, + "learning_rate": 1.7568628438582825e-06, + "loss": 1.4218, + "step": 9381 + }, + { + "epoch": 1.6185629259035625, + "grad_norm": 0.6796875, + "learning_rate": 1.7553222029680028e-06, + "loss": 1.4593, + "step": 9382 + }, + { + "epoch": 1.6187354438022945, + "grad_norm": 0.59765625, + "learning_rate": 1.7537821728949688e-06, + "loss": 1.3169, + "step": 9383 + }, + { + "epoch": 1.6189079617010265, + "grad_norm": 0.8984375, + "learning_rate": 1.752242753753276e-06, + "loss": 1.3619, + "step": 9384 + }, + { + "epoch": 1.6190804795997584, + "grad_norm": 0.54296875, + "learning_rate": 1.7507039456569753e-06, + "loss": 1.4082, + "step": 9385 + }, + { + "epoch": 1.6192529974984904, + "grad_norm": 0.58984375, + "learning_rate": 1.749165748720072e-06, + "loss": 1.4374, + "step": 9386 + }, + { + "epoch": 1.6194255153972223, + "grad_norm": 0.77734375, + "learning_rate": 1.7476281630565195e-06, + "loss": 1.4549, + "step": 9387 + }, + { + "epoch": 1.6195980332959543, + "grad_norm": 0.5625, + "learning_rate": 1.74609118878024e-06, + "loss": 1.4547, + "step": 9388 + }, + { + "epoch": 1.6197705511946865, + "grad_norm": 0.58203125, + "learning_rate": 1.744554826005096e-06, + "loss": 1.4083, + "step": 9389 + }, + { + "epoch": 1.6199430690934185, + "grad_norm": 0.56640625, + "learning_rate": 1.7430190748449105e-06, + "loss": 1.4198, + "step": 9390 + }, + { + "epoch": 1.6201155869921504, + "grad_norm": 0.58203125, + "learning_rate": 1.7414839354134684e-06, + "loss": 1.3815, + "step": 9391 + }, + { + "epoch": 1.6202881048908824, + "grad_norm": 0.62890625, + "learning_rate": 1.739949407824496e-06, + "loss": 1.4645, + "step": 9392 + }, + { + "epoch": 1.6204606227896146, + "grad_norm": 0.640625, + "learning_rate": 1.7384154921916819e-06, + "loss": 1.4625, + "step": 9393 + }, + { + "epoch": 1.6206331406883465, + "grad_norm": 0.59375, + "learning_rate": 1.7368821886286691e-06, + "loss": 1.3335, + "step": 9394 + }, + { + "epoch": 1.6208056585870785, + "grad_norm": 0.62109375, + "learning_rate": 1.7353494972490548e-06, + "loss": 1.5648, + "step": 9395 + }, + { + "epoch": 1.6209781764858104, + "grad_norm": 0.5546875, + "learning_rate": 1.7338174181663891e-06, + "loss": 1.4175, + "step": 9396 + }, + { + "epoch": 1.6211506943845424, + "grad_norm": 0.59375, + "learning_rate": 1.7322859514941813e-06, + "loss": 1.3912, + "step": 9397 + }, + { + "epoch": 1.6213232122832744, + "grad_norm": 0.609375, + "learning_rate": 1.7307550973458843e-06, + "loss": 1.3587, + "step": 9398 + }, + { + "epoch": 1.6214957301820063, + "grad_norm": 0.609375, + "learning_rate": 1.7292248558349233e-06, + "loss": 1.5153, + "step": 9399 + }, + { + "epoch": 1.6216682480807383, + "grad_norm": 0.6015625, + "learning_rate": 1.7276952270746606e-06, + "loss": 1.3662, + "step": 9400 + }, + { + "epoch": 1.6216682480807383, + "eval_loss": 1.4071524143218994, + "eval_runtime": 10.8335, + "eval_samples_per_second": 94.522, + "eval_steps_per_second": 23.631, + "step": 9400 + }, + { + "epoch": 1.6218407659794702, + "grad_norm": 0.58203125, + "learning_rate": 1.7261662111784229e-06, + "loss": 1.4412, + "step": 9401 + }, + { + "epoch": 1.6220132838782022, + "grad_norm": 0.5703125, + "learning_rate": 1.7246378082594906e-06, + "loss": 1.3786, + "step": 9402 + }, + { + "epoch": 1.6221858017769344, + "grad_norm": 0.609375, + "learning_rate": 1.7231100184310955e-06, + "loss": 1.3919, + "step": 9403 + }, + { + "epoch": 1.6223583196756663, + "grad_norm": 0.59765625, + "learning_rate": 1.7215828418064263e-06, + "loss": 1.3885, + "step": 9404 + }, + { + "epoch": 1.6225308375743983, + "grad_norm": 0.56640625, + "learning_rate": 1.7200562784986263e-06, + "loss": 1.3885, + "step": 9405 + }, + { + "epoch": 1.6227033554731305, + "grad_norm": 0.58984375, + "learning_rate": 1.7185303286207932e-06, + "loss": 1.4058, + "step": 9406 + }, + { + "epoch": 1.6228758733718625, + "grad_norm": 0.625, + "learning_rate": 1.7170049922859788e-06, + "loss": 1.4768, + "step": 9407 + }, + { + "epoch": 1.6230483912705944, + "grad_norm": 0.60546875, + "learning_rate": 1.7154802696071882e-06, + "loss": 1.4446, + "step": 9408 + }, + { + "epoch": 1.6232209091693264, + "grad_norm": 0.5625, + "learning_rate": 1.7139561606973832e-06, + "loss": 1.4344, + "step": 9409 + }, + { + "epoch": 1.6233934270680583, + "grad_norm": 0.59765625, + "learning_rate": 1.7124326656694823e-06, + "loss": 1.4298, + "step": 9410 + }, + { + "epoch": 1.6235659449667903, + "grad_norm": 0.5859375, + "learning_rate": 1.7109097846363476e-06, + "loss": 1.435, + "step": 9411 + }, + { + "epoch": 1.6237384628655223, + "grad_norm": 0.57421875, + "learning_rate": 1.7093875177108131e-06, + "loss": 1.4331, + "step": 9412 + }, + { + "epoch": 1.6239109807642542, + "grad_norm": 0.59765625, + "learning_rate": 1.7078658650056524e-06, + "loss": 1.405, + "step": 9413 + }, + { + "epoch": 1.6240834986629862, + "grad_norm": 0.640625, + "learning_rate": 1.7063448266335991e-06, + "loss": 1.5126, + "step": 9414 + }, + { + "epoch": 1.6242560165617181, + "grad_norm": 0.59375, + "learning_rate": 1.7048244027073424e-06, + "loss": 1.3834, + "step": 9415 + }, + { + "epoch": 1.6244285344604503, + "grad_norm": 0.57421875, + "learning_rate": 1.7033045933395242e-06, + "loss": 1.4099, + "step": 9416 + }, + { + "epoch": 1.6246010523591823, + "grad_norm": 0.64453125, + "learning_rate": 1.7017853986427423e-06, + "loss": 1.3877, + "step": 9417 + }, + { + "epoch": 1.6247735702579142, + "grad_norm": 0.5546875, + "learning_rate": 1.70026681872955e-06, + "loss": 1.3025, + "step": 9418 + }, + { + "epoch": 1.6249460881566462, + "grad_norm": 0.625, + "learning_rate": 1.6987488537124454e-06, + "loss": 1.4223, + "step": 9419 + }, + { + "epoch": 1.6251186060553784, + "grad_norm": 0.5625, + "learning_rate": 1.6972315037039e-06, + "loss": 1.4389, + "step": 9420 + }, + { + "epoch": 1.6252911239541104, + "grad_norm": 0.58203125, + "learning_rate": 1.69571476881632e-06, + "loss": 1.4033, + "step": 9421 + }, + { + "epoch": 1.6254636418528423, + "grad_norm": 0.55859375, + "learning_rate": 1.6941986491620754e-06, + "loss": 1.3848, + "step": 9422 + }, + { + "epoch": 1.6256361597515743, + "grad_norm": 0.5625, + "learning_rate": 1.6926831448534964e-06, + "loss": 1.448, + "step": 9423 + }, + { + "epoch": 1.6258086776503062, + "grad_norm": 0.77734375, + "learning_rate": 1.6911682560028552e-06, + "loss": 1.4198, + "step": 9424 + }, + { + "epoch": 1.6259811955490382, + "grad_norm": 0.55859375, + "learning_rate": 1.6896539827223845e-06, + "loss": 1.4735, + "step": 9425 + }, + { + "epoch": 1.6261537134477702, + "grad_norm": 0.66015625, + "learning_rate": 1.6881403251242723e-06, + "loss": 1.2885, + "step": 9426 + }, + { + "epoch": 1.6263262313465021, + "grad_norm": 0.60546875, + "learning_rate": 1.6866272833206598e-06, + "loss": 1.332, + "step": 9427 + }, + { + "epoch": 1.626498749245234, + "grad_norm": 0.54296875, + "learning_rate": 1.685114857423643e-06, + "loss": 1.3159, + "step": 9428 + }, + { + "epoch": 1.626671267143966, + "grad_norm": 0.5625, + "learning_rate": 1.6836030475452737e-06, + "loss": 1.3439, + "step": 9429 + }, + { + "epoch": 1.6268437850426982, + "grad_norm": 0.57421875, + "learning_rate": 1.6820918537975484e-06, + "loss": 1.4719, + "step": 9430 + }, + { + "epoch": 1.6270163029414302, + "grad_norm": 0.625, + "learning_rate": 1.6805812762924367e-06, + "loss": 1.4097, + "step": 9431 + }, + { + "epoch": 1.6271888208401621, + "grad_norm": 0.5859375, + "learning_rate": 1.6790713151418403e-06, + "loss": 1.3939, + "step": 9432 + }, + { + "epoch": 1.6273613387388943, + "grad_norm": 0.59765625, + "learning_rate": 1.6775619704576385e-06, + "loss": 1.4855, + "step": 9433 + }, + { + "epoch": 1.6275338566376263, + "grad_norm": 0.9453125, + "learning_rate": 1.676053242351644e-06, + "loss": 1.3427, + "step": 9434 + }, + { + "epoch": 1.6277063745363582, + "grad_norm": 0.57421875, + "learning_rate": 1.6745451309356354e-06, + "loss": 1.4919, + "step": 9435 + }, + { + "epoch": 1.6278788924350902, + "grad_norm": 0.578125, + "learning_rate": 1.673037636321344e-06, + "loss": 1.4468, + "step": 9436 + }, + { + "epoch": 1.6280514103338222, + "grad_norm": 0.5625, + "learning_rate": 1.6715307586204533e-06, + "loss": 1.4331, + "step": 9437 + }, + { + "epoch": 1.6282239282325541, + "grad_norm": 0.61328125, + "learning_rate": 1.6700244979446034e-06, + "loss": 1.3679, + "step": 9438 + }, + { + "epoch": 1.628396446131286, + "grad_norm": 0.5625, + "learning_rate": 1.6685188544053855e-06, + "loss": 1.317, + "step": 9439 + }, + { + "epoch": 1.628568964030018, + "grad_norm": 0.59765625, + "learning_rate": 1.66701382811435e-06, + "loss": 1.4306, + "step": 9440 + }, + { + "epoch": 1.62874148192875, + "grad_norm": 0.57421875, + "learning_rate": 1.6655094191829957e-06, + "loss": 1.3777, + "step": 9441 + }, + { + "epoch": 1.628913999827482, + "grad_norm": 0.53515625, + "learning_rate": 1.6640056277227824e-06, + "loss": 1.3518, + "step": 9442 + }, + { + "epoch": 1.629086517726214, + "grad_norm": 0.5859375, + "learning_rate": 1.662502453845114e-06, + "loss": 1.5547, + "step": 9443 + }, + { + "epoch": 1.6292590356249461, + "grad_norm": 0.56640625, + "learning_rate": 1.660999897661363e-06, + "loss": 1.4116, + "step": 9444 + }, + { + "epoch": 1.629431553523678, + "grad_norm": 0.5703125, + "learning_rate": 1.659497959282842e-06, + "loss": 1.3623, + "step": 9445 + }, + { + "epoch": 1.62960407142241, + "grad_norm": 0.6640625, + "learning_rate": 1.6579966388208257e-06, + "loss": 1.4435, + "step": 9446 + }, + { + "epoch": 1.6297765893211422, + "grad_norm": 0.57421875, + "learning_rate": 1.6564959363865418e-06, + "loss": 1.4531, + "step": 9447 + }, + { + "epoch": 1.6299491072198742, + "grad_norm": 0.58984375, + "learning_rate": 1.6549958520911713e-06, + "loss": 1.3742, + "step": 9448 + }, + { + "epoch": 1.6301216251186061, + "grad_norm": 0.55859375, + "learning_rate": 1.653496386045851e-06, + "loss": 1.341, + "step": 9449 + }, + { + "epoch": 1.630294143017338, + "grad_norm": 0.5703125, + "learning_rate": 1.6519975383616716e-06, + "loss": 1.3705, + "step": 9450 + }, + { + "epoch": 1.63046666091607, + "grad_norm": 0.57421875, + "learning_rate": 1.6504993091496701e-06, + "loss": 1.4054, + "step": 9451 + }, + { + "epoch": 1.630639178814802, + "grad_norm": 0.58984375, + "learning_rate": 1.6490016985208546e-06, + "loss": 1.4413, + "step": 9452 + }, + { + "epoch": 1.630811696713534, + "grad_norm": 0.671875, + "learning_rate": 1.6475047065861705e-06, + "loss": 1.4356, + "step": 9453 + }, + { + "epoch": 1.630984214612266, + "grad_norm": 1.046875, + "learning_rate": 1.6460083334565248e-06, + "loss": 1.3905, + "step": 9454 + }, + { + "epoch": 1.631156732510998, + "grad_norm": 0.78515625, + "learning_rate": 1.644512579242784e-06, + "loss": 1.3791, + "step": 9455 + }, + { + "epoch": 1.6313292504097299, + "grad_norm": 0.5625, + "learning_rate": 1.6430174440557567e-06, + "loss": 1.4764, + "step": 9456 + }, + { + "epoch": 1.631501768308462, + "grad_norm": 0.53515625, + "learning_rate": 1.6415229280062139e-06, + "loss": 1.39, + "step": 9457 + }, + { + "epoch": 1.631674286207194, + "grad_norm": 0.55859375, + "learning_rate": 1.6400290312048794e-06, + "loss": 1.4338, + "step": 9458 + }, + { + "epoch": 1.631846804105926, + "grad_norm": 0.578125, + "learning_rate": 1.638535753762429e-06, + "loss": 1.4187, + "step": 9459 + }, + { + "epoch": 1.632019322004658, + "grad_norm": 0.6171875, + "learning_rate": 1.6370430957894945e-06, + "loss": 1.4055, + "step": 9460 + }, + { + "epoch": 1.6321918399033901, + "grad_norm": 0.5703125, + "learning_rate": 1.6355510573966648e-06, + "loss": 1.5129, + "step": 9461 + }, + { + "epoch": 1.632364357802122, + "grad_norm": 0.55859375, + "learning_rate": 1.6340596386944707e-06, + "loss": 1.3567, + "step": 9462 + }, + { + "epoch": 1.632536875700854, + "grad_norm": 0.59765625, + "learning_rate": 1.632568839793417e-06, + "loss": 1.379, + "step": 9463 + }, + { + "epoch": 1.632709393599586, + "grad_norm": 0.6015625, + "learning_rate": 1.6310786608039408e-06, + "loss": 1.4042, + "step": 9464 + }, + { + "epoch": 1.632881911498318, + "grad_norm": 0.57421875, + "learning_rate": 1.629589101836454e-06, + "loss": 1.3955, + "step": 9465 + }, + { + "epoch": 1.63305442939705, + "grad_norm": 0.53515625, + "learning_rate": 1.6281001630013038e-06, + "loss": 1.3853, + "step": 9466 + }, + { + "epoch": 1.6332269472957819, + "grad_norm": 0.55078125, + "learning_rate": 1.6266118444088041e-06, + "loss": 1.3801, + "step": 9467 + }, + { + "epoch": 1.6333994651945138, + "grad_norm": 0.61328125, + "learning_rate": 1.625124146169218e-06, + "loss": 1.5522, + "step": 9468 + }, + { + "epoch": 1.6335719830932458, + "grad_norm": 0.59375, + "learning_rate": 1.623637068392765e-06, + "loss": 1.4954, + "step": 9469 + }, + { + "epoch": 1.6337445009919778, + "grad_norm": 0.65625, + "learning_rate": 1.6221506111896147e-06, + "loss": 1.4894, + "step": 9470 + }, + { + "epoch": 1.63391701889071, + "grad_norm": 0.6328125, + "learning_rate": 1.6206647746698945e-06, + "loss": 1.4969, + "step": 9471 + }, + { + "epoch": 1.634089536789442, + "grad_norm": 0.57421875, + "learning_rate": 1.6191795589436843e-06, + "loss": 1.4825, + "step": 9472 + }, + { + "epoch": 1.6342620546881739, + "grad_norm": 0.609375, + "learning_rate": 1.6176949641210183e-06, + "loss": 1.4227, + "step": 9473 + }, + { + "epoch": 1.634434572586906, + "grad_norm": 0.5625, + "learning_rate": 1.6162109903118872e-06, + "loss": 1.4529, + "step": 9474 + }, + { + "epoch": 1.634607090485638, + "grad_norm": 0.58984375, + "learning_rate": 1.6147276376262256e-06, + "loss": 1.4036, + "step": 9475 + }, + { + "epoch": 1.63477960838437, + "grad_norm": 0.56640625, + "learning_rate": 1.6132449061739386e-06, + "loss": 1.367, + "step": 9476 + }, + { + "epoch": 1.634952126283102, + "grad_norm": 0.56640625, + "learning_rate": 1.6117627960648684e-06, + "loss": 1.4574, + "step": 9477 + }, + { + "epoch": 1.635124644181834, + "grad_norm": 0.59375, + "learning_rate": 1.6102813074088242e-06, + "loss": 1.4849, + "step": 9478 + }, + { + "epoch": 1.6352971620805659, + "grad_norm": 0.69140625, + "learning_rate": 1.6088004403155611e-06, + "loss": 1.3788, + "step": 9479 + }, + { + "epoch": 1.6354696799792978, + "grad_norm": 0.5703125, + "learning_rate": 1.6073201948947925e-06, + "loss": 1.3557, + "step": 9480 + }, + { + "epoch": 1.6356421978780298, + "grad_norm": 0.6015625, + "learning_rate": 1.605840571256183e-06, + "loss": 1.5011, + "step": 9481 + }, + { + "epoch": 1.6358147157767617, + "grad_norm": 0.58984375, + "learning_rate": 1.6043615695093561e-06, + "loss": 1.3889, + "step": 9482 + }, + { + "epoch": 1.6359872336754937, + "grad_norm": 0.61328125, + "learning_rate": 1.6028831897638774e-06, + "loss": 1.3383, + "step": 9483 + }, + { + "epoch": 1.6361597515742257, + "grad_norm": 0.5546875, + "learning_rate": 1.6014054321292848e-06, + "loss": 1.3963, + "step": 9484 + }, + { + "epoch": 1.6363322694729578, + "grad_norm": 0.6171875, + "learning_rate": 1.599928296715052e-06, + "loss": 1.4085, + "step": 9485 + }, + { + "epoch": 1.6365047873716898, + "grad_norm": 0.59375, + "learning_rate": 1.5984517836306168e-06, + "loss": 1.4987, + "step": 9486 + }, + { + "epoch": 1.6366773052704218, + "grad_norm": 0.57421875, + "learning_rate": 1.5969758929853697e-06, + "loss": 1.4016, + "step": 9487 + }, + { + "epoch": 1.636849823169154, + "grad_norm": 0.59375, + "learning_rate": 1.5955006248886519e-06, + "loss": 1.386, + "step": 9488 + }, + { + "epoch": 1.637022341067886, + "grad_norm": 0.65234375, + "learning_rate": 1.5940259794497614e-06, + "loss": 1.3809, + "step": 9489 + }, + { + "epoch": 1.6371948589666179, + "grad_norm": 0.56640625, + "learning_rate": 1.5925519567779502e-06, + "loss": 1.5296, + "step": 9490 + }, + { + "epoch": 1.6373673768653498, + "grad_norm": 0.59375, + "learning_rate": 1.5910785569824217e-06, + "loss": 1.4201, + "step": 9491 + }, + { + "epoch": 1.6375398947640818, + "grad_norm": 0.546875, + "learning_rate": 1.5896057801723352e-06, + "loss": 1.467, + "step": 9492 + }, + { + "epoch": 1.6377124126628138, + "grad_norm": 0.74609375, + "learning_rate": 1.5881336264568037e-06, + "loss": 1.4942, + "step": 9493 + }, + { + "epoch": 1.6378849305615457, + "grad_norm": 0.5859375, + "learning_rate": 1.586662095944892e-06, + "loss": 1.4433, + "step": 9494 + }, + { + "epoch": 1.6380574484602777, + "grad_norm": 0.5859375, + "learning_rate": 1.585191188745624e-06, + "loss": 1.3434, + "step": 9495 + }, + { + "epoch": 1.6382299663590096, + "grad_norm": 0.58203125, + "learning_rate": 1.5837209049679657e-06, + "loss": 1.5629, + "step": 9496 + }, + { + "epoch": 1.6384024842577416, + "grad_norm": 0.56640625, + "learning_rate": 1.582251244720855e-06, + "loss": 1.4401, + "step": 9497 + }, + { + "epoch": 1.6385750021564738, + "grad_norm": 0.67578125, + "learning_rate": 1.5807822081131664e-06, + "loss": 1.4043, + "step": 9498 + }, + { + "epoch": 1.6387475200552057, + "grad_norm": 0.71484375, + "learning_rate": 1.5793137952537374e-06, + "loss": 1.4337, + "step": 9499 + }, + { + "epoch": 1.6389200379539377, + "grad_norm": 0.578125, + "learning_rate": 1.5778460062513578e-06, + "loss": 1.4094, + "step": 9500 + }, + { + "epoch": 1.6389200379539377, + "eval_loss": 1.4071273803710938, + "eval_runtime": 11.0018, + "eval_samples_per_second": 93.076, + "eval_steps_per_second": 23.269, + "step": 9500 + }, + { + "epoch": 1.6390925558526697, + "grad_norm": 0.58203125, + "learning_rate": 1.5763788412147695e-06, + "loss": 1.3746, + "step": 9501 + }, + { + "epoch": 1.6392650737514018, + "grad_norm": 0.59375, + "learning_rate": 1.57491230025267e-06, + "loss": 1.4003, + "step": 9502 + }, + { + "epoch": 1.6394375916501338, + "grad_norm": 0.6171875, + "learning_rate": 1.5734463834737102e-06, + "loss": 1.4765, + "step": 9503 + }, + { + "epoch": 1.6396101095488658, + "grad_norm": 0.62109375, + "learning_rate": 1.5719810909864941e-06, + "loss": 1.4375, + "step": 9504 + }, + { + "epoch": 1.6397826274475977, + "grad_norm": 0.640625, + "learning_rate": 1.5705164228995783e-06, + "loss": 1.4209, + "step": 9505 + }, + { + "epoch": 1.6399551453463297, + "grad_norm": 0.66015625, + "learning_rate": 1.5690523793214785e-06, + "loss": 1.4444, + "step": 9506 + }, + { + "epoch": 1.6401276632450617, + "grad_norm": 0.5625, + "learning_rate": 1.567588960360653e-06, + "loss": 1.4155, + "step": 9507 + }, + { + "epoch": 1.6403001811437936, + "grad_norm": 0.59765625, + "learning_rate": 1.56612616612553e-06, + "loss": 1.5217, + "step": 9508 + }, + { + "epoch": 1.6404726990425256, + "grad_norm": 0.58203125, + "learning_rate": 1.5646639967244758e-06, + "loss": 1.5132, + "step": 9509 + }, + { + "epoch": 1.6406452169412575, + "grad_norm": 0.5625, + "learning_rate": 1.5632024522658185e-06, + "loss": 1.3342, + "step": 9510 + }, + { + "epoch": 1.6408177348399895, + "grad_norm": 0.640625, + "learning_rate": 1.56174153285784e-06, + "loss": 1.457, + "step": 9511 + }, + { + "epoch": 1.6409902527387217, + "grad_norm": 0.56640625, + "learning_rate": 1.560281238608773e-06, + "loss": 1.3921, + "step": 9512 + }, + { + "epoch": 1.6411627706374536, + "grad_norm": 0.5625, + "learning_rate": 1.558821569626805e-06, + "loss": 1.4043, + "step": 9513 + }, + { + "epoch": 1.6413352885361856, + "grad_norm": 0.56640625, + "learning_rate": 1.557362526020081e-06, + "loss": 1.4605, + "step": 9514 + }, + { + "epoch": 1.6415078064349178, + "grad_norm": 0.578125, + "learning_rate": 1.5559041078966885e-06, + "loss": 1.3708, + "step": 9515 + }, + { + "epoch": 1.6416803243336497, + "grad_norm": 0.6640625, + "learning_rate": 1.554446315364685e-06, + "loss": 1.35, + "step": 9516 + }, + { + "epoch": 1.6418528422323817, + "grad_norm": 0.5859375, + "learning_rate": 1.552989148532067e-06, + "loss": 1.529, + "step": 9517 + }, + { + "epoch": 1.6420253601311137, + "grad_norm": 0.59765625, + "learning_rate": 1.5515326075067916e-06, + "loss": 1.3917, + "step": 9518 + }, + { + "epoch": 1.6421978780298456, + "grad_norm": 0.5859375, + "learning_rate": 1.5500766923967692e-06, + "loss": 1.4447, + "step": 9519 + }, + { + "epoch": 1.6423703959285776, + "grad_norm": 0.5859375, + "learning_rate": 1.5486214033098613e-06, + "loss": 1.3875, + "step": 9520 + }, + { + "epoch": 1.6425429138273095, + "grad_norm": 0.6171875, + "learning_rate": 1.5471667403538904e-06, + "loss": 1.5287, + "step": 9521 + }, + { + "epoch": 1.6427154317260415, + "grad_norm": 0.609375, + "learning_rate": 1.5457127036366216e-06, + "loss": 1.3733, + "step": 9522 + }, + { + "epoch": 1.6428879496247735, + "grad_norm": 0.56640625, + "learning_rate": 1.5442592932657797e-06, + "loss": 1.3913, + "step": 9523 + }, + { + "epoch": 1.6430604675235054, + "grad_norm": 0.609375, + "learning_rate": 1.5428065093490441e-06, + "loss": 1.3989, + "step": 9524 + }, + { + "epoch": 1.6432329854222374, + "grad_norm": 0.60546875, + "learning_rate": 1.5413543519940443e-06, + "loss": 1.3217, + "step": 9525 + }, + { + "epoch": 1.6434055033209696, + "grad_norm": 0.578125, + "learning_rate": 1.5399028213083666e-06, + "loss": 1.3971, + "step": 9526 + }, + { + "epoch": 1.6435780212197015, + "grad_norm": 0.59375, + "learning_rate": 1.5384519173995515e-06, + "loss": 1.4748, + "step": 9527 + }, + { + "epoch": 1.6437505391184335, + "grad_norm": 0.56640625, + "learning_rate": 1.5370016403750843e-06, + "loss": 1.3221, + "step": 9528 + }, + { + "epoch": 1.6439230570171657, + "grad_norm": 0.55078125, + "learning_rate": 1.5355519903424187e-06, + "loss": 1.4242, + "step": 9529 + }, + { + "epoch": 1.6440955749158976, + "grad_norm": 0.59375, + "learning_rate": 1.534102967408948e-06, + "loss": 1.4049, + "step": 9530 + }, + { + "epoch": 1.6442680928146296, + "grad_norm": 0.62109375, + "learning_rate": 1.5326545716820273e-06, + "loss": 1.2972, + "step": 9531 + }, + { + "epoch": 1.6444406107133616, + "grad_norm": 0.578125, + "learning_rate": 1.531206803268962e-06, + "loss": 1.4253, + "step": 9532 + }, + { + "epoch": 1.6446131286120935, + "grad_norm": 0.5859375, + "learning_rate": 1.5297596622770117e-06, + "loss": 1.4365, + "step": 9533 + }, + { + "epoch": 1.6447856465108255, + "grad_norm": 0.61328125, + "learning_rate": 1.5283131488133906e-06, + "loss": 1.3451, + "step": 9534 + }, + { + "epoch": 1.6449581644095574, + "grad_norm": 0.6015625, + "learning_rate": 1.5268672629852655e-06, + "loss": 1.4688, + "step": 9535 + }, + { + "epoch": 1.6451306823082894, + "grad_norm": 0.5859375, + "learning_rate": 1.5254220048997548e-06, + "loss": 1.4259, + "step": 9536 + }, + { + "epoch": 1.6453032002070214, + "grad_norm": 0.5703125, + "learning_rate": 1.523977374663934e-06, + "loss": 1.3756, + "step": 9537 + }, + { + "epoch": 1.6454757181057533, + "grad_norm": 0.55859375, + "learning_rate": 1.5225333723848335e-06, + "loss": 1.4103, + "step": 9538 + }, + { + "epoch": 1.6456482360044855, + "grad_norm": 0.62890625, + "learning_rate": 1.5210899981694238e-06, + "loss": 1.5031, + "step": 9539 + }, + { + "epoch": 1.6458207539032175, + "grad_norm": 0.5703125, + "learning_rate": 1.5196472521246518e-06, + "loss": 1.405, + "step": 9540 + }, + { + "epoch": 1.6459932718019494, + "grad_norm": 0.6015625, + "learning_rate": 1.5182051343573968e-06, + "loss": 1.507, + "step": 9541 + }, + { + "epoch": 1.6461657897006814, + "grad_norm": 0.57421875, + "learning_rate": 1.5167636449745015e-06, + "loss": 1.4083, + "step": 9542 + }, + { + "epoch": 1.6463383075994136, + "grad_norm": 0.5546875, + "learning_rate": 1.515322784082761e-06, + "loss": 1.3217, + "step": 9543 + }, + { + "epoch": 1.6465108254981455, + "grad_norm": 0.56640625, + "learning_rate": 1.5138825517889232e-06, + "loss": 1.4405, + "step": 9544 + }, + { + "epoch": 1.6466833433968775, + "grad_norm": 0.6328125, + "learning_rate": 1.5124429481996904e-06, + "loss": 1.3475, + "step": 9545 + }, + { + "epoch": 1.6468558612956095, + "grad_norm": 0.609375, + "learning_rate": 1.5110039734217186e-06, + "loss": 1.4315, + "step": 9546 + }, + { + "epoch": 1.6470283791943414, + "grad_norm": 0.5703125, + "learning_rate": 1.50956562756161e-06, + "loss": 1.3847, + "step": 9547 + }, + { + "epoch": 1.6472008970930734, + "grad_norm": 0.54296875, + "learning_rate": 1.508127910725934e-06, + "loss": 1.362, + "step": 9548 + }, + { + "epoch": 1.6473734149918053, + "grad_norm": 0.57421875, + "learning_rate": 1.5066908230212008e-06, + "loss": 1.448, + "step": 9549 + }, + { + "epoch": 1.6475459328905373, + "grad_norm": 0.62890625, + "learning_rate": 1.5052543645538798e-06, + "loss": 1.5037, + "step": 9550 + }, + { + "epoch": 1.6477184507892693, + "grad_norm": 0.58203125, + "learning_rate": 1.5038185354303924e-06, + "loss": 1.4538, + "step": 9551 + }, + { + "epoch": 1.6478909686880012, + "grad_norm": 0.5859375, + "learning_rate": 1.5023833357571128e-06, + "loss": 1.4369, + "step": 9552 + }, + { + "epoch": 1.6480634865867334, + "grad_norm": 0.59765625, + "learning_rate": 1.5009487656403765e-06, + "loss": 1.385, + "step": 9553 + }, + { + "epoch": 1.6482360044854654, + "grad_norm": 0.56640625, + "learning_rate": 1.4995148251864566e-06, + "loss": 1.331, + "step": 9554 + }, + { + "epoch": 1.6484085223841973, + "grad_norm": 0.5703125, + "learning_rate": 1.4980815145015925e-06, + "loss": 1.3528, + "step": 9555 + }, + { + "epoch": 1.6485810402829295, + "grad_norm": 0.6640625, + "learning_rate": 1.4966488336919728e-06, + "loss": 1.3618, + "step": 9556 + }, + { + "epoch": 1.6487535581816615, + "grad_norm": 0.6015625, + "learning_rate": 1.4952167828637377e-06, + "loss": 1.3758, + "step": 9557 + }, + { + "epoch": 1.6489260760803934, + "grad_norm": 0.58984375, + "learning_rate": 1.4937853621229848e-06, + "loss": 1.5034, + "step": 9558 + }, + { + "epoch": 1.6490985939791254, + "grad_norm": 0.60546875, + "learning_rate": 1.4923545715757625e-06, + "loss": 1.445, + "step": 9559 + }, + { + "epoch": 1.6492711118778574, + "grad_norm": 0.55078125, + "learning_rate": 1.4909244113280685e-06, + "loss": 1.4184, + "step": 9560 + }, + { + "epoch": 1.6494436297765893, + "grad_norm": 0.6015625, + "learning_rate": 1.4894948814858656e-06, + "loss": 1.4357, + "step": 9561 + }, + { + "epoch": 1.6496161476753213, + "grad_norm": 0.67578125, + "learning_rate": 1.4880659821550547e-06, + "loss": 1.4171, + "step": 9562 + }, + { + "epoch": 1.6497886655740532, + "grad_norm": 0.59375, + "learning_rate": 1.4866377134415022e-06, + "loss": 1.4275, + "step": 9563 + }, + { + "epoch": 1.6499611834727852, + "grad_norm": 0.61328125, + "learning_rate": 1.4852100754510213e-06, + "loss": 1.4104, + "step": 9564 + }, + { + "epoch": 1.6501337013715172, + "grad_norm": 0.73046875, + "learning_rate": 1.4837830682893806e-06, + "loss": 1.4573, + "step": 9565 + }, + { + "epoch": 1.6503062192702493, + "grad_norm": 0.56640625, + "learning_rate": 1.4823566920623022e-06, + "loss": 1.3716, + "step": 9566 + }, + { + "epoch": 1.6504787371689813, + "grad_norm": 0.546875, + "learning_rate": 1.4809309468754608e-06, + "loss": 1.3767, + "step": 9567 + }, + { + "epoch": 1.6506512550677133, + "grad_norm": 0.5859375, + "learning_rate": 1.4795058328344847e-06, + "loss": 1.4028, + "step": 9568 + }, + { + "epoch": 1.6508237729664452, + "grad_norm": 0.5625, + "learning_rate": 1.4780813500449541e-06, + "loss": 1.3859, + "step": 9569 + }, + { + "epoch": 1.6509962908651774, + "grad_norm": 0.56640625, + "learning_rate": 1.4766574986124082e-06, + "loss": 1.4154, + "step": 9570 + }, + { + "epoch": 1.6511688087639094, + "grad_norm": 0.5625, + "learning_rate": 1.4752342786423257e-06, + "loss": 1.4186, + "step": 9571 + }, + { + "epoch": 1.6513413266626413, + "grad_norm": 0.59765625, + "learning_rate": 1.4738116902401567e-06, + "loss": 1.3528, + "step": 9572 + }, + { + "epoch": 1.6515138445613733, + "grad_norm": 0.5859375, + "learning_rate": 1.4723897335112913e-06, + "loss": 1.397, + "step": 9573 + }, + { + "epoch": 1.6516863624601053, + "grad_norm": 0.546875, + "learning_rate": 1.4709684085610765e-06, + "loss": 1.3941, + "step": 9574 + }, + { + "epoch": 1.6518588803588372, + "grad_norm": 0.546875, + "learning_rate": 1.4695477154948134e-06, + "loss": 1.479, + "step": 9575 + }, + { + "epoch": 1.6520313982575692, + "grad_norm": 0.61328125, + "learning_rate": 1.468127654417757e-06, + "loss": 1.3031, + "step": 9576 + }, + { + "epoch": 1.6522039161563011, + "grad_norm": 0.59375, + "learning_rate": 1.4667082254351127e-06, + "loss": 1.4715, + "step": 9577 + }, + { + "epoch": 1.652376434055033, + "grad_norm": 0.59765625, + "learning_rate": 1.4652894286520446e-06, + "loss": 1.4589, + "step": 9578 + }, + { + "epoch": 1.652548951953765, + "grad_norm": 0.5859375, + "learning_rate": 1.4638712641736585e-06, + "loss": 1.3271, + "step": 9579 + }, + { + "epoch": 1.6527214698524972, + "grad_norm": 0.625, + "learning_rate": 1.4624537321050292e-06, + "loss": 1.4969, + "step": 9580 + }, + { + "epoch": 1.6528939877512292, + "grad_norm": 0.62109375, + "learning_rate": 1.4610368325511693e-06, + "loss": 1.3658, + "step": 9581 + }, + { + "epoch": 1.6530665056499612, + "grad_norm": 0.60546875, + "learning_rate": 1.4596205656170581e-06, + "loss": 1.4521, + "step": 9582 + }, + { + "epoch": 1.6532390235486933, + "grad_norm": 0.5546875, + "learning_rate": 1.4582049314076163e-06, + "loss": 1.4213, + "step": 9583 + }, + { + "epoch": 1.6534115414474253, + "grad_norm": 0.5859375, + "learning_rate": 1.4567899300277222e-06, + "loss": 1.4668, + "step": 9584 + }, + { + "epoch": 1.6535840593461573, + "grad_norm": 0.59765625, + "learning_rate": 1.4553755615822151e-06, + "loss": 1.4904, + "step": 9585 + }, + { + "epoch": 1.6537565772448892, + "grad_norm": 0.62890625, + "learning_rate": 1.4539618261758737e-06, + "loss": 1.3653, + "step": 9586 + }, + { + "epoch": 1.6539290951436212, + "grad_norm": 0.578125, + "learning_rate": 1.4525487239134373e-06, + "loss": 1.333, + "step": 9587 + }, + { + "epoch": 1.6541016130423531, + "grad_norm": 0.609375, + "learning_rate": 1.4511362548996e-06, + "loss": 1.519, + "step": 9588 + }, + { + "epoch": 1.654274130941085, + "grad_norm": 0.60546875, + "learning_rate": 1.4497244192390037e-06, + "loss": 1.3489, + "step": 9589 + }, + { + "epoch": 1.654446648839817, + "grad_norm": 0.578125, + "learning_rate": 1.4483132170362468e-06, + "loss": 1.4774, + "step": 9590 + }, + { + "epoch": 1.654619166738549, + "grad_norm": 0.71875, + "learning_rate": 1.4469026483958837e-06, + "loss": 1.3916, + "step": 9591 + }, + { + "epoch": 1.654791684637281, + "grad_norm": 0.6015625, + "learning_rate": 1.4454927134224085e-06, + "loss": 1.4945, + "step": 9592 + }, + { + "epoch": 1.654964202536013, + "grad_norm": 0.5703125, + "learning_rate": 1.4440834122202896e-06, + "loss": 1.4431, + "step": 9593 + }, + { + "epoch": 1.6551367204347451, + "grad_norm": 0.5703125, + "learning_rate": 1.4426747448939294e-06, + "loss": 1.4583, + "step": 9594 + }, + { + "epoch": 1.655309238333477, + "grad_norm": 0.58203125, + "learning_rate": 1.4412667115476918e-06, + "loss": 1.3336, + "step": 9595 + }, + { + "epoch": 1.655481756232209, + "grad_norm": 0.56640625, + "learning_rate": 1.439859312285894e-06, + "loss": 1.387, + "step": 9596 + }, + { + "epoch": 1.6556542741309412, + "grad_norm": 0.5625, + "learning_rate": 1.4384525472128052e-06, + "loss": 1.4272, + "step": 9597 + }, + { + "epoch": 1.6558267920296732, + "grad_norm": 0.57421875, + "learning_rate": 1.4370464164326458e-06, + "loss": 1.4258, + "step": 9598 + }, + { + "epoch": 1.6559993099284052, + "grad_norm": 0.6015625, + "learning_rate": 1.4356409200495924e-06, + "loss": 1.4453, + "step": 9599 + }, + { + "epoch": 1.6561718278271371, + "grad_norm": 0.5625, + "learning_rate": 1.4342360581677717e-06, + "loss": 1.4543, + "step": 9600 + }, + { + "epoch": 1.6561718278271371, + "eval_loss": 1.4071323871612549, + "eval_runtime": 10.9314, + "eval_samples_per_second": 93.675, + "eval_steps_per_second": 23.419, + "step": 9600 + }, + { + "epoch": 1.656344345725869, + "grad_norm": 0.62890625, + "learning_rate": 1.4328318308912647e-06, + "loss": 1.3847, + "step": 9601 + }, + { + "epoch": 1.656516863624601, + "grad_norm": 0.81640625, + "learning_rate": 1.4314282383241097e-06, + "loss": 1.4592, + "step": 9602 + }, + { + "epoch": 1.656689381523333, + "grad_norm": 0.6015625, + "learning_rate": 1.430025280570284e-06, + "loss": 1.5076, + "step": 9603 + }, + { + "epoch": 1.656861899422065, + "grad_norm": 0.5625, + "learning_rate": 1.4286229577337373e-06, + "loss": 1.3705, + "step": 9604 + }, + { + "epoch": 1.657034417320797, + "grad_norm": 0.59375, + "learning_rate": 1.4272212699183574e-06, + "loss": 1.4033, + "step": 9605 + }, + { + "epoch": 1.6572069352195289, + "grad_norm": 0.54296875, + "learning_rate": 1.42582021722799e-06, + "loss": 1.4395, + "step": 9606 + }, + { + "epoch": 1.657379453118261, + "grad_norm": 0.55078125, + "learning_rate": 1.4244197997664354e-06, + "loss": 1.3369, + "step": 9607 + }, + { + "epoch": 1.657551971016993, + "grad_norm": 0.58984375, + "learning_rate": 1.4230200176374442e-06, + "loss": 1.4236, + "step": 9608 + }, + { + "epoch": 1.657724488915725, + "grad_norm": 0.5625, + "learning_rate": 1.421620870944722e-06, + "loss": 1.4639, + "step": 9609 + }, + { + "epoch": 1.657897006814457, + "grad_norm": 1.046875, + "learning_rate": 1.4202223597919285e-06, + "loss": 1.4062, + "step": 9610 + }, + { + "epoch": 1.6580695247131891, + "grad_norm": 0.6328125, + "learning_rate": 1.4188244842826659e-06, + "loss": 1.4875, + "step": 9611 + }, + { + "epoch": 1.658242042611921, + "grad_norm": 0.625, + "learning_rate": 1.4174272445205084e-06, + "loss": 1.3593, + "step": 9612 + }, + { + "epoch": 1.658414560510653, + "grad_norm": 0.59765625, + "learning_rate": 1.416030640608963e-06, + "loss": 1.4083, + "step": 9613 + }, + { + "epoch": 1.658587078409385, + "grad_norm": 0.5390625, + "learning_rate": 1.414634672651506e-06, + "loss": 1.391, + "step": 9614 + }, + { + "epoch": 1.658759596308117, + "grad_norm": 0.61328125, + "learning_rate": 1.4132393407515555e-06, + "loss": 1.5134, + "step": 9615 + }, + { + "epoch": 1.658932114206849, + "grad_norm": 0.62890625, + "learning_rate": 1.4118446450124834e-06, + "loss": 1.3834, + "step": 9616 + }, + { + "epoch": 1.659104632105581, + "grad_norm": 0.578125, + "learning_rate": 1.4104505855376271e-06, + "loss": 1.4039, + "step": 9617 + }, + { + "epoch": 1.6592771500043129, + "grad_norm": 0.65234375, + "learning_rate": 1.4090571624302585e-06, + "loss": 1.5073, + "step": 9618 + }, + { + "epoch": 1.6594496679030448, + "grad_norm": 1.6015625, + "learning_rate": 1.4076643757936147e-06, + "loss": 1.5064, + "step": 9619 + }, + { + "epoch": 1.6596221858017768, + "grad_norm": 1.421875, + "learning_rate": 1.4062722257308803e-06, + "loss": 1.3995, + "step": 9620 + }, + { + "epoch": 1.659794703700509, + "grad_norm": 0.59765625, + "learning_rate": 1.4048807123451958e-06, + "loss": 1.4615, + "step": 9621 + }, + { + "epoch": 1.659967221599241, + "grad_norm": 0.62890625, + "learning_rate": 1.4034898357396532e-06, + "loss": 1.406, + "step": 9622 + }, + { + "epoch": 1.6601397394979729, + "grad_norm": 0.5625, + "learning_rate": 1.4020995960173001e-06, + "loss": 1.4212, + "step": 9623 + }, + { + "epoch": 1.660312257396705, + "grad_norm": 0.70703125, + "learning_rate": 1.4007099932811253e-06, + "loss": 1.4811, + "step": 9624 + }, + { + "epoch": 1.660484775295437, + "grad_norm": 0.59375, + "learning_rate": 1.3993210276340895e-06, + "loss": 1.4382, + "step": 9625 + }, + { + "epoch": 1.660657293194169, + "grad_norm": 0.58984375, + "learning_rate": 1.3979326991790898e-06, + "loss": 1.3531, + "step": 9626 + }, + { + "epoch": 1.660829811092901, + "grad_norm": 0.58984375, + "learning_rate": 1.3965450080189836e-06, + "loss": 1.4453, + "step": 9627 + }, + { + "epoch": 1.661002328991633, + "grad_norm": 0.6015625, + "learning_rate": 1.3951579542565808e-06, + "loss": 1.4147, + "step": 9628 + }, + { + "epoch": 1.6611748468903649, + "grad_norm": 0.56640625, + "learning_rate": 1.3937715379946414e-06, + "loss": 1.3453, + "step": 9629 + }, + { + "epoch": 1.6613473647890968, + "grad_norm": 0.62890625, + "learning_rate": 1.3923857593358813e-06, + "loss": 1.4904, + "step": 9630 + }, + { + "epoch": 1.6615198826878288, + "grad_norm": 0.57421875, + "learning_rate": 1.3910006183829671e-06, + "loss": 1.3955, + "step": 9631 + }, + { + "epoch": 1.6616924005865608, + "grad_norm": 0.58203125, + "learning_rate": 1.3896161152385178e-06, + "loss": 1.3752, + "step": 9632 + }, + { + "epoch": 1.6618649184852927, + "grad_norm": 0.66015625, + "learning_rate": 1.3882322500051072e-06, + "loss": 1.4852, + "step": 9633 + }, + { + "epoch": 1.6620374363840247, + "grad_norm": 0.58203125, + "learning_rate": 1.3868490227852626e-06, + "loss": 1.305, + "step": 9634 + }, + { + "epoch": 1.6622099542827569, + "grad_norm": 0.578125, + "learning_rate": 1.385466433681456e-06, + "loss": 1.3817, + "step": 9635 + }, + { + "epoch": 1.6623824721814888, + "grad_norm": 0.9609375, + "learning_rate": 1.3840844827961263e-06, + "loss": 1.4308, + "step": 9636 + }, + { + "epoch": 1.6625549900802208, + "grad_norm": 0.5859375, + "learning_rate": 1.3827031702316495e-06, + "loss": 1.4287, + "step": 9637 + }, + { + "epoch": 1.662727507978953, + "grad_norm": 0.640625, + "learning_rate": 1.381322496090367e-06, + "loss": 1.4174, + "step": 9638 + }, + { + "epoch": 1.662900025877685, + "grad_norm": 0.55078125, + "learning_rate": 1.3799424604745648e-06, + "loss": 1.4176, + "step": 9639 + }, + { + "epoch": 1.663072543776417, + "grad_norm": 0.55078125, + "learning_rate": 1.3785630634864855e-06, + "loss": 1.4426, + "step": 9640 + }, + { + "epoch": 1.6632450616751489, + "grad_norm": 0.61328125, + "learning_rate": 1.377184305228324e-06, + "loss": 1.4198, + "step": 9641 + }, + { + "epoch": 1.6634175795738808, + "grad_norm": 0.57421875, + "learning_rate": 1.3758061858022276e-06, + "loss": 1.3411, + "step": 9642 + }, + { + "epoch": 1.6635900974726128, + "grad_norm": 0.58203125, + "learning_rate": 1.3744287053102944e-06, + "loss": 1.4388, + "step": 9643 + }, + { + "epoch": 1.6637626153713447, + "grad_norm": 0.578125, + "learning_rate": 1.3730518638545809e-06, + "loss": 1.4606, + "step": 9644 + }, + { + "epoch": 1.6639351332700767, + "grad_norm": 0.546875, + "learning_rate": 1.3716756615370842e-06, + "loss": 1.3962, + "step": 9645 + }, + { + "epoch": 1.6641076511688087, + "grad_norm": 0.61328125, + "learning_rate": 1.3703000984597714e-06, + "loss": 1.3739, + "step": 9646 + }, + { + "epoch": 1.6642801690675406, + "grad_norm": 0.59765625, + "learning_rate": 1.3689251747245458e-06, + "loss": 1.4007, + "step": 9647 + }, + { + "epoch": 1.6644526869662728, + "grad_norm": 0.59765625, + "learning_rate": 1.3675508904332703e-06, + "loss": 1.3208, + "step": 9648 + }, + { + "epoch": 1.6646252048650048, + "grad_norm": 0.5859375, + "learning_rate": 1.3661772456877675e-06, + "loss": 1.3696, + "step": 9649 + }, + { + "epoch": 1.6647977227637367, + "grad_norm": 0.609375, + "learning_rate": 1.3648042405897987e-06, + "loss": 1.3645, + "step": 9650 + }, + { + "epoch": 1.6649702406624687, + "grad_norm": 0.64453125, + "learning_rate": 1.3634318752410868e-06, + "loss": 1.4679, + "step": 9651 + }, + { + "epoch": 1.6651427585612009, + "grad_norm": 0.58203125, + "learning_rate": 1.362060149743305e-06, + "loss": 1.4031, + "step": 9652 + }, + { + "epoch": 1.6653152764599328, + "grad_norm": 0.5625, + "learning_rate": 1.3606890641980809e-06, + "loss": 1.4893, + "step": 9653 + }, + { + "epoch": 1.6654877943586648, + "grad_norm": 0.55859375, + "learning_rate": 1.3593186187069907e-06, + "loss": 1.4081, + "step": 9654 + }, + { + "epoch": 1.6656603122573967, + "grad_norm": 0.59375, + "learning_rate": 1.357948813371569e-06, + "loss": 1.3926, + "step": 9655 + }, + { + "epoch": 1.6658328301561287, + "grad_norm": 0.57421875, + "learning_rate": 1.3565796482932936e-06, + "loss": 1.3133, + "step": 9656 + }, + { + "epoch": 1.6660053480548607, + "grad_norm": 0.61328125, + "learning_rate": 1.3552111235736077e-06, + "loss": 1.3755, + "step": 9657 + }, + { + "epoch": 1.6661778659535926, + "grad_norm": 0.5546875, + "learning_rate": 1.3538432393138957e-06, + "loss": 1.4162, + "step": 9658 + }, + { + "epoch": 1.6663503838523246, + "grad_norm": 0.56640625, + "learning_rate": 1.3524759956155e-06, + "loss": 1.3238, + "step": 9659 + }, + { + "epoch": 1.6665229017510566, + "grad_norm": 0.5859375, + "learning_rate": 1.3511093925797136e-06, + "loss": 1.4309, + "step": 9660 + }, + { + "epoch": 1.6666954196497885, + "grad_norm": 0.6015625, + "learning_rate": 1.3497434303077861e-06, + "loss": 1.5292, + "step": 9661 + }, + { + "epoch": 1.6668679375485207, + "grad_norm": 0.54296875, + "learning_rate": 1.348378108900913e-06, + "loss": 1.3987, + "step": 9662 + }, + { + "epoch": 1.6670404554472527, + "grad_norm": 0.56640625, + "learning_rate": 1.3470134284602487e-06, + "loss": 1.4505, + "step": 9663 + }, + { + "epoch": 1.6672129733459846, + "grad_norm": 0.5625, + "learning_rate": 1.345649389086895e-06, + "loss": 1.4911, + "step": 9664 + }, + { + "epoch": 1.6673854912447168, + "grad_norm": 0.59375, + "learning_rate": 1.3442859908819095e-06, + "loss": 1.3742, + "step": 9665 + }, + { + "epoch": 1.6675580091434488, + "grad_norm": 0.59375, + "learning_rate": 1.342923233946304e-06, + "loss": 1.3935, + "step": 9666 + }, + { + "epoch": 1.6677305270421807, + "grad_norm": 0.5390625, + "learning_rate": 1.3415611183810329e-06, + "loss": 1.3412, + "step": 9667 + }, + { + "epoch": 1.6679030449409127, + "grad_norm": 0.58203125, + "learning_rate": 1.340199644287018e-06, + "loss": 1.3944, + "step": 9668 + }, + { + "epoch": 1.6680755628396446, + "grad_norm": 0.5546875, + "learning_rate": 1.3388388117651186e-06, + "loss": 1.3959, + "step": 9669 + }, + { + "epoch": 1.6682480807383766, + "grad_norm": 0.60546875, + "learning_rate": 1.3374786209161617e-06, + "loss": 1.4506, + "step": 9670 + }, + { + "epoch": 1.6684205986371086, + "grad_norm": 0.640625, + "learning_rate": 1.3361190718409123e-06, + "loss": 1.4873, + "step": 9671 + }, + { + "epoch": 1.6685931165358405, + "grad_norm": 0.6015625, + "learning_rate": 1.3347601646400965e-06, + "loss": 1.4537, + "step": 9672 + }, + { + "epoch": 1.6687656344345725, + "grad_norm": 0.62109375, + "learning_rate": 1.3334018994143916e-06, + "loss": 1.388, + "step": 9673 + }, + { + "epoch": 1.6689381523333044, + "grad_norm": 0.6796875, + "learning_rate": 1.3320442762644236e-06, + "loss": 1.4557, + "step": 9674 + }, + { + "epoch": 1.6691106702320364, + "grad_norm": 0.5703125, + "learning_rate": 1.3306872952907769e-06, + "loss": 1.4547, + "step": 9675 + }, + { + "epoch": 1.6692831881307686, + "grad_norm": 0.5703125, + "learning_rate": 1.329330956593985e-06, + "loss": 1.4649, + "step": 9676 + }, + { + "epoch": 1.6694557060295006, + "grad_norm": 0.59375, + "learning_rate": 1.3279752602745287e-06, + "loss": 1.4182, + "step": 9677 + }, + { + "epoch": 1.6696282239282325, + "grad_norm": 0.578125, + "learning_rate": 1.3266202064328548e-06, + "loss": 1.5169, + "step": 9678 + }, + { + "epoch": 1.6698007418269647, + "grad_norm": 0.67578125, + "learning_rate": 1.325265795169348e-06, + "loss": 1.4113, + "step": 9679 + }, + { + "epoch": 1.6699732597256967, + "grad_norm": 0.546875, + "learning_rate": 1.3239120265843508e-06, + "loss": 1.4695, + "step": 9680 + }, + { + "epoch": 1.6701457776244286, + "grad_norm": 0.53125, + "learning_rate": 1.3225589007781658e-06, + "loss": 1.4003, + "step": 9681 + }, + { + "epoch": 1.6703182955231606, + "grad_norm": 0.59375, + "learning_rate": 1.3212064178510342e-06, + "loss": 1.373, + "step": 9682 + }, + { + "epoch": 1.6704908134218925, + "grad_norm": 0.5859375, + "learning_rate": 1.319854577903159e-06, + "loss": 1.3848, + "step": 9683 + }, + { + "epoch": 1.6706633313206245, + "grad_norm": 0.578125, + "learning_rate": 1.3185033810346926e-06, + "loss": 1.4805, + "step": 9684 + }, + { + "epoch": 1.6708358492193565, + "grad_norm": 0.58984375, + "learning_rate": 1.3171528273457402e-06, + "loss": 1.4182, + "step": 9685 + }, + { + "epoch": 1.6710083671180884, + "grad_norm": 0.5625, + "learning_rate": 1.3158029169363595e-06, + "loss": 1.4491, + "step": 9686 + }, + { + "epoch": 1.6711808850168204, + "grad_norm": 0.6171875, + "learning_rate": 1.314453649906562e-06, + "loss": 1.4682, + "step": 9687 + }, + { + "epoch": 1.6713534029155523, + "grad_norm": 0.6015625, + "learning_rate": 1.313105026356304e-06, + "loss": 1.4939, + "step": 9688 + }, + { + "epoch": 1.6715259208142845, + "grad_norm": 0.58203125, + "learning_rate": 1.311757046385508e-06, + "loss": 1.348, + "step": 9689 + }, + { + "epoch": 1.6716984387130165, + "grad_norm": 0.60546875, + "learning_rate": 1.310409710094035e-06, + "loss": 1.5174, + "step": 9690 + }, + { + "epoch": 1.6718709566117484, + "grad_norm": 0.60546875, + "learning_rate": 1.3090630175817042e-06, + "loss": 1.4019, + "step": 9691 + }, + { + "epoch": 1.6720434745104804, + "grad_norm": 0.55859375, + "learning_rate": 1.3077169689482893e-06, + "loss": 1.3465, + "step": 9692 + }, + { + "epoch": 1.6722159924092126, + "grad_norm": 0.5859375, + "learning_rate": 1.3063715642935137e-06, + "loss": 1.4102, + "step": 9693 + }, + { + "epoch": 1.6723885103079446, + "grad_norm": 0.55078125, + "learning_rate": 1.3050268037170533e-06, + "loss": 1.3047, + "step": 9694 + }, + { + "epoch": 1.6725610282066765, + "grad_norm": 0.64453125, + "learning_rate": 1.303682687318537e-06, + "loss": 1.4796, + "step": 9695 + }, + { + "epoch": 1.6727335461054085, + "grad_norm": 0.62109375, + "learning_rate": 1.30233921519754e-06, + "loss": 1.42, + "step": 9696 + }, + { + "epoch": 1.6729060640041404, + "grad_norm": 0.55078125, + "learning_rate": 1.3009963874536025e-06, + "loss": 1.4029, + "step": 9697 + }, + { + "epoch": 1.6730785819028724, + "grad_norm": 0.59765625, + "learning_rate": 1.299654204186208e-06, + "loss": 1.4715, + "step": 9698 + }, + { + "epoch": 1.6732510998016044, + "grad_norm": 0.55859375, + "learning_rate": 1.2983126654947886e-06, + "loss": 1.3625, + "step": 9699 + }, + { + "epoch": 1.6734236177003363, + "grad_norm": 0.5859375, + "learning_rate": 1.2969717714787411e-06, + "loss": 1.4229, + "step": 9700 + }, + { + "epoch": 1.6734236177003363, + "eval_loss": 1.407073736190796, + "eval_runtime": 10.8926, + "eval_samples_per_second": 94.009, + "eval_steps_per_second": 23.502, + "step": 9700 + }, + { + "epoch": 1.6735961355990683, + "grad_norm": 0.63671875, + "learning_rate": 1.2956315222374006e-06, + "loss": 1.4324, + "step": 9701 + }, + { + "epoch": 1.6737686534978002, + "grad_norm": 0.58203125, + "learning_rate": 1.2942919178700674e-06, + "loss": 1.4398, + "step": 9702 + }, + { + "epoch": 1.6739411713965324, + "grad_norm": 0.65234375, + "learning_rate": 1.2929529584759836e-06, + "loss": 1.5529, + "step": 9703 + }, + { + "epoch": 1.6741136892952644, + "grad_norm": 0.5859375, + "learning_rate": 1.2916146441543487e-06, + "loss": 1.4504, + "step": 9704 + }, + { + "epoch": 1.6742862071939963, + "grad_norm": 0.58984375, + "learning_rate": 1.290276975004312e-06, + "loss": 1.3953, + "step": 9705 + }, + { + "epoch": 1.6744587250927285, + "grad_norm": 0.58984375, + "learning_rate": 1.28893995112498e-06, + "loss": 1.4363, + "step": 9706 + }, + { + "epoch": 1.6746312429914605, + "grad_norm": 0.58203125, + "learning_rate": 1.2876035726154046e-06, + "loss": 1.3643, + "step": 9707 + }, + { + "epoch": 1.6748037608901925, + "grad_norm": 0.64453125, + "learning_rate": 1.2862678395745954e-06, + "loss": 1.4318, + "step": 9708 + }, + { + "epoch": 1.6749762787889244, + "grad_norm": 0.55859375, + "learning_rate": 1.2849327521015076e-06, + "loss": 1.4568, + "step": 9709 + }, + { + "epoch": 1.6751487966876564, + "grad_norm": 0.640625, + "learning_rate": 1.283598310295059e-06, + "loss": 1.2942, + "step": 9710 + }, + { + "epoch": 1.6753213145863883, + "grad_norm": 0.59765625, + "learning_rate": 1.282264514254109e-06, + "loss": 1.3316, + "step": 9711 + }, + { + "epoch": 1.6754938324851203, + "grad_norm": 0.5625, + "learning_rate": 1.2809313640774723e-06, + "loss": 1.4064, + "step": 9712 + }, + { + "epoch": 1.6756663503838523, + "grad_norm": 0.5859375, + "learning_rate": 1.279598859863923e-06, + "loss": 1.4209, + "step": 9713 + }, + { + "epoch": 1.6758388682825842, + "grad_norm": 0.58984375, + "learning_rate": 1.2782670017121768e-06, + "loss": 1.3198, + "step": 9714 + }, + { + "epoch": 1.6760113861813162, + "grad_norm": 0.5859375, + "learning_rate": 1.2769357897209056e-06, + "loss": 1.467, + "step": 9715 + }, + { + "epoch": 1.6761839040800484, + "grad_norm": 0.58984375, + "learning_rate": 1.2756052239887362e-06, + "loss": 1.3825, + "step": 9716 + }, + { + "epoch": 1.6763564219787803, + "grad_norm": 0.55859375, + "learning_rate": 1.2742753046142442e-06, + "loss": 1.4707, + "step": 9717 + }, + { + "epoch": 1.6765289398775123, + "grad_norm": 0.5625, + "learning_rate": 1.2729460316959586e-06, + "loss": 1.4252, + "step": 9718 + }, + { + "epoch": 1.6767014577762442, + "grad_norm": 0.640625, + "learning_rate": 1.2716174053323628e-06, + "loss": 1.4965, + "step": 9719 + }, + { + "epoch": 1.6768739756749764, + "grad_norm": 0.6015625, + "learning_rate": 1.2702894256218823e-06, + "loss": 1.482, + "step": 9720 + }, + { + "epoch": 1.6770464935737084, + "grad_norm": 0.5859375, + "learning_rate": 1.2689620926629108e-06, + "loss": 1.3392, + "step": 9721 + }, + { + "epoch": 1.6772190114724403, + "grad_norm": 0.57421875, + "learning_rate": 1.26763540655378e-06, + "loss": 1.4667, + "step": 9722 + }, + { + "epoch": 1.6773915293711723, + "grad_norm": 0.59765625, + "learning_rate": 1.2663093673927796e-06, + "loss": 1.3934, + "step": 9723 + }, + { + "epoch": 1.6775640472699043, + "grad_norm": 0.66796875, + "learning_rate": 1.2649839752781522e-06, + "loss": 1.4194, + "step": 9724 + }, + { + "epoch": 1.6777365651686362, + "grad_norm": 0.59765625, + "learning_rate": 1.263659230308092e-06, + "loss": 1.5728, + "step": 9725 + }, + { + "epoch": 1.6779090830673682, + "grad_norm": 0.5546875, + "learning_rate": 1.262335132580742e-06, + "loss": 1.4046, + "step": 9726 + }, + { + "epoch": 1.6780816009661002, + "grad_norm": 0.56640625, + "learning_rate": 1.2610116821942032e-06, + "loss": 1.4356, + "step": 9727 + }, + { + "epoch": 1.6782541188648321, + "grad_norm": 0.6015625, + "learning_rate": 1.259688879246519e-06, + "loss": 1.4939, + "step": 9728 + }, + { + "epoch": 1.678426636763564, + "grad_norm": 0.5859375, + "learning_rate": 1.2583667238356956e-06, + "loss": 1.4155, + "step": 9729 + }, + { + "epoch": 1.6785991546622963, + "grad_norm": 0.640625, + "learning_rate": 1.2570452160596859e-06, + "loss": 1.3951, + "step": 9730 + }, + { + "epoch": 1.6787716725610282, + "grad_norm": 0.5546875, + "learning_rate": 1.2557243560163955e-06, + "loss": 1.419, + "step": 9731 + }, + { + "epoch": 1.6789441904597602, + "grad_norm": 0.57421875, + "learning_rate": 1.2544041438036836e-06, + "loss": 1.474, + "step": 9732 + }, + { + "epoch": 1.6791167083584921, + "grad_norm": 0.6953125, + "learning_rate": 1.2530845795193536e-06, + "loss": 1.3757, + "step": 9733 + }, + { + "epoch": 1.6792892262572243, + "grad_norm": 0.578125, + "learning_rate": 1.2517656632611753e-06, + "loss": 1.4638, + "step": 9734 + }, + { + "epoch": 1.6794617441559563, + "grad_norm": 0.58203125, + "learning_rate": 1.2504473951268559e-06, + "loss": 1.3717, + "step": 9735 + }, + { + "epoch": 1.6796342620546882, + "grad_norm": 0.62890625, + "learning_rate": 1.249129775214064e-06, + "loss": 1.3255, + "step": 9736 + }, + { + "epoch": 1.6798067799534202, + "grad_norm": 0.55859375, + "learning_rate": 1.2478128036204151e-06, + "loss": 1.3849, + "step": 9737 + }, + { + "epoch": 1.6799792978521522, + "grad_norm": 0.546875, + "learning_rate": 1.246496480443481e-06, + "loss": 1.4408, + "step": 9738 + }, + { + "epoch": 1.6801518157508841, + "grad_norm": 0.5546875, + "learning_rate": 1.2451808057807813e-06, + "loss": 1.4953, + "step": 9739 + }, + { + "epoch": 1.680324333649616, + "grad_norm": 0.55078125, + "learning_rate": 1.2438657797297927e-06, + "loss": 1.4011, + "step": 9740 + }, + { + "epoch": 1.680496851548348, + "grad_norm": 0.5703125, + "learning_rate": 1.2425514023879338e-06, + "loss": 1.3963, + "step": 9741 + }, + { + "epoch": 1.68066936944708, + "grad_norm": 0.65625, + "learning_rate": 1.24123767385259e-06, + "loss": 1.3753, + "step": 9742 + }, + { + "epoch": 1.680841887345812, + "grad_norm": 0.5859375, + "learning_rate": 1.2399245942210847e-06, + "loss": 1.3894, + "step": 9743 + }, + { + "epoch": 1.6810144052445442, + "grad_norm": 0.671875, + "learning_rate": 1.2386121635906978e-06, + "loss": 1.5035, + "step": 9744 + }, + { + "epoch": 1.6811869231432761, + "grad_norm": 0.6015625, + "learning_rate": 1.2373003820586715e-06, + "loss": 1.4379, + "step": 9745 + }, + { + "epoch": 1.681359441042008, + "grad_norm": 0.6171875, + "learning_rate": 1.2359892497221815e-06, + "loss": 1.4879, + "step": 9746 + }, + { + "epoch": 1.6815319589407403, + "grad_norm": 0.6015625, + "learning_rate": 1.2346787666783678e-06, + "loss": 1.434, + "step": 9747 + }, + { + "epoch": 1.6817044768394722, + "grad_norm": 0.62890625, + "learning_rate": 1.2333689330243204e-06, + "loss": 1.3453, + "step": 9748 + }, + { + "epoch": 1.6818769947382042, + "grad_norm": 0.5703125, + "learning_rate": 1.2320597488570774e-06, + "loss": 1.3516, + "step": 9749 + }, + { + "epoch": 1.6820495126369361, + "grad_norm": 0.84765625, + "learning_rate": 1.2307512142736344e-06, + "loss": 1.4055, + "step": 9750 + }, + { + "epoch": 1.682222030535668, + "grad_norm": 0.5703125, + "learning_rate": 1.2294433293709352e-06, + "loss": 1.4123, + "step": 9751 + }, + { + "epoch": 1.6823945484344, + "grad_norm": 0.5546875, + "learning_rate": 1.2281360942458708e-06, + "loss": 1.4203, + "step": 9752 + }, + { + "epoch": 1.682567066333132, + "grad_norm": 0.56640625, + "learning_rate": 1.2268295089952986e-06, + "loss": 1.4716, + "step": 9753 + }, + { + "epoch": 1.682739584231864, + "grad_norm": 0.5625, + "learning_rate": 1.225523573716012e-06, + "loss": 1.4118, + "step": 9754 + }, + { + "epoch": 1.682912102130596, + "grad_norm": 0.59375, + "learning_rate": 1.2242182885047638e-06, + "loss": 1.4714, + "step": 9755 + }, + { + "epoch": 1.683084620029328, + "grad_norm": 0.59765625, + "learning_rate": 1.2229136534582586e-06, + "loss": 1.4043, + "step": 9756 + }, + { + "epoch": 1.68325713792806, + "grad_norm": 0.58984375, + "learning_rate": 1.2216096686731515e-06, + "loss": 1.3979, + "step": 9757 + }, + { + "epoch": 1.683429655826792, + "grad_norm": 0.6015625, + "learning_rate": 1.2203063342460496e-06, + "loss": 1.2974, + "step": 9758 + }, + { + "epoch": 1.683602173725524, + "grad_norm": 0.60546875, + "learning_rate": 1.2190036502735158e-06, + "loss": 1.3953, + "step": 9759 + }, + { + "epoch": 1.683774691624256, + "grad_norm": 0.6015625, + "learning_rate": 1.2177016168520528e-06, + "loss": 1.4185, + "step": 9760 + }, + { + "epoch": 1.6839472095229882, + "grad_norm": 0.59765625, + "learning_rate": 1.216400234078131e-06, + "loss": 1.5038, + "step": 9761 + }, + { + "epoch": 1.6841197274217201, + "grad_norm": 0.55859375, + "learning_rate": 1.2150995020481616e-06, + "loss": 1.4263, + "step": 9762 + }, + { + "epoch": 1.684292245320452, + "grad_norm": 0.6015625, + "learning_rate": 1.2137994208585125e-06, + "loss": 1.4546, + "step": 9763 + }, + { + "epoch": 1.684464763219184, + "grad_norm": 0.59765625, + "learning_rate": 1.2124999906055024e-06, + "loss": 1.3523, + "step": 9764 + }, + { + "epoch": 1.684637281117916, + "grad_norm": 0.55859375, + "learning_rate": 1.2112012113853955e-06, + "loss": 1.2748, + "step": 9765 + }, + { + "epoch": 1.684809799016648, + "grad_norm": 0.64453125, + "learning_rate": 1.2099030832944224e-06, + "loss": 1.4521, + "step": 9766 + }, + { + "epoch": 1.68498231691538, + "grad_norm": 0.578125, + "learning_rate": 1.2086056064287498e-06, + "loss": 1.3665, + "step": 9767 + }, + { + "epoch": 1.6851548348141119, + "grad_norm": 0.578125, + "learning_rate": 1.2073087808845052e-06, + "loss": 1.433, + "step": 9768 + }, + { + "epoch": 1.6853273527128438, + "grad_norm": 0.56640625, + "learning_rate": 1.206012606757765e-06, + "loss": 1.4158, + "step": 9769 + }, + { + "epoch": 1.6854998706115758, + "grad_norm": 0.61328125, + "learning_rate": 1.2047170841445589e-06, + "loss": 1.47, + "step": 9770 + }, + { + "epoch": 1.685672388510308, + "grad_norm": 0.63671875, + "learning_rate": 1.2034222131408669e-06, + "loss": 1.4627, + "step": 9771 + }, + { + "epoch": 1.68584490640904, + "grad_norm": 0.59765625, + "learning_rate": 1.2021279938426223e-06, + "loss": 1.4755, + "step": 9772 + }, + { + "epoch": 1.686017424307772, + "grad_norm": 0.58984375, + "learning_rate": 1.2008344263457029e-06, + "loss": 1.4738, + "step": 9773 + }, + { + "epoch": 1.686189942206504, + "grad_norm": 0.59375, + "learning_rate": 1.199541510745954e-06, + "loss": 1.3455, + "step": 9774 + }, + { + "epoch": 1.686362460105236, + "grad_norm": 0.578125, + "learning_rate": 1.1982492471391549e-06, + "loss": 1.4601, + "step": 9775 + }, + { + "epoch": 1.686534978003968, + "grad_norm": 0.6015625, + "learning_rate": 1.1969576356210466e-06, + "loss": 1.4357, + "step": 9776 + }, + { + "epoch": 1.6867074959027, + "grad_norm": 0.578125, + "learning_rate": 1.1956666762873236e-06, + "loss": 1.4608, + "step": 9777 + }, + { + "epoch": 1.686880013801432, + "grad_norm": 0.58203125, + "learning_rate": 1.194376369233624e-06, + "loss": 1.4586, + "step": 9778 + }, + { + "epoch": 1.687052531700164, + "grad_norm": 0.61328125, + "learning_rate": 1.1930867145555424e-06, + "loss": 1.4273, + "step": 9779 + }, + { + "epoch": 1.6872250495988959, + "grad_norm": 0.59765625, + "learning_rate": 1.1917977123486258e-06, + "loss": 1.3533, + "step": 9780 + }, + { + "epoch": 1.6873975674976278, + "grad_norm": 0.5625, + "learning_rate": 1.1905093627083698e-06, + "loss": 1.3954, + "step": 9781 + }, + { + "epoch": 1.6875700853963598, + "grad_norm": 0.5859375, + "learning_rate": 1.1892216657302247e-06, + "loss": 1.4152, + "step": 9782 + }, + { + "epoch": 1.6877426032950917, + "grad_norm": 0.56640625, + "learning_rate": 1.1879346215095932e-06, + "loss": 1.3656, + "step": 9783 + }, + { + "epoch": 1.6879151211938237, + "grad_norm": 0.609375, + "learning_rate": 1.1866482301418213e-06, + "loss": 1.4807, + "step": 9784 + }, + { + "epoch": 1.6880876390925559, + "grad_norm": 0.5625, + "learning_rate": 1.1853624917222217e-06, + "loss": 1.3826, + "step": 9785 + }, + { + "epoch": 1.6882601569912878, + "grad_norm": 0.6484375, + "learning_rate": 1.1840774063460403e-06, + "loss": 1.4644, + "step": 9786 + }, + { + "epoch": 1.6884326748900198, + "grad_norm": 0.609375, + "learning_rate": 1.1827929741084931e-06, + "loss": 1.4286, + "step": 9787 + }, + { + "epoch": 1.688605192788752, + "grad_norm": 0.59765625, + "learning_rate": 1.1815091951047331e-06, + "loss": 1.4263, + "step": 9788 + }, + { + "epoch": 1.688777710687484, + "grad_norm": 0.59765625, + "learning_rate": 1.1802260694298717e-06, + "loss": 1.4465, + "step": 9789 + }, + { + "epoch": 1.688950228586216, + "grad_norm": 0.56640625, + "learning_rate": 1.178943597178972e-06, + "loss": 1.398, + "step": 9790 + }, + { + "epoch": 1.6891227464849479, + "grad_norm": 0.71875, + "learning_rate": 1.1776617784470469e-06, + "loss": 1.4605, + "step": 9791 + }, + { + "epoch": 1.6892952643836798, + "grad_norm": 0.58984375, + "learning_rate": 1.1763806133290623e-06, + "loss": 1.4018, + "step": 9792 + }, + { + "epoch": 1.6894677822824118, + "grad_norm": 0.58203125, + "learning_rate": 1.1751001019199337e-06, + "loss": 1.4055, + "step": 9793 + }, + { + "epoch": 1.6896403001811438, + "grad_norm": 0.5546875, + "learning_rate": 1.1738202443145307e-06, + "loss": 1.4657, + "step": 9794 + }, + { + "epoch": 1.6898128180798757, + "grad_norm": 0.59375, + "learning_rate": 1.172541040607672e-06, + "loss": 1.4122, + "step": 9795 + }, + { + "epoch": 1.6899853359786077, + "grad_norm": 0.78125, + "learning_rate": 1.1712624908941318e-06, + "loss": 1.3335, + "step": 9796 + }, + { + "epoch": 1.6901578538773396, + "grad_norm": 0.578125, + "learning_rate": 1.1699845952686273e-06, + "loss": 1.3836, + "step": 9797 + }, + { + "epoch": 1.6903303717760718, + "grad_norm": 0.56640625, + "learning_rate": 1.1687073538258398e-06, + "loss": 1.4557, + "step": 9798 + }, + { + "epoch": 1.6905028896748038, + "grad_norm": 0.625, + "learning_rate": 1.1674307666603901e-06, + "loss": 1.4109, + "step": 9799 + }, + { + "epoch": 1.6906754075735357, + "grad_norm": 0.6015625, + "learning_rate": 1.1661548338668572e-06, + "loss": 1.3921, + "step": 9800 + }, + { + "epoch": 1.6906754075735357, + "eval_loss": 1.4071011543273926, + "eval_runtime": 10.8438, + "eval_samples_per_second": 94.432, + "eval_steps_per_second": 23.608, + "step": 9800 + }, + { + "epoch": 1.6908479254722677, + "grad_norm": 0.57421875, + "learning_rate": 1.1648795555397719e-06, + "loss": 1.3999, + "step": 9801 + }, + { + "epoch": 1.6910204433709999, + "grad_norm": 0.6015625, + "learning_rate": 1.163604931773612e-06, + "loss": 1.5643, + "step": 9802 + }, + { + "epoch": 1.6911929612697318, + "grad_norm": 0.58984375, + "learning_rate": 1.1623309626628121e-06, + "loss": 1.4941, + "step": 9803 + }, + { + "epoch": 1.6913654791684638, + "grad_norm": 0.66015625, + "learning_rate": 1.1610576483017566e-06, + "loss": 1.5024, + "step": 9804 + }, + { + "epoch": 1.6915379970671958, + "grad_norm": 0.640625, + "learning_rate": 1.1597849887847746e-06, + "loss": 1.4233, + "step": 9805 + }, + { + "epoch": 1.6917105149659277, + "grad_norm": 0.53515625, + "learning_rate": 1.1585129842061605e-06, + "loss": 1.3673, + "step": 9806 + }, + { + "epoch": 1.6918830328646597, + "grad_norm": 0.578125, + "learning_rate": 1.1572416346601467e-06, + "loss": 1.4561, + "step": 9807 + }, + { + "epoch": 1.6920555507633916, + "grad_norm": 0.546875, + "learning_rate": 1.1559709402409236e-06, + "loss": 1.3763, + "step": 9808 + }, + { + "epoch": 1.6922280686621236, + "grad_norm": 0.5859375, + "learning_rate": 1.1547009010426368e-06, + "loss": 1.423, + "step": 9809 + }, + { + "epoch": 1.6924005865608556, + "grad_norm": 0.59375, + "learning_rate": 1.1534315171593736e-06, + "loss": 1.5238, + "step": 9810 + }, + { + "epoch": 1.6925731044595875, + "grad_norm": 0.625, + "learning_rate": 1.1521627886851794e-06, + "loss": 1.4278, + "step": 9811 + }, + { + "epoch": 1.6927456223583197, + "grad_norm": 0.6484375, + "learning_rate": 1.1508947157140493e-06, + "loss": 1.3774, + "step": 9812 + }, + { + "epoch": 1.6929181402570517, + "grad_norm": 0.61328125, + "learning_rate": 1.1496272983399303e-06, + "loss": 1.439, + "step": 9813 + }, + { + "epoch": 1.6930906581557836, + "grad_norm": 0.58984375, + "learning_rate": 1.1483605366567208e-06, + "loss": 1.3043, + "step": 9814 + }, + { + "epoch": 1.6932631760545158, + "grad_norm": 0.58203125, + "learning_rate": 1.147094430758272e-06, + "loss": 1.4669, + "step": 9815 + }, + { + "epoch": 1.6934356939532478, + "grad_norm": 0.57421875, + "learning_rate": 1.1458289807383804e-06, + "loss": 1.4208, + "step": 9816 + }, + { + "epoch": 1.6936082118519797, + "grad_norm": 0.5546875, + "learning_rate": 1.1445641866908042e-06, + "loss": 1.4478, + "step": 9817 + }, + { + "epoch": 1.6937807297507117, + "grad_norm": 0.59375, + "learning_rate": 1.1433000487092415e-06, + "loss": 1.4366, + "step": 9818 + }, + { + "epoch": 1.6939532476494437, + "grad_norm": 0.625, + "learning_rate": 1.1420365668873535e-06, + "loss": 1.5139, + "step": 9819 + }, + { + "epoch": 1.6941257655481756, + "grad_norm": 0.6484375, + "learning_rate": 1.1407737413187426e-06, + "loss": 1.4421, + "step": 9820 + }, + { + "epoch": 1.6942982834469076, + "grad_norm": 0.59765625, + "learning_rate": 1.1395115720969663e-06, + "loss": 1.3685, + "step": 9821 + }, + { + "epoch": 1.6944708013456395, + "grad_norm": 0.6328125, + "learning_rate": 1.1382500593155377e-06, + "loss": 1.443, + "step": 9822 + }, + { + "epoch": 1.6946433192443715, + "grad_norm": 0.546875, + "learning_rate": 1.1369892030679142e-06, + "loss": 1.4063, + "step": 9823 + }, + { + "epoch": 1.6948158371431035, + "grad_norm": 0.58203125, + "learning_rate": 1.1357290034475087e-06, + "loss": 1.3731, + "step": 9824 + }, + { + "epoch": 1.6949883550418354, + "grad_norm": 0.59375, + "learning_rate": 1.1344694605476859e-06, + "loss": 1.3671, + "step": 9825 + }, + { + "epoch": 1.6951608729405676, + "grad_norm": 0.5546875, + "learning_rate": 1.1332105744617605e-06, + "loss": 1.4597, + "step": 9826 + }, + { + "epoch": 1.6953333908392996, + "grad_norm": 0.58203125, + "learning_rate": 1.1319523452829973e-06, + "loss": 1.3508, + "step": 9827 + }, + { + "epoch": 1.6955059087380315, + "grad_norm": 0.609375, + "learning_rate": 1.1306947731046169e-06, + "loss": 1.4448, + "step": 9828 + }, + { + "epoch": 1.6956784266367637, + "grad_norm": 0.578125, + "learning_rate": 1.1294378580197806e-06, + "loss": 1.3837, + "step": 9829 + }, + { + "epoch": 1.6958509445354957, + "grad_norm": 0.5625, + "learning_rate": 1.1281816001216183e-06, + "loss": 1.3462, + "step": 9830 + }, + { + "epoch": 1.6960234624342276, + "grad_norm": 0.54296875, + "learning_rate": 1.1269259995031955e-06, + "loss": 1.4266, + "step": 9831 + }, + { + "epoch": 1.6961959803329596, + "grad_norm": 0.58203125, + "learning_rate": 1.1256710562575346e-06, + "loss": 1.4333, + "step": 9832 + }, + { + "epoch": 1.6963684982316916, + "grad_norm": 0.5625, + "learning_rate": 1.124416770477612e-06, + "loss": 1.3963, + "step": 9833 + }, + { + "epoch": 1.6965410161304235, + "grad_norm": 0.5546875, + "learning_rate": 1.1231631422563526e-06, + "loss": 1.4522, + "step": 9834 + }, + { + "epoch": 1.6967135340291555, + "grad_norm": 0.55078125, + "learning_rate": 1.1219101716866332e-06, + "loss": 1.3365, + "step": 9835 + }, + { + "epoch": 1.6968860519278874, + "grad_norm": 0.63671875, + "learning_rate": 1.1206578588612815e-06, + "loss": 1.4551, + "step": 9836 + }, + { + "epoch": 1.6970585698266194, + "grad_norm": 0.5546875, + "learning_rate": 1.1194062038730735e-06, + "loss": 1.5414, + "step": 9837 + }, + { + "epoch": 1.6972310877253514, + "grad_norm": 0.59375, + "learning_rate": 1.118155206814746e-06, + "loss": 1.4245, + "step": 9838 + }, + { + "epoch": 1.6974036056240835, + "grad_norm": 0.6328125, + "learning_rate": 1.1169048677789751e-06, + "loss": 1.4564, + "step": 9839 + }, + { + "epoch": 1.6975761235228155, + "grad_norm": 0.734375, + "learning_rate": 1.1156551868583942e-06, + "loss": 1.4433, + "step": 9840 + }, + { + "epoch": 1.6977486414215475, + "grad_norm": 0.6171875, + "learning_rate": 1.1144061641455927e-06, + "loss": 1.4318, + "step": 9841 + }, + { + "epoch": 1.6979211593202794, + "grad_norm": 0.60546875, + "learning_rate": 1.1131577997331e-06, + "loss": 1.4245, + "step": 9842 + }, + { + "epoch": 1.6980936772190116, + "grad_norm": 0.57421875, + "learning_rate": 1.1119100937134052e-06, + "loss": 1.5177, + "step": 9843 + }, + { + "epoch": 1.6982661951177436, + "grad_norm": 0.55859375, + "learning_rate": 1.1106630461789459e-06, + "loss": 1.4323, + "step": 9844 + }, + { + "epoch": 1.6984387130164755, + "grad_norm": 0.59375, + "learning_rate": 1.109416657222112e-06, + "loss": 1.38, + "step": 9845 + }, + { + "epoch": 1.6986112309152075, + "grad_norm": 0.55859375, + "learning_rate": 1.1081709269352426e-06, + "loss": 1.5297, + "step": 9846 + }, + { + "epoch": 1.6987837488139395, + "grad_norm": 0.546875, + "learning_rate": 1.1069258554106289e-06, + "loss": 1.3105, + "step": 9847 + }, + { + "epoch": 1.6989562667126714, + "grad_norm": 0.6875, + "learning_rate": 1.1056814427405148e-06, + "loss": 1.4188, + "step": 9848 + }, + { + "epoch": 1.6991287846114034, + "grad_norm": 0.5546875, + "learning_rate": 1.1044376890170971e-06, + "loss": 1.4763, + "step": 9849 + }, + { + "epoch": 1.6993013025101353, + "grad_norm": 1.0390625, + "learning_rate": 1.1031945943325118e-06, + "loss": 1.3321, + "step": 9850 + }, + { + "epoch": 1.6994738204088673, + "grad_norm": 0.58203125, + "learning_rate": 1.101952158778865e-06, + "loss": 1.3957, + "step": 9851 + }, + { + "epoch": 1.6996463383075993, + "grad_norm": 0.59765625, + "learning_rate": 1.100710382448198e-06, + "loss": 1.5045, + "step": 9852 + }, + { + "epoch": 1.6998188562063314, + "grad_norm": 1.75, + "learning_rate": 1.0994692654325111e-06, + "loss": 1.5065, + "step": 9853 + }, + { + "epoch": 1.6999913741050634, + "grad_norm": 0.640625, + "learning_rate": 1.0982288078237547e-06, + "loss": 1.3886, + "step": 9854 + }, + { + "epoch": 1.7001638920037954, + "grad_norm": 0.56640625, + "learning_rate": 1.096989009713828e-06, + "loss": 1.4662, + "step": 9855 + }, + { + "epoch": 1.7003364099025275, + "grad_norm": 0.5703125, + "learning_rate": 1.0957498711945858e-06, + "loss": 1.3959, + "step": 9856 + }, + { + "epoch": 1.7005089278012595, + "grad_norm": 0.63671875, + "learning_rate": 1.0945113923578277e-06, + "loss": 1.5277, + "step": 9857 + }, + { + "epoch": 1.7006814456999915, + "grad_norm": 0.54296875, + "learning_rate": 1.0932735732953103e-06, + "loss": 1.4183, + "step": 9858 + }, + { + "epoch": 1.7008539635987234, + "grad_norm": 0.57421875, + "learning_rate": 1.0920364140987383e-06, + "loss": 1.3968, + "step": 9859 + }, + { + "epoch": 1.7010264814974554, + "grad_norm": 0.5703125, + "learning_rate": 1.0907999148597703e-06, + "loss": 1.4088, + "step": 9860 + }, + { + "epoch": 1.7011989993961874, + "grad_norm": 0.6171875, + "learning_rate": 1.0895640756700087e-06, + "loss": 1.3208, + "step": 9861 + }, + { + "epoch": 1.7013715172949193, + "grad_norm": 0.6328125, + "learning_rate": 1.0883288966210181e-06, + "loss": 1.4109, + "step": 9862 + }, + { + "epoch": 1.7015440351936513, + "grad_norm": 0.58203125, + "learning_rate": 1.0870943778043041e-06, + "loss": 1.4039, + "step": 9863 + }, + { + "epoch": 1.7017165530923832, + "grad_norm": 0.89453125, + "learning_rate": 1.0858605193113292e-06, + "loss": 1.3308, + "step": 9864 + }, + { + "epoch": 1.7018890709911152, + "grad_norm": 0.546875, + "learning_rate": 1.0846273212335046e-06, + "loss": 1.3447, + "step": 9865 + }, + { + "epoch": 1.7020615888898474, + "grad_norm": 0.5703125, + "learning_rate": 1.083394783662194e-06, + "loss": 1.4377, + "step": 9866 + }, + { + "epoch": 1.7022341067885793, + "grad_norm": 0.5859375, + "learning_rate": 1.0821629066887118e-06, + "loss": 1.4277, + "step": 9867 + }, + { + "epoch": 1.7024066246873113, + "grad_norm": 0.55859375, + "learning_rate": 1.0809316904043243e-06, + "loss": 1.3596, + "step": 9868 + }, + { + "epoch": 1.7025791425860433, + "grad_norm": 0.56640625, + "learning_rate": 1.0797011349002418e-06, + "loss": 1.3799, + "step": 9869 + }, + { + "epoch": 1.7027516604847754, + "grad_norm": 0.62890625, + "learning_rate": 1.0784712402676412e-06, + "loss": 1.5748, + "step": 9870 + }, + { + "epoch": 1.7029241783835074, + "grad_norm": 0.5625, + "learning_rate": 1.0772420065976319e-06, + "loss": 1.4519, + "step": 9871 + }, + { + "epoch": 1.7030966962822394, + "grad_norm": 0.55859375, + "learning_rate": 1.0760134339812856e-06, + "loss": 1.4332, + "step": 9872 + }, + { + "epoch": 1.7032692141809713, + "grad_norm": 1.6640625, + "learning_rate": 1.0747855225096272e-06, + "loss": 1.376, + "step": 9873 + }, + { + "epoch": 1.7034417320797033, + "grad_norm": 0.5859375, + "learning_rate": 1.0735582722736205e-06, + "loss": 1.3404, + "step": 9874 + }, + { + "epoch": 1.7036142499784352, + "grad_norm": 0.5546875, + "learning_rate": 1.0723316833641961e-06, + "loss": 1.3909, + "step": 9875 + }, + { + "epoch": 1.7037867678771672, + "grad_norm": 0.578125, + "learning_rate": 1.0711057558722216e-06, + "loss": 1.3385, + "step": 9876 + }, + { + "epoch": 1.7039592857758992, + "grad_norm": 0.5859375, + "learning_rate": 1.069880489888523e-06, + "loss": 1.4349, + "step": 9877 + }, + { + "epoch": 1.7041318036746311, + "grad_norm": 0.6328125, + "learning_rate": 1.0686558855038753e-06, + "loss": 1.4143, + "step": 9878 + }, + { + "epoch": 1.704304321573363, + "grad_norm": 0.56640625, + "learning_rate": 1.0674319428090052e-06, + "loss": 1.489, + "step": 9879 + }, + { + "epoch": 1.7044768394720953, + "grad_norm": 0.55859375, + "learning_rate": 1.0662086618945911e-06, + "loss": 1.4414, + "step": 9880 + }, + { + "epoch": 1.7046493573708272, + "grad_norm": 0.5546875, + "learning_rate": 1.0649860428512604e-06, + "loss": 1.3912, + "step": 9881 + }, + { + "epoch": 1.7048218752695592, + "grad_norm": 0.578125, + "learning_rate": 1.0637640857695897e-06, + "loss": 1.408, + "step": 9882 + }, + { + "epoch": 1.7049943931682912, + "grad_norm": 0.58984375, + "learning_rate": 1.0625427907401154e-06, + "loss": 1.3399, + "step": 9883 + }, + { + "epoch": 1.7051669110670233, + "grad_norm": 0.58984375, + "learning_rate": 1.0613221578533128e-06, + "loss": 1.3745, + "step": 9884 + }, + { + "epoch": 1.7053394289657553, + "grad_norm": 0.58203125, + "learning_rate": 1.0601021871996154e-06, + "loss": 1.302, + "step": 9885 + }, + { + "epoch": 1.7055119468644873, + "grad_norm": 0.59765625, + "learning_rate": 1.0588828788694082e-06, + "loss": 1.4234, + "step": 9886 + }, + { + "epoch": 1.7056844647632192, + "grad_norm": 0.578125, + "learning_rate": 1.057664232953024e-06, + "loss": 1.436, + "step": 9887 + }, + { + "epoch": 1.7058569826619512, + "grad_norm": 0.55859375, + "learning_rate": 1.0564462495407468e-06, + "loss": 1.3744, + "step": 9888 + }, + { + "epoch": 1.7060295005606831, + "grad_norm": 0.83203125, + "learning_rate": 1.0552289287228145e-06, + "loss": 1.5192, + "step": 9889 + }, + { + "epoch": 1.706202018459415, + "grad_norm": 0.7109375, + "learning_rate": 1.0540122705894117e-06, + "loss": 1.4054, + "step": 9890 + }, + { + "epoch": 1.706374536358147, + "grad_norm": 0.625, + "learning_rate": 1.0527962752306776e-06, + "loss": 1.4689, + "step": 9891 + }, + { + "epoch": 1.706547054256879, + "grad_norm": 0.65234375, + "learning_rate": 1.051580942736702e-06, + "loss": 1.4161, + "step": 9892 + }, + { + "epoch": 1.706719572155611, + "grad_norm": 0.640625, + "learning_rate": 1.0503662731975184e-06, + "loss": 1.3389, + "step": 9893 + }, + { + "epoch": 1.7068920900543432, + "grad_norm": 0.546875, + "learning_rate": 1.049152266703124e-06, + "loss": 1.4677, + "step": 9894 + }, + { + "epoch": 1.7070646079530751, + "grad_norm": 0.5546875, + "learning_rate": 1.0479389233434566e-06, + "loss": 1.3986, + "step": 9895 + }, + { + "epoch": 1.707237125851807, + "grad_norm": 0.61328125, + "learning_rate": 1.0467262432084092e-06, + "loss": 1.497, + "step": 9896 + }, + { + "epoch": 1.7074096437505393, + "grad_norm": 0.71484375, + "learning_rate": 1.0455142263878226e-06, + "loss": 1.4305, + "step": 9897 + }, + { + "epoch": 1.7075821616492712, + "grad_norm": 0.62890625, + "learning_rate": 1.0443028729714944e-06, + "loss": 1.4552, + "step": 9898 + }, + { + "epoch": 1.7077546795480032, + "grad_norm": 0.57421875, + "learning_rate": 1.0430921830491657e-06, + "loss": 1.3979, + "step": 9899 + }, + { + "epoch": 1.7079271974467352, + "grad_norm": 0.55859375, + "learning_rate": 1.0418821567105353e-06, + "loss": 1.3842, + "step": 9900 + }, + { + "epoch": 1.7079271974467352, + "eval_loss": 1.4070851802825928, + "eval_runtime": 10.7355, + "eval_samples_per_second": 95.384, + "eval_steps_per_second": 23.846, + "step": 9900 + }, + { + "epoch": 1.7080997153454671, + "grad_norm": 0.59765625, + "learning_rate": 1.0406727940452443e-06, + "loss": 1.5596, + "step": 9901 + }, + { + "epoch": 1.708272233244199, + "grad_norm": 0.66796875, + "learning_rate": 1.0394640951428968e-06, + "loss": 1.416, + "step": 9902 + }, + { + "epoch": 1.708444751142931, + "grad_norm": 0.609375, + "learning_rate": 1.038256060093036e-06, + "loss": 1.4251, + "step": 9903 + }, + { + "epoch": 1.708617269041663, + "grad_norm": 0.5703125, + "learning_rate": 1.0370486889851616e-06, + "loss": 1.3457, + "step": 9904 + }, + { + "epoch": 1.708789786940395, + "grad_norm": 0.5625, + "learning_rate": 1.0358419819087228e-06, + "loss": 1.434, + "step": 9905 + }, + { + "epoch": 1.708962304839127, + "grad_norm": 0.5703125, + "learning_rate": 1.034635938953119e-06, + "loss": 1.3788, + "step": 9906 + }, + { + "epoch": 1.709134822737859, + "grad_norm": 0.6015625, + "learning_rate": 1.0334305602077067e-06, + "loss": 1.4269, + "step": 9907 + }, + { + "epoch": 1.709307340636591, + "grad_norm": 0.78515625, + "learning_rate": 1.0322258457617828e-06, + "loss": 1.4744, + "step": 9908 + }, + { + "epoch": 1.709479858535323, + "grad_norm": 0.5546875, + "learning_rate": 1.0310217957046009e-06, + "loss": 1.3707, + "step": 9909 + }, + { + "epoch": 1.709652376434055, + "grad_norm": 0.58203125, + "learning_rate": 1.029818410125365e-06, + "loss": 1.4436, + "step": 9910 + }, + { + "epoch": 1.7098248943327872, + "grad_norm": 0.5703125, + "learning_rate": 1.0286156891132303e-06, + "loss": 1.3644, + "step": 9911 + }, + { + "epoch": 1.7099974122315191, + "grad_norm": 0.58203125, + "learning_rate": 1.0274136327573004e-06, + "loss": 1.4127, + "step": 9912 + }, + { + "epoch": 1.710169930130251, + "grad_norm": 0.53125, + "learning_rate": 1.0262122411466346e-06, + "loss": 1.3324, + "step": 9913 + }, + { + "epoch": 1.710342448028983, + "grad_norm": 0.671875, + "learning_rate": 1.0250115143702321e-06, + "loss": 1.5323, + "step": 9914 + }, + { + "epoch": 1.710514965927715, + "grad_norm": 0.5703125, + "learning_rate": 1.0238114525170595e-06, + "loss": 1.4241, + "step": 9915 + }, + { + "epoch": 1.710687483826447, + "grad_norm": 0.55859375, + "learning_rate": 1.0226120556760178e-06, + "loss": 1.4542, + "step": 9916 + }, + { + "epoch": 1.710860001725179, + "grad_norm": 0.5625, + "learning_rate": 1.021413323935969e-06, + "loss": 1.4219, + "step": 9917 + }, + { + "epoch": 1.711032519623911, + "grad_norm": 0.56640625, + "learning_rate": 1.0202152573857216e-06, + "loss": 1.3185, + "step": 9918 + }, + { + "epoch": 1.7112050375226429, + "grad_norm": 0.59765625, + "learning_rate": 1.0190178561140363e-06, + "loss": 1.4635, + "step": 9919 + }, + { + "epoch": 1.7113775554213748, + "grad_norm": 0.66015625, + "learning_rate": 1.0178211202096233e-06, + "loss": 1.3594, + "step": 9920 + }, + { + "epoch": 1.711550073320107, + "grad_norm": 0.6015625, + "learning_rate": 1.0166250497611462e-06, + "loss": 1.4801, + "step": 9921 + }, + { + "epoch": 1.711722591218839, + "grad_norm": 0.58984375, + "learning_rate": 1.0154296448572154e-06, + "loss": 1.392, + "step": 9922 + }, + { + "epoch": 1.711895109117571, + "grad_norm": 0.57421875, + "learning_rate": 1.0142349055863942e-06, + "loss": 1.434, + "step": 9923 + }, + { + "epoch": 1.712067627016303, + "grad_norm": 0.56640625, + "learning_rate": 1.0130408320371998e-06, + "loss": 1.4354, + "step": 9924 + }, + { + "epoch": 1.712240144915035, + "grad_norm": 0.5546875, + "learning_rate": 1.0118474242980892e-06, + "loss": 1.3889, + "step": 9925 + }, + { + "epoch": 1.712412662813767, + "grad_norm": 0.5859375, + "learning_rate": 1.010654682457486e-06, + "loss": 1.4636, + "step": 9926 + }, + { + "epoch": 1.712585180712499, + "grad_norm": 0.55859375, + "learning_rate": 1.0094626066037506e-06, + "loss": 1.4264, + "step": 9927 + }, + { + "epoch": 1.712757698611231, + "grad_norm": 0.60546875, + "learning_rate": 1.0082711968252e-06, + "loss": 1.4446, + "step": 9928 + }, + { + "epoch": 1.712930216509963, + "grad_norm": 0.59765625, + "learning_rate": 1.0070804532101019e-06, + "loss": 1.4269, + "step": 9929 + }, + { + "epoch": 1.7131027344086949, + "grad_norm": 0.6015625, + "learning_rate": 1.0058903758466743e-06, + "loss": 1.4745, + "step": 9930 + }, + { + "epoch": 1.7132752523074268, + "grad_norm": 0.578125, + "learning_rate": 1.0047009648230865e-06, + "loss": 1.5025, + "step": 9931 + }, + { + "epoch": 1.7134477702061588, + "grad_norm": 0.62890625, + "learning_rate": 1.0035122202274572e-06, + "loss": 1.5489, + "step": 9932 + }, + { + "epoch": 1.7136202881048908, + "grad_norm": 0.60546875, + "learning_rate": 1.0023241421478513e-06, + "loss": 1.3831, + "step": 9933 + }, + { + "epoch": 1.7137928060036227, + "grad_norm": 0.625, + "learning_rate": 1.0011367306722975e-06, + "loss": 1.3894, + "step": 9934 + }, + { + "epoch": 1.713965323902355, + "grad_norm": 0.55859375, + "learning_rate": 9.999499858887585e-07, + "loss": 1.3981, + "step": 9935 + }, + { + "epoch": 1.7141378418010869, + "grad_norm": 0.54296875, + "learning_rate": 9.987639078851629e-07, + "loss": 1.3585, + "step": 9936 + }, + { + "epoch": 1.7143103596998188, + "grad_norm": 0.71875, + "learning_rate": 9.975784967493774e-07, + "loss": 1.4269, + "step": 9937 + }, + { + "epoch": 1.714482877598551, + "grad_norm": 0.5625, + "learning_rate": 9.963937525692247e-07, + "loss": 1.4254, + "step": 9938 + }, + { + "epoch": 1.714655395497283, + "grad_norm": 0.59375, + "learning_rate": 9.952096754324847e-07, + "loss": 1.5058, + "step": 9939 + }, + { + "epoch": 1.714827913396015, + "grad_norm": 0.5546875, + "learning_rate": 9.940262654268729e-07, + "loss": 1.4928, + "step": 9940 + }, + { + "epoch": 1.7150004312947469, + "grad_norm": 0.55859375, + "learning_rate": 9.928435226400678e-07, + "loss": 1.3425, + "step": 9941 + }, + { + "epoch": 1.7151729491934788, + "grad_norm": 0.71875, + "learning_rate": 9.916614471596953e-07, + "loss": 1.456, + "step": 9942 + }, + { + "epoch": 1.7153454670922108, + "grad_norm": 0.5703125, + "learning_rate": 9.904800390733283e-07, + "loss": 1.4231, + "step": 9943 + }, + { + "epoch": 1.7155179849909428, + "grad_norm": 0.5859375, + "learning_rate": 9.892992984684935e-07, + "loss": 1.3841, + "step": 9944 + }, + { + "epoch": 1.7156905028896747, + "grad_norm": 0.6015625, + "learning_rate": 9.88119225432671e-07, + "loss": 1.4788, + "step": 9945 + }, + { + "epoch": 1.7158630207884067, + "grad_norm": 0.58203125, + "learning_rate": 9.86939820053281e-07, + "loss": 1.4304, + "step": 9946 + }, + { + "epoch": 1.7160355386871387, + "grad_norm": 0.66796875, + "learning_rate": 9.857610824177088e-07, + "loss": 1.3979, + "step": 9947 + }, + { + "epoch": 1.7162080565858708, + "grad_norm": 0.71484375, + "learning_rate": 9.845830126132782e-07, + "loss": 1.3864, + "step": 9948 + }, + { + "epoch": 1.7163805744846028, + "grad_norm": 0.78125, + "learning_rate": 9.834056107272682e-07, + "loss": 1.4578, + "step": 9949 + }, + { + "epoch": 1.7165530923833348, + "grad_norm": 0.78515625, + "learning_rate": 9.82228876846909e-07, + "loss": 1.4952, + "step": 9950 + }, + { + "epoch": 1.7167256102820667, + "grad_norm": 0.56640625, + "learning_rate": 9.810528110593798e-07, + "loss": 1.4029, + "step": 9951 + }, + { + "epoch": 1.716898128180799, + "grad_norm": 0.60546875, + "learning_rate": 9.798774134518119e-07, + "loss": 1.446, + "step": 9952 + }, + { + "epoch": 1.7170706460795309, + "grad_norm": 0.60546875, + "learning_rate": 9.787026841112856e-07, + "loss": 1.2539, + "step": 9953 + }, + { + "epoch": 1.7172431639782628, + "grad_norm": 0.59765625, + "learning_rate": 9.775286231248316e-07, + "loss": 1.4176, + "step": 9954 + }, + { + "epoch": 1.7174156818769948, + "grad_norm": 0.5625, + "learning_rate": 9.763552305794323e-07, + "loss": 1.3271, + "step": 9955 + }, + { + "epoch": 1.7175881997757267, + "grad_norm": 0.578125, + "learning_rate": 9.751825065620223e-07, + "loss": 1.5245, + "step": 9956 + }, + { + "epoch": 1.7177607176744587, + "grad_norm": 0.640625, + "learning_rate": 9.740104511594783e-07, + "loss": 1.5093, + "step": 9957 + }, + { + "epoch": 1.7179332355731907, + "grad_norm": 0.59375, + "learning_rate": 9.728390644586394e-07, + "loss": 1.4256, + "step": 9958 + }, + { + "epoch": 1.7181057534719226, + "grad_norm": 0.6171875, + "learning_rate": 9.716683465462862e-07, + "loss": 1.3404, + "step": 9959 + }, + { + "epoch": 1.7182782713706546, + "grad_norm": 0.56640625, + "learning_rate": 9.704982975091538e-07, + "loss": 1.4166, + "step": 9960 + }, + { + "epoch": 1.7184507892693865, + "grad_norm": 0.5546875, + "learning_rate": 9.69328917433926e-07, + "loss": 1.3496, + "step": 9961 + }, + { + "epoch": 1.7186233071681187, + "grad_norm": 0.61328125, + "learning_rate": 9.681602064072382e-07, + "loss": 1.3654, + "step": 9962 + }, + { + "epoch": 1.7187958250668507, + "grad_norm": 0.578125, + "learning_rate": 9.669921645156755e-07, + "loss": 1.4074, + "step": 9963 + }, + { + "epoch": 1.7189683429655827, + "grad_norm": 0.56640625, + "learning_rate": 9.658247918457763e-07, + "loss": 1.3444, + "step": 9964 + }, + { + "epoch": 1.7191408608643148, + "grad_norm": 0.8125, + "learning_rate": 9.646580884840207e-07, + "loss": 1.5314, + "step": 9965 + }, + { + "epoch": 1.7193133787630468, + "grad_norm": 0.546875, + "learning_rate": 9.634920545168535e-07, + "loss": 1.4694, + "step": 9966 + }, + { + "epoch": 1.7194858966617788, + "grad_norm": 0.578125, + "learning_rate": 9.623266900306538e-07, + "loss": 1.421, + "step": 9967 + }, + { + "epoch": 1.7196584145605107, + "grad_norm": 0.58203125, + "learning_rate": 9.611619951117657e-07, + "loss": 1.4543, + "step": 9968 + }, + { + "epoch": 1.7198309324592427, + "grad_norm": 0.6171875, + "learning_rate": 9.599979698464733e-07, + "loss": 1.4106, + "step": 9969 + }, + { + "epoch": 1.7200034503579746, + "grad_norm": 0.57421875, + "learning_rate": 9.588346143210137e-07, + "loss": 1.4361, + "step": 9970 + }, + { + "epoch": 1.7201759682567066, + "grad_norm": 0.5859375, + "learning_rate": 9.576719286215818e-07, + "loss": 1.3925, + "step": 9971 + }, + { + "epoch": 1.7203484861554386, + "grad_norm": 0.6328125, + "learning_rate": 9.565099128343103e-07, + "loss": 1.4657, + "step": 9972 + }, + { + "epoch": 1.7205210040541705, + "grad_norm": 0.96484375, + "learning_rate": 9.553485670452911e-07, + "loss": 1.3915, + "step": 9973 + }, + { + "epoch": 1.7206935219529025, + "grad_norm": 0.5859375, + "learning_rate": 9.541878913405633e-07, + "loss": 1.4143, + "step": 9974 + }, + { + "epoch": 1.7208660398516344, + "grad_norm": 0.5859375, + "learning_rate": 9.530278858061171e-07, + "loss": 1.4653, + "step": 9975 + }, + { + "epoch": 1.7210385577503666, + "grad_norm": 0.5703125, + "learning_rate": 9.518685505278935e-07, + "loss": 1.4554, + "step": 9976 + }, + { + "epoch": 1.7212110756490986, + "grad_norm": 0.59765625, + "learning_rate": 9.507098855917851e-07, + "loss": 1.5149, + "step": 9977 + }, + { + "epoch": 1.7213835935478305, + "grad_norm": 0.578125, + "learning_rate": 9.495518910836276e-07, + "loss": 1.555, + "step": 9978 + }, + { + "epoch": 1.7215561114465627, + "grad_norm": 0.55078125, + "learning_rate": 9.483945670892191e-07, + "loss": 1.4126, + "step": 9979 + }, + { + "epoch": 1.7217286293452947, + "grad_norm": 0.578125, + "learning_rate": 9.472379136942955e-07, + "loss": 1.4291, + "step": 9980 + }, + { + "epoch": 1.7219011472440267, + "grad_norm": 0.5859375, + "learning_rate": 9.460819309845526e-07, + "loss": 1.3575, + "step": 9981 + }, + { + "epoch": 1.7220736651427586, + "grad_norm": 0.625, + "learning_rate": 9.449266190456318e-07, + "loss": 1.4072, + "step": 9982 + }, + { + "epoch": 1.7222461830414906, + "grad_norm": 0.578125, + "learning_rate": 9.437719779631249e-07, + "loss": 1.4664, + "step": 9983 + }, + { + "epoch": 1.7224187009402225, + "grad_norm": 0.5546875, + "learning_rate": 9.426180078225766e-07, + "loss": 1.4289, + "step": 9984 + }, + { + "epoch": 1.7225912188389545, + "grad_norm": 0.58984375, + "learning_rate": 9.414647087094786e-07, + "loss": 1.3887, + "step": 9985 + }, + { + "epoch": 1.7227637367376865, + "grad_norm": 0.57421875, + "learning_rate": 9.403120807092759e-07, + "loss": 1.4219, + "step": 9986 + }, + { + "epoch": 1.7229362546364184, + "grad_norm": 0.56640625, + "learning_rate": 9.391601239073611e-07, + "loss": 1.3415, + "step": 9987 + }, + { + "epoch": 1.7231087725351504, + "grad_norm": 0.56640625, + "learning_rate": 9.380088383890818e-07, + "loss": 1.3889, + "step": 9988 + }, + { + "epoch": 1.7232812904338826, + "grad_norm": 0.6328125, + "learning_rate": 9.368582242397262e-07, + "loss": 1.3631, + "step": 9989 + }, + { + "epoch": 1.7234538083326145, + "grad_norm": 0.6015625, + "learning_rate": 9.357082815445451e-07, + "loss": 1.4397, + "step": 9990 + }, + { + "epoch": 1.7236263262313465, + "grad_norm": 0.5546875, + "learning_rate": 9.345590103887292e-07, + "loss": 1.4168, + "step": 9991 + }, + { + "epoch": 1.7237988441300784, + "grad_norm": 0.56640625, + "learning_rate": 9.33410410857426e-07, + "loss": 1.4388, + "step": 9992 + }, + { + "epoch": 1.7239713620288106, + "grad_norm": 0.60546875, + "learning_rate": 9.322624830357297e-07, + "loss": 1.3092, + "step": 9993 + }, + { + "epoch": 1.7241438799275426, + "grad_norm": 0.56640625, + "learning_rate": 9.311152270086865e-07, + "loss": 1.3909, + "step": 9994 + }, + { + "epoch": 1.7243163978262745, + "grad_norm": 0.6171875, + "learning_rate": 9.29968642861293e-07, + "loss": 1.4871, + "step": 9995 + }, + { + "epoch": 1.7244889157250065, + "grad_norm": 0.5546875, + "learning_rate": 9.288227306784936e-07, + "loss": 1.403, + "step": 9996 + }, + { + "epoch": 1.7246614336237385, + "grad_norm": 0.71484375, + "learning_rate": 9.276774905451868e-07, + "loss": 1.4458, + "step": 9997 + }, + { + "epoch": 1.7248339515224704, + "grad_norm": 0.5859375, + "learning_rate": 9.265329225462183e-07, + "loss": 1.4917, + "step": 9998 + }, + { + "epoch": 1.7250064694212024, + "grad_norm": 0.5546875, + "learning_rate": 9.253890267663824e-07, + "loss": 1.4892, + "step": 9999 + }, + { + "epoch": 1.7251789873199344, + "grad_norm": 0.70703125, + "learning_rate": 9.242458032904311e-07, + "loss": 1.5056, + "step": 10000 + }, + { + "epoch": 1.7251789873199344, + "eval_loss": 1.4070346355438232, + "eval_runtime": 10.8088, + "eval_samples_per_second": 94.738, + "eval_steps_per_second": 23.684, + "step": 10000 + }, + { + "epoch": 1.7253515052186663, + "grad_norm": 0.5859375, + "learning_rate": 9.231032522030569e-07, + "loss": 1.508, + "step": 10001 + }, + { + "epoch": 1.7255240231173983, + "grad_norm": 0.6171875, + "learning_rate": 9.219613735889066e-07, + "loss": 1.4105, + "step": 10002 + }, + { + "epoch": 1.7256965410161305, + "grad_norm": 0.59375, + "learning_rate": 9.208201675325834e-07, + "loss": 1.4321, + "step": 10003 + }, + { + "epoch": 1.7258690589148624, + "grad_norm": 0.58203125, + "learning_rate": 9.19679634118631e-07, + "loss": 1.3757, + "step": 10004 + }, + { + "epoch": 1.7260415768135944, + "grad_norm": 0.56640625, + "learning_rate": 9.185397734315471e-07, + "loss": 1.4183, + "step": 10005 + }, + { + "epoch": 1.7262140947123266, + "grad_norm": 0.640625, + "learning_rate": 9.174005855557799e-07, + "loss": 1.3735, + "step": 10006 + }, + { + "epoch": 1.7263866126110585, + "grad_norm": 0.609375, + "learning_rate": 9.162620705757286e-07, + "loss": 1.4336, + "step": 10007 + }, + { + "epoch": 1.7265591305097905, + "grad_norm": 0.55859375, + "learning_rate": 9.1512422857574e-07, + "loss": 1.4212, + "step": 10008 + }, + { + "epoch": 1.7267316484085224, + "grad_norm": 0.60546875, + "learning_rate": 9.139870596401168e-07, + "loss": 1.493, + "step": 10009 + }, + { + "epoch": 1.7269041663072544, + "grad_norm": 0.59765625, + "learning_rate": 9.128505638530993e-07, + "loss": 1.4634, + "step": 10010 + }, + { + "epoch": 1.7270766842059864, + "grad_norm": 0.61328125, + "learning_rate": 9.117147412988958e-07, + "loss": 1.3975, + "step": 10011 + }, + { + "epoch": 1.7272492021047183, + "grad_norm": 0.55859375, + "learning_rate": 9.105795920616478e-07, + "loss": 1.4103, + "step": 10012 + }, + { + "epoch": 1.7274217200034503, + "grad_norm": 0.55078125, + "learning_rate": 9.09445116225458e-07, + "loss": 1.3861, + "step": 10013 + }, + { + "epoch": 1.7275942379021823, + "grad_norm": 0.58203125, + "learning_rate": 9.083113138743738e-07, + "loss": 1.3988, + "step": 10014 + }, + { + "epoch": 1.7277667558009142, + "grad_norm": 0.58203125, + "learning_rate": 9.071781850923944e-07, + "loss": 1.3857, + "step": 10015 + }, + { + "epoch": 1.7279392736996462, + "grad_norm": 0.578125, + "learning_rate": 9.060457299634706e-07, + "loss": 1.4469, + "step": 10016 + }, + { + "epoch": 1.7281117915983784, + "grad_norm": 0.58984375, + "learning_rate": 9.049139485714997e-07, + "loss": 1.4281, + "step": 10017 + }, + { + "epoch": 1.7282843094971103, + "grad_norm": 0.640625, + "learning_rate": 9.037828410003336e-07, + "loss": 1.4445, + "step": 10018 + }, + { + "epoch": 1.7284568273958423, + "grad_norm": 0.61328125, + "learning_rate": 9.026524073337695e-07, + "loss": 1.3679, + "step": 10019 + }, + { + "epoch": 1.7286293452945745, + "grad_norm": 0.6015625, + "learning_rate": 9.015226476555594e-07, + "loss": 1.4261, + "step": 10020 + }, + { + "epoch": 1.7288018631933064, + "grad_norm": 0.58203125, + "learning_rate": 9.003935620493987e-07, + "loss": 1.5309, + "step": 10021 + }, + { + "epoch": 1.7289743810920384, + "grad_norm": 0.56640625, + "learning_rate": 8.992651505989425e-07, + "loss": 1.3974, + "step": 10022 + }, + { + "epoch": 1.7291468989907703, + "grad_norm": 0.59375, + "learning_rate": 8.981374133877851e-07, + "loss": 1.4646, + "step": 10023 + }, + { + "epoch": 1.7293194168895023, + "grad_norm": 0.59765625, + "learning_rate": 8.970103504994832e-07, + "loss": 1.5218, + "step": 10024 + }, + { + "epoch": 1.7294919347882343, + "grad_norm": 0.578125, + "learning_rate": 8.958839620175297e-07, + "loss": 1.4581, + "step": 10025 + }, + { + "epoch": 1.7296644526869662, + "grad_norm": 0.57421875, + "learning_rate": 8.94758248025378e-07, + "loss": 1.3696, + "step": 10026 + }, + { + "epoch": 1.7298369705856982, + "grad_norm": 0.61328125, + "learning_rate": 8.936332086064281e-07, + "loss": 1.4554, + "step": 10027 + }, + { + "epoch": 1.7300094884844301, + "grad_norm": 0.55859375, + "learning_rate": 8.925088438440288e-07, + "loss": 1.4383, + "step": 10028 + }, + { + "epoch": 1.730182006383162, + "grad_norm": 0.62109375, + "learning_rate": 8.913851538214802e-07, + "loss": 1.4926, + "step": 10029 + }, + { + "epoch": 1.7303545242818943, + "grad_norm": 0.609375, + "learning_rate": 8.902621386220355e-07, + "loss": 1.3719, + "step": 10030 + }, + { + "epoch": 1.7305270421806263, + "grad_norm": 0.54296875, + "learning_rate": 8.891397983288874e-07, + "loss": 1.4848, + "step": 10031 + }, + { + "epoch": 1.7306995600793582, + "grad_norm": 0.54296875, + "learning_rate": 8.880181330251935e-07, + "loss": 1.4134, + "step": 10032 + }, + { + "epoch": 1.7308720779780902, + "grad_norm": 0.58984375, + "learning_rate": 8.868971427940498e-07, + "loss": 1.4186, + "step": 10033 + }, + { + "epoch": 1.7310445958768224, + "grad_norm": 0.70703125, + "learning_rate": 8.857768277185041e-07, + "loss": 1.4711, + "step": 10034 + }, + { + "epoch": 1.7312171137755543, + "grad_norm": 0.56640625, + "learning_rate": 8.846571878815647e-07, + "loss": 1.4353, + "step": 10035 + }, + { + "epoch": 1.7313896316742863, + "grad_norm": 0.609375, + "learning_rate": 8.835382233661727e-07, + "loss": 1.4302, + "step": 10036 + }, + { + "epoch": 1.7315621495730182, + "grad_norm": 0.5703125, + "learning_rate": 8.82419934255232e-07, + "loss": 1.4478, + "step": 10037 + }, + { + "epoch": 1.7317346674717502, + "grad_norm": 0.578125, + "learning_rate": 8.813023206315918e-07, + "loss": 1.3949, + "step": 10038 + }, + { + "epoch": 1.7319071853704822, + "grad_norm": 0.55078125, + "learning_rate": 8.801853825780516e-07, + "loss": 1.3945, + "step": 10039 + }, + { + "epoch": 1.7320797032692141, + "grad_norm": 0.5546875, + "learning_rate": 8.790691201773616e-07, + "loss": 1.4858, + "step": 10040 + }, + { + "epoch": 1.732252221167946, + "grad_norm": 0.59375, + "learning_rate": 8.779535335122236e-07, + "loss": 1.4134, + "step": 10041 + }, + { + "epoch": 1.732424739066678, + "grad_norm": 0.6015625, + "learning_rate": 8.768386226652814e-07, + "loss": 1.5136, + "step": 10042 + }, + { + "epoch": 1.73259725696541, + "grad_norm": 1.5234375, + "learning_rate": 8.757243877191412e-07, + "loss": 1.4558, + "step": 10043 + }, + { + "epoch": 1.7327697748641422, + "grad_norm": 0.5703125, + "learning_rate": 8.746108287563482e-07, + "loss": 1.5203, + "step": 10044 + }, + { + "epoch": 1.7329422927628741, + "grad_norm": 0.80078125, + "learning_rate": 8.734979458594028e-07, + "loss": 1.5167, + "step": 10045 + }, + { + "epoch": 1.733114810661606, + "grad_norm": 0.58203125, + "learning_rate": 8.723857391107549e-07, + "loss": 1.5184, + "step": 10046 + }, + { + "epoch": 1.7332873285603383, + "grad_norm": 0.734375, + "learning_rate": 8.712742085928027e-07, + "loss": 1.5099, + "step": 10047 + }, + { + "epoch": 1.7334598464590703, + "grad_norm": 0.6015625, + "learning_rate": 8.701633543878974e-07, + "loss": 1.4642, + "step": 10048 + }, + { + "epoch": 1.7336323643578022, + "grad_norm": 0.5703125, + "learning_rate": 8.69053176578335e-07, + "loss": 1.3658, + "step": 10049 + }, + { + "epoch": 1.7338048822565342, + "grad_norm": 0.5859375, + "learning_rate": 8.679436752463677e-07, + "loss": 1.467, + "step": 10050 + }, + { + "epoch": 1.7339774001552661, + "grad_norm": 0.58203125, + "learning_rate": 8.668348504741919e-07, + "loss": 1.4236, + "step": 10051 + }, + { + "epoch": 1.734149918053998, + "grad_norm": 0.56640625, + "learning_rate": 8.657267023439586e-07, + "loss": 1.3282, + "step": 10052 + }, + { + "epoch": 1.73432243595273, + "grad_norm": 0.56640625, + "learning_rate": 8.64619230937761e-07, + "loss": 1.5454, + "step": 10053 + }, + { + "epoch": 1.734494953851462, + "grad_norm": 0.66015625, + "learning_rate": 8.635124363376557e-07, + "loss": 1.3545, + "step": 10054 + }, + { + "epoch": 1.734667471750194, + "grad_norm": 0.59375, + "learning_rate": 8.624063186256327e-07, + "loss": 1.3744, + "step": 10055 + }, + { + "epoch": 1.734839989648926, + "grad_norm": 0.58203125, + "learning_rate": 8.613008778836463e-07, + "loss": 1.3671, + "step": 10056 + }, + { + "epoch": 1.7350125075476581, + "grad_norm": 0.6171875, + "learning_rate": 8.601961141935922e-07, + "loss": 1.4429, + "step": 10057 + }, + { + "epoch": 1.73518502544639, + "grad_norm": 0.5546875, + "learning_rate": 8.590920276373172e-07, + "loss": 1.4882, + "step": 10058 + }, + { + "epoch": 1.735357543345122, + "grad_norm": 0.89453125, + "learning_rate": 8.579886182966191e-07, + "loss": 1.4334, + "step": 10059 + }, + { + "epoch": 1.735530061243854, + "grad_norm": 0.5859375, + "learning_rate": 8.56885886253247e-07, + "loss": 1.4344, + "step": 10060 + }, + { + "epoch": 1.7357025791425862, + "grad_norm": 0.58203125, + "learning_rate": 8.557838315888966e-07, + "loss": 1.3999, + "step": 10061 + }, + { + "epoch": 1.7358750970413181, + "grad_norm": 0.61328125, + "learning_rate": 8.546824543852183e-07, + "loss": 1.4345, + "step": 10062 + }, + { + "epoch": 1.73604761494005, + "grad_norm": 0.625, + "learning_rate": 8.535817547238023e-07, + "loss": 1.4524, + "step": 10063 + }, + { + "epoch": 1.736220132838782, + "grad_norm": 0.60546875, + "learning_rate": 8.524817326862034e-07, + "loss": 1.5366, + "step": 10064 + }, + { + "epoch": 1.736392650737514, + "grad_norm": 0.5625, + "learning_rate": 8.513823883539118e-07, + "loss": 1.2663, + "step": 10065 + }, + { + "epoch": 1.736565168636246, + "grad_norm": 0.55078125, + "learning_rate": 8.502837218083737e-07, + "loss": 1.3951, + "step": 10066 + }, + { + "epoch": 1.736737686534978, + "grad_norm": 0.6171875, + "learning_rate": 8.491857331309928e-07, + "loss": 1.3688, + "step": 10067 + }, + { + "epoch": 1.73691020443371, + "grad_norm": 0.61328125, + "learning_rate": 8.480884224031061e-07, + "loss": 1.378, + "step": 10068 + }, + { + "epoch": 1.7370827223324419, + "grad_norm": 0.578125, + "learning_rate": 8.469917897060142e-07, + "loss": 1.3862, + "step": 10069 + }, + { + "epoch": 1.7372552402311738, + "grad_norm": 0.55078125, + "learning_rate": 8.458958351209601e-07, + "loss": 1.4146, + "step": 10070 + }, + { + "epoch": 1.737427758129906, + "grad_norm": 0.609375, + "learning_rate": 8.448005587291408e-07, + "loss": 1.3553, + "step": 10071 + }, + { + "epoch": 1.737600276028638, + "grad_norm": 0.59375, + "learning_rate": 8.437059606117004e-07, + "loss": 1.5589, + "step": 10072 + }, + { + "epoch": 1.73777279392737, + "grad_norm": 0.6015625, + "learning_rate": 8.426120408497351e-07, + "loss": 1.3337, + "step": 10073 + }, + { + "epoch": 1.7379453118261021, + "grad_norm": 0.87109375, + "learning_rate": 8.415187995242846e-07, + "loss": 1.4074, + "step": 10074 + }, + { + "epoch": 1.738117829724834, + "grad_norm": 0.5625, + "learning_rate": 8.404262367163495e-07, + "loss": 1.3617, + "step": 10075 + }, + { + "epoch": 1.738290347623566, + "grad_norm": 0.609375, + "learning_rate": 8.393343525068687e-07, + "loss": 1.4669, + "step": 10076 + }, + { + "epoch": 1.738462865522298, + "grad_norm": 0.6796875, + "learning_rate": 8.382431469767372e-07, + "loss": 1.4331, + "step": 10077 + }, + { + "epoch": 1.73863538342103, + "grad_norm": 0.98828125, + "learning_rate": 8.371526202067993e-07, + "loss": 1.3406, + "step": 10078 + }, + { + "epoch": 1.738807901319762, + "grad_norm": 0.62890625, + "learning_rate": 8.360627722778469e-07, + "loss": 1.3818, + "step": 10079 + }, + { + "epoch": 1.738980419218494, + "grad_norm": 0.5859375, + "learning_rate": 8.349736032706234e-07, + "loss": 1.3747, + "step": 10080 + }, + { + "epoch": 1.7391529371172259, + "grad_norm": 0.57421875, + "learning_rate": 8.338851132658221e-07, + "loss": 1.5443, + "step": 10081 + }, + { + "epoch": 1.7393254550159578, + "grad_norm": 0.640625, + "learning_rate": 8.327973023440827e-07, + "loss": 1.4706, + "step": 10082 + }, + { + "epoch": 1.7394979729146898, + "grad_norm": 0.53125, + "learning_rate": 8.317101705859986e-07, + "loss": 1.4077, + "step": 10083 + }, + { + "epoch": 1.7396704908134217, + "grad_norm": 0.57421875, + "learning_rate": 8.306237180721121e-07, + "loss": 1.3999, + "step": 10084 + }, + { + "epoch": 1.739843008712154, + "grad_norm": 0.62109375, + "learning_rate": 8.295379448829133e-07, + "loss": 1.422, + "step": 10085 + }, + { + "epoch": 1.7400155266108859, + "grad_norm": 0.6171875, + "learning_rate": 8.284528510988444e-07, + "loss": 1.3776, + "step": 10086 + }, + { + "epoch": 1.7401880445096178, + "grad_norm": 0.59765625, + "learning_rate": 8.273684368002922e-07, + "loss": 1.4917, + "step": 10087 + }, + { + "epoch": 1.74036056240835, + "grad_norm": 0.71875, + "learning_rate": 8.262847020676024e-07, + "loss": 1.5477, + "step": 10088 + }, + { + "epoch": 1.740533080307082, + "grad_norm": 0.59765625, + "learning_rate": 8.252016469810597e-07, + "loss": 1.4891, + "step": 10089 + }, + { + "epoch": 1.740705598205814, + "grad_norm": 0.6640625, + "learning_rate": 8.241192716209056e-07, + "loss": 1.404, + "step": 10090 + }, + { + "epoch": 1.740878116104546, + "grad_norm": 0.59375, + "learning_rate": 8.230375760673304e-07, + "loss": 1.3418, + "step": 10091 + }, + { + "epoch": 1.7410506340032779, + "grad_norm": 0.625, + "learning_rate": 8.219565604004709e-07, + "loss": 1.4332, + "step": 10092 + }, + { + "epoch": 1.7412231519020098, + "grad_norm": 0.57421875, + "learning_rate": 8.208762247004176e-07, + "loss": 1.3749, + "step": 10093 + }, + { + "epoch": 1.7413956698007418, + "grad_norm": 0.7890625, + "learning_rate": 8.197965690472088e-07, + "loss": 1.4559, + "step": 10094 + }, + { + "epoch": 1.7415681876994737, + "grad_norm": 0.57421875, + "learning_rate": 8.187175935208269e-07, + "loss": 1.3704, + "step": 10095 + }, + { + "epoch": 1.7417407055982057, + "grad_norm": 0.54296875, + "learning_rate": 8.17639298201216e-07, + "loss": 1.4201, + "step": 10096 + }, + { + "epoch": 1.7419132234969377, + "grad_norm": 0.55859375, + "learning_rate": 8.165616831682588e-07, + "loss": 1.3645, + "step": 10097 + }, + { + "epoch": 1.7420857413956699, + "grad_norm": 0.5859375, + "learning_rate": 8.154847485017913e-07, + "loss": 1.5445, + "step": 10098 + }, + { + "epoch": 1.7422582592944018, + "grad_norm": 0.57421875, + "learning_rate": 8.144084942816043e-07, + "loss": 1.4455, + "step": 10099 + }, + { + "epoch": 1.7424307771931338, + "grad_norm": 0.6171875, + "learning_rate": 8.133329205874286e-07, + "loss": 1.4443, + "step": 10100 + }, + { + "epoch": 1.7424307771931338, + "eval_loss": 1.4070847034454346, + "eval_runtime": 11.2683, + "eval_samples_per_second": 90.874, + "eval_steps_per_second": 22.719, + "step": 10100 + }, + { + "epoch": 1.7426032950918657, + "grad_norm": 0.57421875, + "learning_rate": 8.122580274989511e-07, + "loss": 1.3178, + "step": 10101 + }, + { + "epoch": 1.742775812990598, + "grad_norm": 0.58984375, + "learning_rate": 8.111838150958062e-07, + "loss": 1.4056, + "step": 10102 + }, + { + "epoch": 1.7429483308893299, + "grad_norm": 0.6171875, + "learning_rate": 8.101102834575792e-07, + "loss": 1.4306, + "step": 10103 + }, + { + "epoch": 1.7431208487880618, + "grad_norm": 0.57421875, + "learning_rate": 8.090374326638028e-07, + "loss": 1.3321, + "step": 10104 + }, + { + "epoch": 1.7432933666867938, + "grad_norm": 0.57421875, + "learning_rate": 8.079652627939638e-07, + "loss": 1.4098, + "step": 10105 + }, + { + "epoch": 1.7434658845855258, + "grad_norm": 0.56640625, + "learning_rate": 8.068937739274885e-07, + "loss": 1.4515, + "step": 10106 + }, + { + "epoch": 1.7436384024842577, + "grad_norm": 0.70703125, + "learning_rate": 8.058229661437677e-07, + "loss": 1.4637, + "step": 10107 + }, + { + "epoch": 1.7438109203829897, + "grad_norm": 0.58984375, + "learning_rate": 8.047528395221271e-07, + "loss": 1.4579, + "step": 10108 + }, + { + "epoch": 1.7439834382817216, + "grad_norm": 0.625, + "learning_rate": 8.03683394141851e-07, + "loss": 1.4756, + "step": 10109 + }, + { + "epoch": 1.7441559561804536, + "grad_norm": 0.5546875, + "learning_rate": 8.026146300821702e-07, + "loss": 1.3693, + "step": 10110 + }, + { + "epoch": 1.7443284740791856, + "grad_norm": 0.68359375, + "learning_rate": 8.015465474222661e-07, + "loss": 1.3065, + "step": 10111 + }, + { + "epoch": 1.7445009919779177, + "grad_norm": 0.66796875, + "learning_rate": 8.004791462412675e-07, + "loss": 1.4108, + "step": 10112 + }, + { + "epoch": 1.7446735098766497, + "grad_norm": 0.5390625, + "learning_rate": 7.994124266182568e-07, + "loss": 1.4297, + "step": 10113 + }, + { + "epoch": 1.7448460277753817, + "grad_norm": 0.5703125, + "learning_rate": 7.983463886322584e-07, + "loss": 1.4389, + "step": 10114 + }, + { + "epoch": 1.7450185456741139, + "grad_norm": 0.6171875, + "learning_rate": 7.97281032362256e-07, + "loss": 1.4527, + "step": 10115 + }, + { + "epoch": 1.7451910635728458, + "grad_norm": 0.55859375, + "learning_rate": 7.962163578871751e-07, + "loss": 1.3734, + "step": 10116 + }, + { + "epoch": 1.7453635814715778, + "grad_norm": 0.58203125, + "learning_rate": 7.95152365285895e-07, + "loss": 1.4244, + "step": 10117 + }, + { + "epoch": 1.7455360993703097, + "grad_norm": 0.5546875, + "learning_rate": 7.940890546372437e-07, + "loss": 1.408, + "step": 10118 + }, + { + "epoch": 1.7457086172690417, + "grad_norm": 0.58984375, + "learning_rate": 7.930264260199938e-07, + "loss": 1.4781, + "step": 10119 + }, + { + "epoch": 1.7458811351677737, + "grad_norm": 0.60546875, + "learning_rate": 7.919644795128767e-07, + "loss": 1.393, + "step": 10120 + }, + { + "epoch": 1.7460536530665056, + "grad_norm": 0.61328125, + "learning_rate": 7.909032151945639e-07, + "loss": 1.5179, + "step": 10121 + }, + { + "epoch": 1.7462261709652376, + "grad_norm": 0.796875, + "learning_rate": 7.898426331436815e-07, + "loss": 1.4342, + "step": 10122 + }, + { + "epoch": 1.7463986888639695, + "grad_norm": 0.57421875, + "learning_rate": 7.887827334388054e-07, + "loss": 1.4451, + "step": 10123 + }, + { + "epoch": 1.7465712067627015, + "grad_norm": 0.625, + "learning_rate": 7.877235161584584e-07, + "loss": 1.4059, + "step": 10124 + }, + { + "epoch": 1.7467437246614335, + "grad_norm": 0.55859375, + "learning_rate": 7.866649813811145e-07, + "loss": 1.4599, + "step": 10125 + }, + { + "epoch": 1.7469162425601656, + "grad_norm": 0.67578125, + "learning_rate": 7.856071291851975e-07, + "loss": 1.4546, + "step": 10126 + }, + { + "epoch": 1.7470887604588976, + "grad_norm": 0.57421875, + "learning_rate": 7.845499596490758e-07, + "loss": 1.3923, + "step": 10127 + }, + { + "epoch": 1.7472612783576296, + "grad_norm": 0.59765625, + "learning_rate": 7.834934728510768e-07, + "loss": 1.3865, + "step": 10128 + }, + { + "epoch": 1.7474337962563617, + "grad_norm": 0.609375, + "learning_rate": 7.824376688694668e-07, + "loss": 1.3667, + "step": 10129 + }, + { + "epoch": 1.7476063141550937, + "grad_norm": 0.5859375, + "learning_rate": 7.813825477824665e-07, + "loss": 1.5013, + "step": 10130 + }, + { + "epoch": 1.7477788320538257, + "grad_norm": 0.58203125, + "learning_rate": 7.803281096682524e-07, + "loss": 1.4107, + "step": 10131 + }, + { + "epoch": 1.7479513499525576, + "grad_norm": 0.59375, + "learning_rate": 7.792743546049364e-07, + "loss": 1.4919, + "step": 10132 + }, + { + "epoch": 1.7481238678512896, + "grad_norm": 0.625, + "learning_rate": 7.782212826705892e-07, + "loss": 1.3758, + "step": 10133 + }, + { + "epoch": 1.7482963857500216, + "grad_norm": 0.58984375, + "learning_rate": 7.771688939432309e-07, + "loss": 1.5687, + "step": 10134 + }, + { + "epoch": 1.7484689036487535, + "grad_norm": 0.56640625, + "learning_rate": 7.761171885008279e-07, + "loss": 1.3382, + "step": 10135 + }, + { + "epoch": 1.7486414215474855, + "grad_norm": 0.58203125, + "learning_rate": 7.750661664212966e-07, + "loss": 1.3175, + "step": 10136 + }, + { + "epoch": 1.7488139394462174, + "grad_norm": 0.62109375, + "learning_rate": 7.74015827782505e-07, + "loss": 1.522, + "step": 10137 + }, + { + "epoch": 1.7489864573449494, + "grad_norm": 0.60546875, + "learning_rate": 7.72966172662265e-07, + "loss": 1.5441, + "step": 10138 + }, + { + "epoch": 1.7491589752436816, + "grad_norm": 0.57421875, + "learning_rate": 7.719172011383468e-07, + "loss": 1.3934, + "step": 10139 + }, + { + "epoch": 1.7493314931424135, + "grad_norm": 0.56640625, + "learning_rate": 7.708689132884606e-07, + "loss": 1.4005, + "step": 10140 + }, + { + "epoch": 1.7495040110411455, + "grad_norm": 0.6171875, + "learning_rate": 7.698213091902718e-07, + "loss": 1.5491, + "step": 10141 + }, + { + "epoch": 1.7496765289398775, + "grad_norm": 0.890625, + "learning_rate": 7.687743889213939e-07, + "loss": 1.431, + "step": 10142 + }, + { + "epoch": 1.7498490468386096, + "grad_norm": 0.8984375, + "learning_rate": 7.677281525593871e-07, + "loss": 1.4588, + "step": 10143 + }, + { + "epoch": 1.7500215647373416, + "grad_norm": 0.5859375, + "learning_rate": 7.666826001817651e-07, + "loss": 1.4738, + "step": 10144 + }, + { + "epoch": 1.7501940826360736, + "grad_norm": 0.56640625, + "learning_rate": 7.656377318659891e-07, + "loss": 1.3973, + "step": 10145 + }, + { + "epoch": 1.7503666005348055, + "grad_norm": 0.56640625, + "learning_rate": 7.645935476894684e-07, + "loss": 1.4568, + "step": 10146 + }, + { + "epoch": 1.7505391184335375, + "grad_norm": 0.578125, + "learning_rate": 7.635500477295632e-07, + "loss": 1.3341, + "step": 10147 + }, + { + "epoch": 1.7507116363322694, + "grad_norm": 0.7421875, + "learning_rate": 7.625072320635829e-07, + "loss": 1.3969, + "step": 10148 + }, + { + "epoch": 1.7508841542310014, + "grad_norm": 0.5859375, + "learning_rate": 7.614651007687857e-07, + "loss": 1.5004, + "step": 10149 + }, + { + "epoch": 1.7510566721297334, + "grad_norm": 0.59375, + "learning_rate": 7.604236539223797e-07, + "loss": 1.4903, + "step": 10150 + }, + { + "epoch": 1.7512291900284653, + "grad_norm": 0.5546875, + "learning_rate": 7.593828916015178e-07, + "loss": 1.5149, + "step": 10151 + }, + { + "epoch": 1.7514017079271973, + "grad_norm": 0.60546875, + "learning_rate": 7.583428138833126e-07, + "loss": 1.3694, + "step": 10152 + }, + { + "epoch": 1.7515742258259295, + "grad_norm": 0.56640625, + "learning_rate": 7.573034208448149e-07, + "loss": 1.3829, + "step": 10153 + }, + { + "epoch": 1.7517467437246614, + "grad_norm": 0.59765625, + "learning_rate": 7.562647125630307e-07, + "loss": 1.4432, + "step": 10154 + }, + { + "epoch": 1.7519192616233934, + "grad_norm": 0.58984375, + "learning_rate": 7.55226689114914e-07, + "loss": 1.4537, + "step": 10155 + }, + { + "epoch": 1.7520917795221256, + "grad_norm": 0.5859375, + "learning_rate": 7.541893505773679e-07, + "loss": 1.4597, + "step": 10156 + }, + { + "epoch": 1.7522642974208575, + "grad_norm": 0.6015625, + "learning_rate": 7.531526970272463e-07, + "loss": 1.3658, + "step": 10157 + }, + { + "epoch": 1.7524368153195895, + "grad_norm": 0.61328125, + "learning_rate": 7.5211672854135e-07, + "loss": 1.4581, + "step": 10158 + }, + { + "epoch": 1.7526093332183215, + "grad_norm": 0.58984375, + "learning_rate": 7.510814451964277e-07, + "loss": 1.4452, + "step": 10159 + }, + { + "epoch": 1.7527818511170534, + "grad_norm": 0.55078125, + "learning_rate": 7.500468470691846e-07, + "loss": 1.4134, + "step": 10160 + }, + { + "epoch": 1.7529543690157854, + "grad_norm": 0.546875, + "learning_rate": 7.490129342362662e-07, + "loss": 1.3958, + "step": 10161 + }, + { + "epoch": 1.7531268869145173, + "grad_norm": 0.59375, + "learning_rate": 7.479797067742711e-07, + "loss": 1.4252, + "step": 10162 + }, + { + "epoch": 1.7532994048132493, + "grad_norm": 0.578125, + "learning_rate": 7.469471647597515e-07, + "loss": 1.4293, + "step": 10163 + }, + { + "epoch": 1.7534719227119813, + "grad_norm": 0.5625, + "learning_rate": 7.459153082691994e-07, + "loss": 1.4295, + "step": 10164 + }, + { + "epoch": 1.7536444406107132, + "grad_norm": 0.5625, + "learning_rate": 7.448841373790639e-07, + "loss": 1.3745, + "step": 10165 + }, + { + "epoch": 1.7538169585094452, + "grad_norm": 0.6015625, + "learning_rate": 7.438536521657402e-07, + "loss": 1.401, + "step": 10166 + }, + { + "epoch": 1.7539894764081774, + "grad_norm": 0.59765625, + "learning_rate": 7.428238527055731e-07, + "loss": 1.4449, + "step": 10167 + }, + { + "epoch": 1.7541619943069093, + "grad_norm": 0.60546875, + "learning_rate": 7.41794739074857e-07, + "loss": 1.3888, + "step": 10168 + }, + { + "epoch": 1.7543345122056413, + "grad_norm": 0.64453125, + "learning_rate": 7.407663113498353e-07, + "loss": 1.4028, + "step": 10169 + }, + { + "epoch": 1.7545070301043735, + "grad_norm": 0.5390625, + "learning_rate": 7.397385696066972e-07, + "loss": 1.3718, + "step": 10170 + }, + { + "epoch": 1.7546795480031054, + "grad_norm": 0.5859375, + "learning_rate": 7.387115139215895e-07, + "loss": 1.3943, + "step": 10171 + }, + { + "epoch": 1.7548520659018374, + "grad_norm": 0.60546875, + "learning_rate": 7.376851443705968e-07, + "loss": 1.5253, + "step": 10172 + }, + { + "epoch": 1.7550245838005694, + "grad_norm": 0.578125, + "learning_rate": 7.36659461029765e-07, + "loss": 1.4728, + "step": 10173 + }, + { + "epoch": 1.7551971016993013, + "grad_norm": 0.61328125, + "learning_rate": 7.35634463975079e-07, + "loss": 1.4457, + "step": 10174 + }, + { + "epoch": 1.7553696195980333, + "grad_norm": 0.9921875, + "learning_rate": 7.346101532824789e-07, + "loss": 1.398, + "step": 10175 + }, + { + "epoch": 1.7555421374967652, + "grad_norm": 0.9921875, + "learning_rate": 7.33586529027851e-07, + "loss": 1.4072, + "step": 10176 + }, + { + "epoch": 1.7557146553954972, + "grad_norm": 0.640625, + "learning_rate": 7.325635912870321e-07, + "loss": 1.4049, + "step": 10177 + }, + { + "epoch": 1.7558871732942292, + "grad_norm": 0.5625, + "learning_rate": 7.315413401358084e-07, + "loss": 1.529, + "step": 10178 + }, + { + "epoch": 1.7560596911929611, + "grad_norm": 0.55859375, + "learning_rate": 7.305197756499139e-07, + "loss": 1.4471, + "step": 10179 + }, + { + "epoch": 1.7562322090916933, + "grad_norm": 0.59375, + "learning_rate": 7.294988979050333e-07, + "loss": 1.4572, + "step": 10180 + }, + { + "epoch": 1.7564047269904253, + "grad_norm": 0.61328125, + "learning_rate": 7.284787069767984e-07, + "loss": 1.3271, + "step": 10181 + }, + { + "epoch": 1.7565772448891572, + "grad_norm": 0.5703125, + "learning_rate": 7.274592029407946e-07, + "loss": 1.4513, + "step": 10182 + }, + { + "epoch": 1.7567497627878892, + "grad_norm": 0.58984375, + "learning_rate": 7.264403858725466e-07, + "loss": 1.504, + "step": 10183 + }, + { + "epoch": 1.7569222806866214, + "grad_norm": 0.55078125, + "learning_rate": 7.254222558475421e-07, + "loss": 1.3852, + "step": 10184 + }, + { + "epoch": 1.7570947985853533, + "grad_norm": 0.5859375, + "learning_rate": 7.244048129412051e-07, + "loss": 1.5207, + "step": 10185 + }, + { + "epoch": 1.7572673164840853, + "grad_norm": 0.5703125, + "learning_rate": 7.233880572289165e-07, + "loss": 1.4163, + "step": 10186 + }, + { + "epoch": 1.7574398343828173, + "grad_norm": 0.60546875, + "learning_rate": 7.223719887860037e-07, + "loss": 1.4681, + "step": 10187 + }, + { + "epoch": 1.7576123522815492, + "grad_norm": 0.625, + "learning_rate": 7.213566076877431e-07, + "loss": 1.3599, + "step": 10188 + }, + { + "epoch": 1.7577848701802812, + "grad_norm": 0.6796875, + "learning_rate": 7.203419140093604e-07, + "loss": 1.3668, + "step": 10189 + }, + { + "epoch": 1.7579573880790131, + "grad_norm": 0.59375, + "learning_rate": 7.193279078260329e-07, + "loss": 1.4273, + "step": 10190 + }, + { + "epoch": 1.758129905977745, + "grad_norm": 0.59375, + "learning_rate": 7.183145892128785e-07, + "loss": 1.4715, + "step": 10191 + }, + { + "epoch": 1.758302423876477, + "grad_norm": 0.58203125, + "learning_rate": 7.17301958244978e-07, + "loss": 1.4836, + "step": 10192 + }, + { + "epoch": 1.758474941775209, + "grad_norm": 0.66015625, + "learning_rate": 7.162900149973473e-07, + "loss": 1.362, + "step": 10193 + }, + { + "epoch": 1.7586474596739412, + "grad_norm": 0.6328125, + "learning_rate": 7.152787595449573e-07, + "loss": 1.3473, + "step": 10194 + }, + { + "epoch": 1.7588199775726732, + "grad_norm": 0.59765625, + "learning_rate": 7.142681919627348e-07, + "loss": 1.4041, + "step": 10195 + }, + { + "epoch": 1.7589924954714051, + "grad_norm": 0.578125, + "learning_rate": 7.132583123255421e-07, + "loss": 1.3846, + "step": 10196 + }, + { + "epoch": 1.7591650133701373, + "grad_norm": 0.72265625, + "learning_rate": 7.122491207082006e-07, + "loss": 1.3585, + "step": 10197 + }, + { + "epoch": 1.7593375312688693, + "grad_norm": 0.6171875, + "learning_rate": 7.112406171854758e-07, + "loss": 1.4366, + "step": 10198 + }, + { + "epoch": 1.7595100491676012, + "grad_norm": 0.59765625, + "learning_rate": 7.102328018320859e-07, + "loss": 1.4385, + "step": 10199 + }, + { + "epoch": 1.7596825670663332, + "grad_norm": 0.62890625, + "learning_rate": 7.092256747226944e-07, + "loss": 1.4074, + "step": 10200 + }, + { + "epoch": 1.7596825670663332, + "eval_loss": 1.4070675373077393, + "eval_runtime": 10.9117, + "eval_samples_per_second": 93.844, + "eval_steps_per_second": 23.461, + "step": 10200 + }, + { + "epoch": 1.7598550849650652, + "grad_norm": 0.578125, + "learning_rate": 7.082192359319184e-07, + "loss": 1.3849, + "step": 10201 + }, + { + "epoch": 1.7600276028637971, + "grad_norm": 0.59765625, + "learning_rate": 7.07213485534316e-07, + "loss": 1.3763, + "step": 10202 + }, + { + "epoch": 1.760200120762529, + "grad_norm": 0.6171875, + "learning_rate": 7.062084236044065e-07, + "loss": 1.4973, + "step": 10203 + }, + { + "epoch": 1.760372638661261, + "grad_norm": 0.59375, + "learning_rate": 7.052040502166424e-07, + "loss": 1.359, + "step": 10204 + }, + { + "epoch": 1.760545156559993, + "grad_norm": 0.5703125, + "learning_rate": 7.042003654454432e-07, + "loss": 1.4431, + "step": 10205 + }, + { + "epoch": 1.760717674458725, + "grad_norm": 0.5703125, + "learning_rate": 7.031973693651617e-07, + "loss": 1.4034, + "step": 10206 + }, + { + "epoch": 1.7608901923574571, + "grad_norm": 0.58203125, + "learning_rate": 7.021950620501084e-07, + "loss": 1.5532, + "step": 10207 + }, + { + "epoch": 1.761062710256189, + "grad_norm": 0.64453125, + "learning_rate": 7.011934435745404e-07, + "loss": 1.3747, + "step": 10208 + }, + { + "epoch": 1.761235228154921, + "grad_norm": 0.55859375, + "learning_rate": 7.001925140126631e-07, + "loss": 1.4181, + "step": 10209 + }, + { + "epoch": 1.761407746053653, + "grad_norm": 0.578125, + "learning_rate": 6.991922734386336e-07, + "loss": 1.3301, + "step": 10210 + }, + { + "epoch": 1.7615802639523852, + "grad_norm": 0.59765625, + "learning_rate": 6.981927219265527e-07, + "loss": 1.5361, + "step": 10211 + }, + { + "epoch": 1.7617527818511172, + "grad_norm": 0.7265625, + "learning_rate": 6.971938595504768e-07, + "loss": 1.4583, + "step": 10212 + }, + { + "epoch": 1.7619252997498491, + "grad_norm": 0.62109375, + "learning_rate": 6.961956863844055e-07, + "loss": 1.4323, + "step": 10213 + }, + { + "epoch": 1.762097817648581, + "grad_norm": 0.60546875, + "learning_rate": 6.951982025022929e-07, + "loss": 1.3942, + "step": 10214 + }, + { + "epoch": 1.762270335547313, + "grad_norm": 0.5703125, + "learning_rate": 6.942014079780335e-07, + "loss": 1.4092, + "step": 10215 + }, + { + "epoch": 1.762442853446045, + "grad_norm": 0.5625, + "learning_rate": 6.932053028854813e-07, + "loss": 1.4492, + "step": 10216 + }, + { + "epoch": 1.762615371344777, + "grad_norm": 0.578125, + "learning_rate": 6.922098872984317e-07, + "loss": 1.4895, + "step": 10217 + }, + { + "epoch": 1.762787889243509, + "grad_norm": 0.56640625, + "learning_rate": 6.912151612906303e-07, + "loss": 1.4247, + "step": 10218 + }, + { + "epoch": 1.762960407142241, + "grad_norm": 0.5859375, + "learning_rate": 6.902211249357738e-07, + "loss": 1.4815, + "step": 10219 + }, + { + "epoch": 1.7631329250409729, + "grad_norm": 0.57421875, + "learning_rate": 6.892277783075063e-07, + "loss": 1.4463, + "step": 10220 + }, + { + "epoch": 1.763305442939705, + "grad_norm": 0.59765625, + "learning_rate": 6.882351214794225e-07, + "loss": 1.422, + "step": 10221 + }, + { + "epoch": 1.763477960838437, + "grad_norm": 0.60546875, + "learning_rate": 6.872431545250636e-07, + "loss": 1.5413, + "step": 10222 + }, + { + "epoch": 1.763650478737169, + "grad_norm": 0.58984375, + "learning_rate": 6.862518775179183e-07, + "loss": 1.4846, + "step": 10223 + }, + { + "epoch": 1.763822996635901, + "grad_norm": 0.58984375, + "learning_rate": 6.852612905314326e-07, + "loss": 1.4419, + "step": 10224 + }, + { + "epoch": 1.763995514534633, + "grad_norm": 0.56640625, + "learning_rate": 6.842713936389889e-07, + "loss": 1.3773, + "step": 10225 + }, + { + "epoch": 1.764168032433365, + "grad_norm": 0.59375, + "learning_rate": 6.832821869139272e-07, + "loss": 1.4648, + "step": 10226 + }, + { + "epoch": 1.764340550332097, + "grad_norm": 0.890625, + "learning_rate": 6.822936704295369e-07, + "loss": 1.3654, + "step": 10227 + }, + { + "epoch": 1.764513068230829, + "grad_norm": 0.5859375, + "learning_rate": 6.813058442590504e-07, + "loss": 1.4976, + "step": 10228 + }, + { + "epoch": 1.764685586129561, + "grad_norm": 0.6015625, + "learning_rate": 6.803187084756524e-07, + "loss": 1.371, + "step": 10229 + }, + { + "epoch": 1.764858104028293, + "grad_norm": 0.60546875, + "learning_rate": 6.793322631524768e-07, + "loss": 1.4333, + "step": 10230 + }, + { + "epoch": 1.7650306219270249, + "grad_norm": 0.59765625, + "learning_rate": 6.78346508362605e-07, + "loss": 1.4147, + "step": 10231 + }, + { + "epoch": 1.7652031398257568, + "grad_norm": 0.62890625, + "learning_rate": 6.773614441790677e-07, + "loss": 1.3943, + "step": 10232 + }, + { + "epoch": 1.7653756577244888, + "grad_norm": 0.58984375, + "learning_rate": 6.763770706748462e-07, + "loss": 1.4275, + "step": 10233 + }, + { + "epoch": 1.7655481756232208, + "grad_norm": 0.6328125, + "learning_rate": 6.75393387922868e-07, + "loss": 1.3645, + "step": 10234 + }, + { + "epoch": 1.765720693521953, + "grad_norm": 0.68359375, + "learning_rate": 6.744103959960113e-07, + "loss": 1.3754, + "step": 10235 + }, + { + "epoch": 1.765893211420685, + "grad_norm": 0.58203125, + "learning_rate": 6.734280949670991e-07, + "loss": 1.3763, + "step": 10236 + }, + { + "epoch": 1.7660657293194169, + "grad_norm": 0.578125, + "learning_rate": 6.724464849089107e-07, + "loss": 1.4541, + "step": 10237 + }, + { + "epoch": 1.766238247218149, + "grad_norm": 0.578125, + "learning_rate": 6.714655658941671e-07, + "loss": 1.3945, + "step": 10238 + }, + { + "epoch": 1.766410765116881, + "grad_norm": 0.57421875, + "learning_rate": 6.704853379955423e-07, + "loss": 1.3877, + "step": 10239 + }, + { + "epoch": 1.766583283015613, + "grad_norm": 0.5546875, + "learning_rate": 6.69505801285657e-07, + "loss": 1.4581, + "step": 10240 + }, + { + "epoch": 1.766755800914345, + "grad_norm": 0.77734375, + "learning_rate": 6.68526955837081e-07, + "loss": 1.4175, + "step": 10241 + }, + { + "epoch": 1.7669283188130769, + "grad_norm": 0.59375, + "learning_rate": 6.675488017223342e-07, + "loss": 1.4392, + "step": 10242 + }, + { + "epoch": 1.7671008367118088, + "grad_norm": 0.609375, + "learning_rate": 6.665713390138839e-07, + "loss": 1.3622, + "step": 10243 + }, + { + "epoch": 1.7672733546105408, + "grad_norm": 0.5703125, + "learning_rate": 6.655945677841457e-07, + "loss": 1.3939, + "step": 10244 + }, + { + "epoch": 1.7674458725092728, + "grad_norm": 0.5703125, + "learning_rate": 6.646184881054874e-07, + "loss": 1.3592, + "step": 10245 + }, + { + "epoch": 1.7676183904080047, + "grad_norm": 0.86328125, + "learning_rate": 6.636431000502231e-07, + "loss": 1.4977, + "step": 10246 + }, + { + "epoch": 1.7677909083067367, + "grad_norm": 0.59765625, + "learning_rate": 6.626684036906106e-07, + "loss": 1.4954, + "step": 10247 + }, + { + "epoch": 1.7679634262054689, + "grad_norm": 0.58203125, + "learning_rate": 6.616943990988689e-07, + "loss": 1.4395, + "step": 10248 + }, + { + "epoch": 1.7681359441042008, + "grad_norm": 0.5546875, + "learning_rate": 6.607210863471525e-07, + "loss": 1.4819, + "step": 10249 + }, + { + "epoch": 1.7683084620029328, + "grad_norm": 0.55859375, + "learning_rate": 6.597484655075726e-07, + "loss": 1.3285, + "step": 10250 + }, + { + "epoch": 1.7684809799016648, + "grad_norm": 0.58984375, + "learning_rate": 6.58776536652187e-07, + "loss": 1.4535, + "step": 10251 + }, + { + "epoch": 1.768653497800397, + "grad_norm": 0.5625, + "learning_rate": 6.578052998530016e-07, + "loss": 1.539, + "step": 10252 + }, + { + "epoch": 1.768826015699129, + "grad_norm": 0.578125, + "learning_rate": 6.568347551819731e-07, + "loss": 1.4043, + "step": 10253 + }, + { + "epoch": 1.7689985335978609, + "grad_norm": 0.7109375, + "learning_rate": 6.558649027110054e-07, + "loss": 1.3375, + "step": 10254 + }, + { + "epoch": 1.7691710514965928, + "grad_norm": 0.5859375, + "learning_rate": 6.548957425119484e-07, + "loss": 1.4756, + "step": 10255 + }, + { + "epoch": 1.7693435693953248, + "grad_norm": 0.59375, + "learning_rate": 6.539272746566083e-07, + "loss": 1.4516, + "step": 10256 + }, + { + "epoch": 1.7695160872940567, + "grad_norm": 0.6015625, + "learning_rate": 6.529594992167321e-07, + "loss": 1.3337, + "step": 10257 + }, + { + "epoch": 1.7696886051927887, + "grad_norm": 0.57421875, + "learning_rate": 6.519924162640168e-07, + "loss": 1.5003, + "step": 10258 + }, + { + "epoch": 1.7698611230915207, + "grad_norm": 0.6875, + "learning_rate": 6.510260258701151e-07, + "loss": 1.476, + "step": 10259 + }, + { + "epoch": 1.7700336409902526, + "grad_norm": 0.58203125, + "learning_rate": 6.500603281066175e-07, + "loss": 1.3975, + "step": 10260 + }, + { + "epoch": 1.7702061588889846, + "grad_norm": 0.59375, + "learning_rate": 6.490953230450758e-07, + "loss": 1.4206, + "step": 10261 + }, + { + "epoch": 1.7703786767877168, + "grad_norm": 0.69140625, + "learning_rate": 6.481310107569772e-07, + "loss": 1.4545, + "step": 10262 + }, + { + "epoch": 1.7705511946864487, + "grad_norm": 0.55859375, + "learning_rate": 6.471673913137666e-07, + "loss": 1.5057, + "step": 10263 + }, + { + "epoch": 1.7707237125851807, + "grad_norm": 0.56640625, + "learning_rate": 6.46204464786836e-07, + "loss": 1.3801, + "step": 10264 + }, + { + "epoch": 1.7708962304839129, + "grad_norm": 0.5546875, + "learning_rate": 6.452422312475226e-07, + "loss": 1.4318, + "step": 10265 + }, + { + "epoch": 1.7710687483826448, + "grad_norm": 0.640625, + "learning_rate": 6.442806907671162e-07, + "loss": 1.4907, + "step": 10266 + }, + { + "epoch": 1.7712412662813768, + "grad_norm": 0.65625, + "learning_rate": 6.433198434168552e-07, + "loss": 1.512, + "step": 10267 + }, + { + "epoch": 1.7714137841801088, + "grad_norm": 0.53125, + "learning_rate": 6.423596892679207e-07, + "loss": 1.3861, + "step": 10268 + }, + { + "epoch": 1.7715863020788407, + "grad_norm": 0.5859375, + "learning_rate": 6.414002283914522e-07, + "loss": 1.4541, + "step": 10269 + }, + { + "epoch": 1.7717588199775727, + "grad_norm": 0.5625, + "learning_rate": 6.404414608585285e-07, + "loss": 1.5342, + "step": 10270 + }, + { + "epoch": 1.7719313378763046, + "grad_norm": 0.55859375, + "learning_rate": 6.394833867401829e-07, + "loss": 1.4425, + "step": 10271 + }, + { + "epoch": 1.7721038557750366, + "grad_norm": 0.73828125, + "learning_rate": 6.385260061073962e-07, + "loss": 1.4565, + "step": 10272 + }, + { + "epoch": 1.7722763736737686, + "grad_norm": 0.59765625, + "learning_rate": 6.37569319031095e-07, + "loss": 1.4543, + "step": 10273 + }, + { + "epoch": 1.7724488915725005, + "grad_norm": 0.609375, + "learning_rate": 6.366133255821572e-07, + "loss": 1.4817, + "step": 10274 + }, + { + "epoch": 1.7726214094712325, + "grad_norm": 0.5703125, + "learning_rate": 6.356580258314105e-07, + "loss": 1.3968, + "step": 10275 + }, + { + "epoch": 1.7727939273699647, + "grad_norm": 0.578125, + "learning_rate": 6.347034198496271e-07, + "loss": 1.4356, + "step": 10276 + }, + { + "epoch": 1.7729664452686966, + "grad_norm": 0.70703125, + "learning_rate": 6.337495077075328e-07, + "loss": 1.4766, + "step": 10277 + }, + { + "epoch": 1.7731389631674286, + "grad_norm": 0.609375, + "learning_rate": 6.327962894757988e-07, + "loss": 1.5474, + "step": 10278 + }, + { + "epoch": 1.7733114810661608, + "grad_norm": 0.5703125, + "learning_rate": 6.31843765225042e-07, + "loss": 1.4506, + "step": 10279 + }, + { + "epoch": 1.7734839989648927, + "grad_norm": 0.62109375, + "learning_rate": 6.308919350258369e-07, + "loss": 1.4491, + "step": 10280 + }, + { + "epoch": 1.7736565168636247, + "grad_norm": 0.59375, + "learning_rate": 6.29940798948696e-07, + "loss": 1.4158, + "step": 10281 + }, + { + "epoch": 1.7738290347623566, + "grad_norm": 0.56640625, + "learning_rate": 6.289903570640887e-07, + "loss": 1.419, + "step": 10282 + }, + { + "epoch": 1.7740015526610886, + "grad_norm": 0.59765625, + "learning_rate": 6.280406094424285e-07, + "loss": 1.4394, + "step": 10283 + }, + { + "epoch": 1.7741740705598206, + "grad_norm": 1.4296875, + "learning_rate": 6.27091556154078e-07, + "loss": 1.5351, + "step": 10284 + }, + { + "epoch": 1.7743465884585525, + "grad_norm": 0.5703125, + "learning_rate": 6.261431972693499e-07, + "loss": 1.3564, + "step": 10285 + }, + { + "epoch": 1.7745191063572845, + "grad_norm": 0.55078125, + "learning_rate": 6.251955328585057e-07, + "loss": 1.3911, + "step": 10286 + }, + { + "epoch": 1.7746916242560165, + "grad_norm": 0.5546875, + "learning_rate": 6.242485629917494e-07, + "loss": 1.4094, + "step": 10287 + }, + { + "epoch": 1.7748641421547484, + "grad_norm": 0.609375, + "learning_rate": 6.233022877392458e-07, + "loss": 1.4178, + "step": 10288 + }, + { + "epoch": 1.7750366600534806, + "grad_norm": 0.60546875, + "learning_rate": 6.223567071710946e-07, + "loss": 1.3358, + "step": 10289 + }, + { + "epoch": 1.7752091779522126, + "grad_norm": 0.60546875, + "learning_rate": 6.214118213573517e-07, + "loss": 1.4563, + "step": 10290 + }, + { + "epoch": 1.7753816958509445, + "grad_norm": 0.62109375, + "learning_rate": 6.204676303680246e-07, + "loss": 1.373, + "step": 10291 + }, + { + "epoch": 1.7755542137496765, + "grad_norm": 0.59375, + "learning_rate": 6.195241342730585e-07, + "loss": 1.5764, + "step": 10292 + }, + { + "epoch": 1.7757267316484087, + "grad_norm": 0.6171875, + "learning_rate": 6.185813331423584e-07, + "loss": 1.4124, + "step": 10293 + }, + { + "epoch": 1.7758992495471406, + "grad_norm": 0.6171875, + "learning_rate": 6.176392270457709e-07, + "loss": 1.3338, + "step": 10294 + }, + { + "epoch": 1.7760717674458726, + "grad_norm": 0.6640625, + "learning_rate": 6.166978160530923e-07, + "loss": 1.4685, + "step": 10295 + }, + { + "epoch": 1.7762442853446045, + "grad_norm": 0.58984375, + "learning_rate": 6.15757100234069e-07, + "loss": 1.4325, + "step": 10296 + }, + { + "epoch": 1.7764168032433365, + "grad_norm": 0.5859375, + "learning_rate": 6.148170796583963e-07, + "loss": 1.3772, + "step": 10297 + }, + { + "epoch": 1.7765893211420685, + "grad_norm": 0.578125, + "learning_rate": 6.138777543957141e-07, + "loss": 1.4324, + "step": 10298 + }, + { + "epoch": 1.7767618390408004, + "grad_norm": 0.59765625, + "learning_rate": 6.129391245156168e-07, + "loss": 1.5199, + "step": 10299 + }, + { + "epoch": 1.7769343569395324, + "grad_norm": 0.55859375, + "learning_rate": 6.1200119008764e-07, + "loss": 1.4584, + "step": 10300 + }, + { + "epoch": 1.7769343569395324, + "eval_loss": 1.4070483446121216, + "eval_runtime": 11.0405, + "eval_samples_per_second": 92.75, + "eval_steps_per_second": 23.187, + "step": 10300 + }, + { + "epoch": 1.7771068748382643, + "grad_norm": 0.6171875, + "learning_rate": 6.110639511812765e-07, + "loss": 1.3487, + "step": 10301 + }, + { + "epoch": 1.7772793927369963, + "grad_norm": 0.58984375, + "learning_rate": 6.101274078659591e-07, + "loss": 1.4138, + "step": 10302 + }, + { + "epoch": 1.7774519106357285, + "grad_norm": 0.61328125, + "learning_rate": 6.091915602110743e-07, + "loss": 1.4851, + "step": 10303 + }, + { + "epoch": 1.7776244285344605, + "grad_norm": 0.66796875, + "learning_rate": 6.082564082859543e-07, + "loss": 1.4141, + "step": 10304 + }, + { + "epoch": 1.7777969464331924, + "grad_norm": 0.58984375, + "learning_rate": 6.073219521598828e-07, + "loss": 1.369, + "step": 10305 + }, + { + "epoch": 1.7779694643319246, + "grad_norm": 0.6171875, + "learning_rate": 6.063881919020887e-07, + "loss": 1.3713, + "step": 10306 + }, + { + "epoch": 1.7781419822306566, + "grad_norm": 0.84765625, + "learning_rate": 6.05455127581751e-07, + "loss": 1.5123, + "step": 10307 + }, + { + "epoch": 1.7783145001293885, + "grad_norm": 0.609375, + "learning_rate": 6.04522759267997e-07, + "loss": 1.4518, + "step": 10308 + }, + { + "epoch": 1.7784870180281205, + "grad_norm": 0.578125, + "learning_rate": 6.035910870299033e-07, + "loss": 1.435, + "step": 10309 + }, + { + "epoch": 1.7786595359268524, + "grad_norm": 0.640625, + "learning_rate": 6.026601109364949e-07, + "loss": 1.5177, + "step": 10310 + }, + { + "epoch": 1.7788320538255844, + "grad_norm": 0.58203125, + "learning_rate": 6.017298310567399e-07, + "loss": 1.4735, + "step": 10311 + }, + { + "epoch": 1.7790045717243164, + "grad_norm": 0.55859375, + "learning_rate": 6.008002474595653e-07, + "loss": 1.4801, + "step": 10312 + }, + { + "epoch": 1.7791770896230483, + "grad_norm": 0.55859375, + "learning_rate": 5.998713602138351e-07, + "loss": 1.3985, + "step": 10313 + }, + { + "epoch": 1.7793496075217803, + "grad_norm": 0.61328125, + "learning_rate": 5.989431693883696e-07, + "loss": 1.4779, + "step": 10314 + }, + { + "epoch": 1.7795221254205122, + "grad_norm": 0.69921875, + "learning_rate": 5.98015675051935e-07, + "loss": 1.415, + "step": 10315 + }, + { + "epoch": 1.7796946433192442, + "grad_norm": 0.59375, + "learning_rate": 5.970888772732453e-07, + "loss": 1.4775, + "step": 10316 + }, + { + "epoch": 1.7798671612179764, + "grad_norm": 0.6015625, + "learning_rate": 5.961627761209632e-07, + "loss": 1.4578, + "step": 10317 + }, + { + "epoch": 1.7800396791167084, + "grad_norm": 0.609375, + "learning_rate": 5.952373716637016e-07, + "loss": 1.415, + "step": 10318 + }, + { + "epoch": 1.7802121970154403, + "grad_norm": 0.54296875, + "learning_rate": 5.94312663970017e-07, + "loss": 1.3462, + "step": 10319 + }, + { + "epoch": 1.7803847149141725, + "grad_norm": 0.578125, + "learning_rate": 5.933886531084232e-07, + "loss": 1.4286, + "step": 10320 + }, + { + "epoch": 1.7805572328129045, + "grad_norm": 0.58984375, + "learning_rate": 5.924653391473689e-07, + "loss": 1.4845, + "step": 10321 + }, + { + "epoch": 1.7807297507116364, + "grad_norm": 0.58203125, + "learning_rate": 5.915427221552672e-07, + "loss": 1.4518, + "step": 10322 + }, + { + "epoch": 1.7809022686103684, + "grad_norm": 0.609375, + "learning_rate": 5.906208022004656e-07, + "loss": 1.4583, + "step": 10323 + }, + { + "epoch": 1.7810747865091003, + "grad_norm": 0.59375, + "learning_rate": 5.89699579351266e-07, + "loss": 1.3691, + "step": 10324 + }, + { + "epoch": 1.7812473044078323, + "grad_norm": 0.5625, + "learning_rate": 5.88779053675923e-07, + "loss": 1.4326, + "step": 10325 + }, + { + "epoch": 1.7814198223065643, + "grad_norm": 0.53515625, + "learning_rate": 5.878592252426296e-07, + "loss": 1.4378, + "step": 10326 + }, + { + "epoch": 1.7815923402052962, + "grad_norm": 0.6015625, + "learning_rate": 5.869400941195357e-07, + "loss": 1.488, + "step": 10327 + }, + { + "epoch": 1.7817648581040282, + "grad_norm": 0.58203125, + "learning_rate": 5.860216603747349e-07, + "loss": 1.3521, + "step": 10328 + }, + { + "epoch": 1.7819373760027601, + "grad_norm": 0.56640625, + "learning_rate": 5.851039240762702e-07, + "loss": 1.4167, + "step": 10329 + }, + { + "epoch": 1.7821098939014923, + "grad_norm": 0.5859375, + "learning_rate": 5.84186885292134e-07, + "loss": 1.5135, + "step": 10330 + }, + { + "epoch": 1.7822824118002243, + "grad_norm": 0.6171875, + "learning_rate": 5.832705440902675e-07, + "loss": 1.4273, + "step": 10331 + }, + { + "epoch": 1.7824549296989562, + "grad_norm": 0.5703125, + "learning_rate": 5.823549005385543e-07, + "loss": 1.5056, + "step": 10332 + }, + { + "epoch": 1.7826274475976882, + "grad_norm": 0.56640625, + "learning_rate": 5.814399547048378e-07, + "loss": 1.4058, + "step": 10333 + }, + { + "epoch": 1.7827999654964204, + "grad_norm": 0.62109375, + "learning_rate": 5.80525706656897e-07, + "loss": 1.4132, + "step": 10334 + }, + { + "epoch": 1.7829724833951524, + "grad_norm": 0.60546875, + "learning_rate": 5.796121564624679e-07, + "loss": 1.4934, + "step": 10335 + }, + { + "epoch": 1.7831450012938843, + "grad_norm": 0.56640625, + "learning_rate": 5.786993041892319e-07, + "loss": 1.3976, + "step": 10336 + }, + { + "epoch": 1.7833175191926163, + "grad_norm": 0.578125, + "learning_rate": 5.777871499048182e-07, + "loss": 1.5118, + "step": 10337 + }, + { + "epoch": 1.7834900370913482, + "grad_norm": 0.58984375, + "learning_rate": 5.76875693676805e-07, + "loss": 1.3587, + "step": 10338 + }, + { + "epoch": 1.7836625549900802, + "grad_norm": 0.58984375, + "learning_rate": 5.759649355727182e-07, + "loss": 1.4272, + "step": 10339 + }, + { + "epoch": 1.7838350728888122, + "grad_norm": 0.63671875, + "learning_rate": 5.750548756600338e-07, + "loss": 1.4136, + "step": 10340 + }, + { + "epoch": 1.7840075907875441, + "grad_norm": 0.57421875, + "learning_rate": 5.741455140061747e-07, + "loss": 1.3687, + "step": 10341 + }, + { + "epoch": 1.784180108686276, + "grad_norm": 0.59765625, + "learning_rate": 5.732368506785113e-07, + "loss": 1.471, + "step": 10342 + }, + { + "epoch": 1.784352626585008, + "grad_norm": 0.6953125, + "learning_rate": 5.72328885744361e-07, + "loss": 1.3807, + "step": 10343 + }, + { + "epoch": 1.7845251444837402, + "grad_norm": 0.6171875, + "learning_rate": 5.714216192709976e-07, + "loss": 1.4743, + "step": 10344 + }, + { + "epoch": 1.7846976623824722, + "grad_norm": 0.5859375, + "learning_rate": 5.705150513256297e-07, + "loss": 1.3449, + "step": 10345 + }, + { + "epoch": 1.7848701802812041, + "grad_norm": 0.5859375, + "learning_rate": 5.696091819754268e-07, + "loss": 1.4331, + "step": 10346 + }, + { + "epoch": 1.7850426981799363, + "grad_norm": 0.59765625, + "learning_rate": 5.687040112874986e-07, + "loss": 1.3974, + "step": 10347 + }, + { + "epoch": 1.7852152160786683, + "grad_norm": 0.57421875, + "learning_rate": 5.67799539328906e-07, + "loss": 1.4973, + "step": 10348 + }, + { + "epoch": 1.7853877339774002, + "grad_norm": 0.62109375, + "learning_rate": 5.668957661666597e-07, + "loss": 1.46, + "step": 10349 + }, + { + "epoch": 1.7855602518761322, + "grad_norm": 0.61328125, + "learning_rate": 5.659926918677172e-07, + "loss": 1.3491, + "step": 10350 + }, + { + "epoch": 1.7857327697748642, + "grad_norm": 0.5625, + "learning_rate": 5.650903164989784e-07, + "loss": 1.4681, + "step": 10351 + }, + { + "epoch": 1.7859052876735961, + "grad_norm": 0.5546875, + "learning_rate": 5.641886401273056e-07, + "loss": 1.4334, + "step": 10352 + }, + { + "epoch": 1.786077805572328, + "grad_norm": 0.59375, + "learning_rate": 5.632876628194917e-07, + "loss": 1.3925, + "step": 10353 + }, + { + "epoch": 1.78625032347106, + "grad_norm": 0.6015625, + "learning_rate": 5.623873846422945e-07, + "loss": 1.4241, + "step": 10354 + }, + { + "epoch": 1.786422841369792, + "grad_norm": 0.7421875, + "learning_rate": 5.614878056624074e-07, + "loss": 1.3811, + "step": 10355 + }, + { + "epoch": 1.786595359268524, + "grad_norm": 0.5625, + "learning_rate": 5.60588925946477e-07, + "loss": 1.4115, + "step": 10356 + }, + { + "epoch": 1.7867678771672562, + "grad_norm": 0.6953125, + "learning_rate": 5.596907455611011e-07, + "loss": 1.3852, + "step": 10357 + }, + { + "epoch": 1.7869403950659881, + "grad_norm": 0.56640625, + "learning_rate": 5.5879326457282e-07, + "loss": 1.4252, + "step": 10358 + }, + { + "epoch": 1.78711291296472, + "grad_norm": 0.63671875, + "learning_rate": 5.578964830481249e-07, + "loss": 1.4649, + "step": 10359 + }, + { + "epoch": 1.787285430863452, + "grad_norm": 0.59375, + "learning_rate": 5.570004010534557e-07, + "loss": 1.436, + "step": 10360 + }, + { + "epoch": 1.7874579487621842, + "grad_norm": 0.5859375, + "learning_rate": 5.561050186551986e-07, + "loss": 1.457, + "step": 10361 + }, + { + "epoch": 1.7876304666609162, + "grad_norm": 0.5703125, + "learning_rate": 5.552103359196914e-07, + "loss": 1.4263, + "step": 10362 + }, + { + "epoch": 1.7878029845596481, + "grad_norm": 0.55859375, + "learning_rate": 5.543163529132168e-07, + "loss": 1.4285, + "step": 10363 + }, + { + "epoch": 1.78797550245838, + "grad_norm": 0.60546875, + "learning_rate": 5.534230697020027e-07, + "loss": 1.4432, + "step": 10364 + }, + { + "epoch": 1.788148020357112, + "grad_norm": 0.578125, + "learning_rate": 5.525304863522363e-07, + "loss": 1.394, + "step": 10365 + }, + { + "epoch": 1.788320538255844, + "grad_norm": 0.6328125, + "learning_rate": 5.516386029300391e-07, + "loss": 1.4578, + "step": 10366 + }, + { + "epoch": 1.788493056154576, + "grad_norm": 0.52734375, + "learning_rate": 5.507474195014917e-07, + "loss": 1.3135, + "step": 10367 + }, + { + "epoch": 1.788665574053308, + "grad_norm": 0.56640625, + "learning_rate": 5.498569361326168e-07, + "loss": 1.402, + "step": 10368 + }, + { + "epoch": 1.78883809195204, + "grad_norm": 0.71484375, + "learning_rate": 5.489671528893869e-07, + "loss": 1.4344, + "step": 10369 + }, + { + "epoch": 1.7890106098507719, + "grad_norm": 0.5703125, + "learning_rate": 5.480780698377241e-07, + "loss": 1.3027, + "step": 10370 + }, + { + "epoch": 1.789183127749504, + "grad_norm": 0.5625, + "learning_rate": 5.471896870434957e-07, + "loss": 1.4547, + "step": 10371 + }, + { + "epoch": 1.789355645648236, + "grad_norm": 0.52734375, + "learning_rate": 5.463020045725187e-07, + "loss": 1.3488, + "step": 10372 + }, + { + "epoch": 1.789528163546968, + "grad_norm": 0.52734375, + "learning_rate": 5.454150224905586e-07, + "loss": 1.4106, + "step": 10373 + }, + { + "epoch": 1.7897006814457, + "grad_norm": 0.59375, + "learning_rate": 5.445287408633304e-07, + "loss": 1.3909, + "step": 10374 + }, + { + "epoch": 1.7898731993444321, + "grad_norm": 0.58984375, + "learning_rate": 5.436431597564907e-07, + "loss": 1.5508, + "step": 10375 + }, + { + "epoch": 1.790045717243164, + "grad_norm": 0.5859375, + "learning_rate": 5.427582792356545e-07, + "loss": 1.4281, + "step": 10376 + }, + { + "epoch": 1.790218235141896, + "grad_norm": 0.66796875, + "learning_rate": 5.418740993663751e-07, + "loss": 1.4708, + "step": 10377 + }, + { + "epoch": 1.790390753040628, + "grad_norm": 0.5625, + "learning_rate": 5.409906202141602e-07, + "loss": 1.4345, + "step": 10378 + }, + { + "epoch": 1.79056327093936, + "grad_norm": 0.578125, + "learning_rate": 5.401078418444617e-07, + "loss": 1.4302, + "step": 10379 + }, + { + "epoch": 1.790735788838092, + "grad_norm": 0.5859375, + "learning_rate": 5.392257643226828e-07, + "loss": 1.4859, + "step": 10380 + }, + { + "epoch": 1.7909083067368239, + "grad_norm": 0.578125, + "learning_rate": 5.383443877141737e-07, + "loss": 1.4114, + "step": 10381 + }, + { + "epoch": 1.7910808246355558, + "grad_norm": 0.5859375, + "learning_rate": 5.374637120842308e-07, + "loss": 1.476, + "step": 10382 + }, + { + "epoch": 1.7912533425342878, + "grad_norm": 0.59375, + "learning_rate": 5.365837374981009e-07, + "loss": 1.531, + "step": 10383 + }, + { + "epoch": 1.7914258604330198, + "grad_norm": 0.58984375, + "learning_rate": 5.357044640209796e-07, + "loss": 1.4339, + "step": 10384 + }, + { + "epoch": 1.791598378331752, + "grad_norm": 0.6015625, + "learning_rate": 5.348258917180038e-07, + "loss": 1.3312, + "step": 10385 + }, + { + "epoch": 1.791770896230484, + "grad_norm": 0.83203125, + "learning_rate": 5.339480206542702e-07, + "loss": 1.4726, + "step": 10386 + }, + { + "epoch": 1.7919434141292159, + "grad_norm": 0.578125, + "learning_rate": 5.330708508948123e-07, + "loss": 1.3928, + "step": 10387 + }, + { + "epoch": 1.792115932027948, + "grad_norm": 0.57421875, + "learning_rate": 5.321943825046171e-07, + "loss": 1.4194, + "step": 10388 + }, + { + "epoch": 1.79228844992668, + "grad_norm": 0.5625, + "learning_rate": 5.313186155486216e-07, + "loss": 1.4375, + "step": 10389 + }, + { + "epoch": 1.792460967825412, + "grad_norm": 0.578125, + "learning_rate": 5.304435500917049e-07, + "loss": 1.4913, + "step": 10390 + }, + { + "epoch": 1.792633485724144, + "grad_norm": 0.5703125, + "learning_rate": 5.295691861986985e-07, + "loss": 1.3996, + "step": 10391 + }, + { + "epoch": 1.792806003622876, + "grad_norm": 0.578125, + "learning_rate": 5.286955239343816e-07, + "loss": 1.4013, + "step": 10392 + }, + { + "epoch": 1.7929785215216079, + "grad_norm": 0.59765625, + "learning_rate": 5.278225633634793e-07, + "loss": 1.3898, + "step": 10393 + }, + { + "epoch": 1.7931510394203398, + "grad_norm": 0.578125, + "learning_rate": 5.269503045506652e-07, + "loss": 1.4631, + "step": 10394 + }, + { + "epoch": 1.7933235573190718, + "grad_norm": 0.59375, + "learning_rate": 5.260787475605656e-07, + "loss": 1.4925, + "step": 10395 + }, + { + "epoch": 1.7934960752178037, + "grad_norm": 0.59375, + "learning_rate": 5.252078924577453e-07, + "loss": 1.5018, + "step": 10396 + }, + { + "epoch": 1.7936685931165357, + "grad_norm": 0.6484375, + "learning_rate": 5.243377393067284e-07, + "loss": 1.3756, + "step": 10397 + }, + { + "epoch": 1.7938411110152679, + "grad_norm": 0.5625, + "learning_rate": 5.234682881719766e-07, + "loss": 1.4001, + "step": 10398 + }, + { + "epoch": 1.7940136289139998, + "grad_norm": 0.58203125, + "learning_rate": 5.225995391179061e-07, + "loss": 1.419, + "step": 10399 + }, + { + "epoch": 1.7941861468127318, + "grad_norm": 0.5625, + "learning_rate": 5.21731492208879e-07, + "loss": 1.3098, + "step": 10400 + }, + { + "epoch": 1.7941861468127318, + "eval_loss": 1.4070560932159424, + "eval_runtime": 10.8247, + "eval_samples_per_second": 94.598, + "eval_steps_per_second": 23.65, + "step": 10400 + }, + { + "epoch": 1.7943586647114638, + "grad_norm": 0.59765625, + "learning_rate": 5.208641475092069e-07, + "loss": 1.3357, + "step": 10401 + }, + { + "epoch": 1.794531182610196, + "grad_norm": 0.58203125, + "learning_rate": 5.199975050831463e-07, + "loss": 1.5601, + "step": 10402 + }, + { + "epoch": 1.794703700508928, + "grad_norm": 0.80859375, + "learning_rate": 5.191315649949046e-07, + "loss": 1.363, + "step": 10403 + }, + { + "epoch": 1.7948762184076599, + "grad_norm": 0.58203125, + "learning_rate": 5.182663273086364e-07, + "loss": 1.38, + "step": 10404 + }, + { + "epoch": 1.7950487363063918, + "grad_norm": 0.57421875, + "learning_rate": 5.174017920884423e-07, + "loss": 1.3737, + "step": 10405 + }, + { + "epoch": 1.7952212542051238, + "grad_norm": 0.5703125, + "learning_rate": 5.165379593983755e-07, + "loss": 1.5047, + "step": 10406 + }, + { + "epoch": 1.7953937721038558, + "grad_norm": 0.73046875, + "learning_rate": 5.156748293024283e-07, + "loss": 1.4924, + "step": 10407 + }, + { + "epoch": 1.7955662900025877, + "grad_norm": 0.6015625, + "learning_rate": 5.148124018645539e-07, + "loss": 1.3867, + "step": 10408 + }, + { + "epoch": 1.7957388079013197, + "grad_norm": 0.55859375, + "learning_rate": 5.139506771486414e-07, + "loss": 1.3976, + "step": 10409 + }, + { + "epoch": 1.7959113258000516, + "grad_norm": 0.59375, + "learning_rate": 5.130896552185349e-07, + "loss": 1.4595, + "step": 10410 + }, + { + "epoch": 1.7960838436987836, + "grad_norm": 0.578125, + "learning_rate": 5.122293361380238e-07, + "loss": 1.4081, + "step": 10411 + }, + { + "epoch": 1.7962563615975158, + "grad_norm": 0.62890625, + "learning_rate": 5.113697199708456e-07, + "loss": 1.4327, + "step": 10412 + }, + { + "epoch": 1.7964288794962477, + "grad_norm": 0.578125, + "learning_rate": 5.105108067806863e-07, + "loss": 1.3137, + "step": 10413 + }, + { + "epoch": 1.7966013973949797, + "grad_norm": 0.578125, + "learning_rate": 5.096525966311794e-07, + "loss": 1.4744, + "step": 10414 + }, + { + "epoch": 1.7967739152937119, + "grad_norm": 0.6015625, + "learning_rate": 5.087950895859062e-07, + "loss": 1.5153, + "step": 10415 + }, + { + "epoch": 1.7969464331924438, + "grad_norm": 0.6484375, + "learning_rate": 5.079382857083981e-07, + "loss": 1.4555, + "step": 10416 + }, + { + "epoch": 1.7971189510911758, + "grad_norm": 0.5546875, + "learning_rate": 5.070821850621277e-07, + "loss": 1.3925, + "step": 10417 + }, + { + "epoch": 1.7972914689899078, + "grad_norm": 0.5625, + "learning_rate": 5.062267877105275e-07, + "loss": 1.4293, + "step": 10418 + }, + { + "epoch": 1.7974639868886397, + "grad_norm": 0.61328125, + "learning_rate": 5.053720937169648e-07, + "loss": 1.3991, + "step": 10419 + }, + { + "epoch": 1.7976365047873717, + "grad_norm": 0.56640625, + "learning_rate": 5.04518103144761e-07, + "loss": 1.4303, + "step": 10420 + }, + { + "epoch": 1.7978090226861037, + "grad_norm": 0.6484375, + "learning_rate": 5.036648160571889e-07, + "loss": 1.4, + "step": 10421 + }, + { + "epoch": 1.7979815405848356, + "grad_norm": 0.640625, + "learning_rate": 5.028122325174623e-07, + "loss": 1.4848, + "step": 10422 + }, + { + "epoch": 1.7981540584835676, + "grad_norm": 0.6171875, + "learning_rate": 5.019603525887462e-07, + "loss": 1.4521, + "step": 10423 + }, + { + "epoch": 1.7983265763822995, + "grad_norm": 0.55859375, + "learning_rate": 5.011091763341547e-07, + "loss": 1.4574, + "step": 10424 + }, + { + "epoch": 1.7984990942810315, + "grad_norm": 0.59375, + "learning_rate": 5.002587038167461e-07, + "loss": 1.3833, + "step": 10425 + }, + { + "epoch": 1.7986716121797637, + "grad_norm": 0.5859375, + "learning_rate": 4.994089350995301e-07, + "loss": 1.5584, + "step": 10426 + }, + { + "epoch": 1.7988441300784956, + "grad_norm": 0.6328125, + "learning_rate": 4.985598702454653e-07, + "loss": 1.3979, + "step": 10427 + }, + { + "epoch": 1.7990166479772276, + "grad_norm": 0.625, + "learning_rate": 4.977115093174489e-07, + "loss": 1.5396, + "step": 10428 + }, + { + "epoch": 1.7991891658759598, + "grad_norm": 0.6328125, + "learning_rate": 4.968638523783398e-07, + "loss": 1.5011, + "step": 10429 + }, + { + "epoch": 1.7993616837746917, + "grad_norm": 0.6015625, + "learning_rate": 4.960168994909343e-07, + "loss": 1.3286, + "step": 10430 + }, + { + "epoch": 1.7995342016734237, + "grad_norm": 0.65625, + "learning_rate": 4.951706507179788e-07, + "loss": 1.3573, + "step": 10431 + }, + { + "epoch": 1.7997067195721557, + "grad_norm": 0.640625, + "learning_rate": 4.943251061221721e-07, + "loss": 1.4431, + "step": 10432 + }, + { + "epoch": 1.7998792374708876, + "grad_norm": 0.58984375, + "learning_rate": 4.934802657661553e-07, + "loss": 1.3382, + "step": 10433 + }, + { + "epoch": 1.8000517553696196, + "grad_norm": 0.6640625, + "learning_rate": 4.926361297125193e-07, + "loss": 1.516, + "step": 10434 + }, + { + "epoch": 1.8002242732683515, + "grad_norm": 0.55859375, + "learning_rate": 4.917926980238041e-07, + "loss": 1.3845, + "step": 10435 + }, + { + "epoch": 1.8003967911670835, + "grad_norm": 0.58984375, + "learning_rate": 4.909499707624966e-07, + "loss": 1.3371, + "step": 10436 + }, + { + "epoch": 1.8005693090658155, + "grad_norm": 0.6171875, + "learning_rate": 4.901079479910287e-07, + "loss": 1.462, + "step": 10437 + }, + { + "epoch": 1.8007418269645474, + "grad_norm": 0.58984375, + "learning_rate": 4.892666297717874e-07, + "loss": 1.4135, + "step": 10438 + }, + { + "epoch": 1.8009143448632796, + "grad_norm": 0.578125, + "learning_rate": 4.884260161670972e-07, + "loss": 1.4871, + "step": 10439 + }, + { + "epoch": 1.8010868627620116, + "grad_norm": 0.6328125, + "learning_rate": 4.875861072392408e-07, + "loss": 1.3797, + "step": 10440 + }, + { + "epoch": 1.8012593806607435, + "grad_norm": 0.58203125, + "learning_rate": 4.86746903050439e-07, + "loss": 1.4785, + "step": 10441 + }, + { + "epoch": 1.8014318985594755, + "grad_norm": 0.64453125, + "learning_rate": 4.859084036628714e-07, + "loss": 1.5303, + "step": 10442 + }, + { + "epoch": 1.8016044164582077, + "grad_norm": 0.65234375, + "learning_rate": 4.850706091386548e-07, + "loss": 1.4003, + "step": 10443 + }, + { + "epoch": 1.8017769343569396, + "grad_norm": 1.3046875, + "learning_rate": 4.842335195398595e-07, + "loss": 1.3405, + "step": 10444 + }, + { + "epoch": 1.8019494522556716, + "grad_norm": 0.63671875, + "learning_rate": 4.833971349285027e-07, + "loss": 1.3997, + "step": 10445 + }, + { + "epoch": 1.8021219701544036, + "grad_norm": 0.59765625, + "learning_rate": 4.825614553665481e-07, + "loss": 1.4876, + "step": 10446 + }, + { + "epoch": 1.8022944880531355, + "grad_norm": 0.56640625, + "learning_rate": 4.817264809159084e-07, + "loss": 1.41, + "step": 10447 + }, + { + "epoch": 1.8024670059518675, + "grad_norm": 0.58203125, + "learning_rate": 4.808922116384451e-07, + "loss": 1.4368, + "step": 10448 + }, + { + "epoch": 1.8026395238505994, + "grad_norm": 0.60546875, + "learning_rate": 4.800586475959623e-07, + "loss": 1.4177, + "step": 10449 + }, + { + "epoch": 1.8028120417493314, + "grad_norm": 0.578125, + "learning_rate": 4.792257888502217e-07, + "loss": 1.4365, + "step": 10450 + }, + { + "epoch": 1.8029845596480634, + "grad_norm": 0.62109375, + "learning_rate": 4.783936354629215e-07, + "loss": 1.3633, + "step": 10451 + }, + { + "epoch": 1.8031570775467953, + "grad_norm": 0.5859375, + "learning_rate": 4.775621874957126e-07, + "loss": 1.4407, + "step": 10452 + }, + { + "epoch": 1.8033295954455275, + "grad_norm": 0.56640625, + "learning_rate": 4.7673144501019897e-07, + "loss": 1.3938, + "step": 10453 + }, + { + "epoch": 1.8035021133442595, + "grad_norm": 0.58203125, + "learning_rate": 4.759014080679225e-07, + "loss": 1.4431, + "step": 10454 + }, + { + "epoch": 1.8036746312429914, + "grad_norm": 0.5625, + "learning_rate": 4.7507207673037956e-07, + "loss": 1.4637, + "step": 10455 + }, + { + "epoch": 1.8038471491417236, + "grad_norm": 0.6484375, + "learning_rate": 4.7424345105901105e-07, + "loss": 1.459, + "step": 10456 + }, + { + "epoch": 1.8040196670404556, + "grad_norm": 0.625, + "learning_rate": 4.7341553111520776e-07, + "loss": 1.4838, + "step": 10457 + }, + { + "epoch": 1.8041921849391875, + "grad_norm": 0.546875, + "learning_rate": 4.7258831696030624e-07, + "loss": 1.3738, + "step": 10458 + }, + { + "epoch": 1.8043647028379195, + "grad_norm": 0.578125, + "learning_rate": 4.7176180865559416e-07, + "loss": 1.393, + "step": 10459 + }, + { + "epoch": 1.8045372207366515, + "grad_norm": 0.55078125, + "learning_rate": 4.709360062622992e-07, + "loss": 1.4474, + "step": 10460 + }, + { + "epoch": 1.8047097386353834, + "grad_norm": 0.59375, + "learning_rate": 4.701109098416079e-07, + "loss": 1.5006, + "step": 10461 + }, + { + "epoch": 1.8048822565341154, + "grad_norm": 0.6171875, + "learning_rate": 4.6928651945464585e-07, + "loss": 1.4166, + "step": 10462 + }, + { + "epoch": 1.8050547744328473, + "grad_norm": 0.58203125, + "learning_rate": 4.6846283516248756e-07, + "loss": 1.5211, + "step": 10463 + }, + { + "epoch": 1.8052272923315793, + "grad_norm": 0.55078125, + "learning_rate": 4.6763985702615864e-07, + "loss": 1.4285, + "step": 10464 + }, + { + "epoch": 1.8053998102303113, + "grad_norm": 0.546875, + "learning_rate": 4.6681758510663035e-07, + "loss": 1.4389, + "step": 10465 + }, + { + "epoch": 1.8055723281290432, + "grad_norm": 0.6015625, + "learning_rate": 4.6599601946482164e-07, + "loss": 1.4591, + "step": 10466 + }, + { + "epoch": 1.8057448460277754, + "grad_norm": 0.6015625, + "learning_rate": 4.651751601615984e-07, + "loss": 1.4747, + "step": 10467 + }, + { + "epoch": 1.8059173639265074, + "grad_norm": 0.6484375, + "learning_rate": 4.6435500725777627e-07, + "loss": 1.3947, + "step": 10468 + }, + { + "epoch": 1.8060898818252393, + "grad_norm": 0.578125, + "learning_rate": 4.635355608141168e-07, + "loss": 1.4092, + "step": 10469 + }, + { + "epoch": 1.8062623997239715, + "grad_norm": 0.61328125, + "learning_rate": 4.6271682089132796e-07, + "loss": 1.3991, + "step": 10470 + }, + { + "epoch": 1.8064349176227035, + "grad_norm": 0.59765625, + "learning_rate": 4.618987875500702e-07, + "loss": 1.4512, + "step": 10471 + }, + { + "epoch": 1.8066074355214354, + "grad_norm": 0.578125, + "learning_rate": 4.610814608509484e-07, + "loss": 1.3993, + "step": 10472 + }, + { + "epoch": 1.8067799534201674, + "grad_norm": 0.55078125, + "learning_rate": 4.602648408545107e-07, + "loss": 1.4226, + "step": 10473 + }, + { + "epoch": 1.8069524713188994, + "grad_norm": 0.63671875, + "learning_rate": 4.594489276212633e-07, + "loss": 1.3826, + "step": 10474 + }, + { + "epoch": 1.8071249892176313, + "grad_norm": 0.60546875, + "learning_rate": 4.5863372121165095e-07, + "loss": 1.4057, + "step": 10475 + }, + { + "epoch": 1.8072975071163633, + "grad_norm": 0.57421875, + "learning_rate": 4.5781922168606883e-07, + "loss": 1.5323, + "step": 10476 + }, + { + "epoch": 1.8074700250150952, + "grad_norm": 0.61328125, + "learning_rate": 4.570054291048609e-07, + "loss": 1.3534, + "step": 10477 + }, + { + "epoch": 1.8076425429138272, + "grad_norm": 0.57421875, + "learning_rate": 4.561923435283189e-07, + "loss": 1.4982, + "step": 10478 + }, + { + "epoch": 1.8078150608125592, + "grad_norm": 0.5546875, + "learning_rate": 4.553799650166801e-07, + "loss": 1.376, + "step": 10479 + }, + { + "epoch": 1.8079875787112913, + "grad_norm": 0.6015625, + "learning_rate": 4.545682936301321e-07, + "loss": 1.4337, + "step": 10480 + }, + { + "epoch": 1.8081600966100233, + "grad_norm": 0.671875, + "learning_rate": 4.5375732942880557e-07, + "loss": 1.4383, + "step": 10481 + }, + { + "epoch": 1.8083326145087553, + "grad_norm": 0.5859375, + "learning_rate": 4.529470724727858e-07, + "loss": 1.4463, + "step": 10482 + }, + { + "epoch": 1.8085051324074872, + "grad_norm": 0.59765625, + "learning_rate": 4.5213752282209924e-07, + "loss": 1.4528, + "step": 10483 + }, + { + "epoch": 1.8086776503062194, + "grad_norm": 0.66796875, + "learning_rate": 4.513286805367212e-07, + "loss": 1.4562, + "step": 10484 + }, + { + "epoch": 1.8088501682049514, + "grad_norm": 0.56640625, + "learning_rate": 4.505205456765793e-07, + "loss": 1.384, + "step": 10485 + }, + { + "epoch": 1.8090226861036833, + "grad_norm": 0.625, + "learning_rate": 4.497131183015424e-07, + "loss": 1.3599, + "step": 10486 + }, + { + "epoch": 1.8091952040024153, + "grad_norm": 0.57421875, + "learning_rate": 4.4890639847143035e-07, + "loss": 1.4611, + "step": 10487 + }, + { + "epoch": 1.8093677219011473, + "grad_norm": 0.58203125, + "learning_rate": 4.4810038624601095e-07, + "loss": 1.4362, + "step": 10488 + }, + { + "epoch": 1.8095402397998792, + "grad_norm": 0.60546875, + "learning_rate": 4.472950816849975e-07, + "loss": 1.4141, + "step": 10489 + }, + { + "epoch": 1.8097127576986112, + "grad_norm": 0.5859375, + "learning_rate": 4.464904848480522e-07, + "loss": 1.4283, + "step": 10490 + }, + { + "epoch": 1.8098852755973431, + "grad_norm": 0.57421875, + "learning_rate": 4.4568659579478647e-07, + "loss": 1.4313, + "step": 10491 + }, + { + "epoch": 1.810057793496075, + "grad_norm": 0.58203125, + "learning_rate": 4.4488341458475247e-07, + "loss": 1.4505, + "step": 10492 + }, + { + "epoch": 1.810230311394807, + "grad_norm": 0.60546875, + "learning_rate": 4.440809412774616e-07, + "loss": 1.3547, + "step": 10493 + }, + { + "epoch": 1.8104028292935392, + "grad_norm": 0.609375, + "learning_rate": 4.432791759323618e-07, + "loss": 1.5555, + "step": 10494 + }, + { + "epoch": 1.8105753471922712, + "grad_norm": 0.63671875, + "learning_rate": 4.4247811860885335e-07, + "loss": 1.394, + "step": 10495 + }, + { + "epoch": 1.8107478650910032, + "grad_norm": 0.69140625, + "learning_rate": 4.416777693662844e-07, + "loss": 1.3825, + "step": 10496 + }, + { + "epoch": 1.8109203829897353, + "grad_norm": 0.625, + "learning_rate": 4.408781282639485e-07, + "loss": 1.586, + "step": 10497 + }, + { + "epoch": 1.8110929008884673, + "grad_norm": 0.7265625, + "learning_rate": 4.4007919536109057e-07, + "loss": 1.392, + "step": 10498 + }, + { + "epoch": 1.8112654187871993, + "grad_norm": 0.58203125, + "learning_rate": 4.392809707168977e-07, + "loss": 1.4427, + "step": 10499 + }, + { + "epoch": 1.8114379366859312, + "grad_norm": 0.5625, + "learning_rate": 4.384834543905092e-07, + "loss": 1.4485, + "step": 10500 + }, + { + "epoch": 1.8114379366859312, + "eval_loss": 1.4070926904678345, + "eval_runtime": 11.9829, + "eval_samples_per_second": 85.455, + "eval_steps_per_second": 21.364, + "step": 10500 + }, + { + "epoch": 1.8116104545846632, + "grad_norm": 0.58984375, + "learning_rate": 4.37686646441009e-07, + "loss": 1.3746, + "step": 10501 + }, + { + "epoch": 1.8117829724833951, + "grad_norm": 0.63671875, + "learning_rate": 4.3689054692743094e-07, + "loss": 1.4008, + "step": 10502 + }, + { + "epoch": 1.811955490382127, + "grad_norm": 0.625, + "learning_rate": 4.360951559087534e-07, + "loss": 1.4617, + "step": 10503 + }, + { + "epoch": 1.812128008280859, + "grad_norm": 0.5703125, + "learning_rate": 4.353004734439059e-07, + "loss": 1.3902, + "step": 10504 + }, + { + "epoch": 1.812300526179591, + "grad_norm": 0.625, + "learning_rate": 4.345064995917603e-07, + "loss": 1.3388, + "step": 10505 + }, + { + "epoch": 1.812473044078323, + "grad_norm": 0.59375, + "learning_rate": 4.337132344111439e-07, + "loss": 1.4352, + "step": 10506 + }, + { + "epoch": 1.812645561977055, + "grad_norm": 0.59765625, + "learning_rate": 4.3292067796082304e-07, + "loss": 1.4476, + "step": 10507 + }, + { + "epoch": 1.8128180798757871, + "grad_norm": 0.5625, + "learning_rate": 4.3212883029951523e-07, + "loss": 1.3818, + "step": 10508 + }, + { + "epoch": 1.812990597774519, + "grad_norm": 0.59765625, + "learning_rate": 4.313376914858869e-07, + "loss": 1.4204, + "step": 10509 + }, + { + "epoch": 1.813163115673251, + "grad_norm": 0.61328125, + "learning_rate": 4.3054726157855e-07, + "loss": 1.2998, + "step": 10510 + }, + { + "epoch": 1.8133356335719832, + "grad_norm": 0.59765625, + "learning_rate": 4.2975754063606544e-07, + "loss": 1.376, + "step": 10511 + }, + { + "epoch": 1.8135081514707152, + "grad_norm": 0.5859375, + "learning_rate": 4.289685287169398e-07, + "loss": 1.3711, + "step": 10512 + }, + { + "epoch": 1.8136806693694472, + "grad_norm": 0.57421875, + "learning_rate": 4.281802258796275e-07, + "loss": 1.429, + "step": 10513 + }, + { + "epoch": 1.8138531872681791, + "grad_norm": 1.8125, + "learning_rate": 4.273926321825328e-07, + "loss": 1.4026, + "step": 10514 + }, + { + "epoch": 1.814025705166911, + "grad_norm": 0.69921875, + "learning_rate": 4.266057476840024e-07, + "loss": 1.5232, + "step": 10515 + }, + { + "epoch": 1.814198223065643, + "grad_norm": 0.6015625, + "learning_rate": 4.258195724423353e-07, + "loss": 1.4122, + "step": 10516 + }, + { + "epoch": 1.814370740964375, + "grad_norm": 0.54296875, + "learning_rate": 4.250341065157793e-07, + "loss": 1.4027, + "step": 10517 + }, + { + "epoch": 1.814543258863107, + "grad_norm": 0.5546875, + "learning_rate": 4.242493499625222e-07, + "loss": 1.3615, + "step": 10518 + }, + { + "epoch": 1.814715776761839, + "grad_norm": 0.63671875, + "learning_rate": 4.234653028407054e-07, + "loss": 1.4586, + "step": 10519 + }, + { + "epoch": 1.8148882946605709, + "grad_norm": 1.109375, + "learning_rate": 4.2268196520841574e-07, + "loss": 1.4106, + "step": 10520 + }, + { + "epoch": 1.815060812559303, + "grad_norm": 0.56640625, + "learning_rate": 4.218993371236879e-07, + "loss": 1.4377, + "step": 10521 + }, + { + "epoch": 1.815233330458035, + "grad_norm": 0.63671875, + "learning_rate": 4.211174186445033e-07, + "loss": 1.4085, + "step": 10522 + }, + { + "epoch": 1.815405848356767, + "grad_norm": 0.58984375, + "learning_rate": 4.203362098287944e-07, + "loss": 1.3718, + "step": 10523 + }, + { + "epoch": 1.815578366255499, + "grad_norm": 0.61328125, + "learning_rate": 4.195557107344328e-07, + "loss": 1.4049, + "step": 10524 + }, + { + "epoch": 1.8157508841542311, + "grad_norm": 0.72265625, + "learning_rate": 4.187759214192477e-07, + "loss": 1.5663, + "step": 10525 + }, + { + "epoch": 1.815923402052963, + "grad_norm": 0.9453125, + "learning_rate": 4.179968419410063e-07, + "loss": 1.41, + "step": 10526 + }, + { + "epoch": 1.816095919951695, + "grad_norm": 0.55078125, + "learning_rate": 4.172184723574324e-07, + "loss": 1.3764, + "step": 10527 + }, + { + "epoch": 1.816268437850427, + "grad_norm": 0.56640625, + "learning_rate": 4.1644081272618874e-07, + "loss": 1.406, + "step": 10528 + }, + { + "epoch": 1.816440955749159, + "grad_norm": 0.59375, + "learning_rate": 4.1566386310489035e-07, + "loss": 1.4075, + "step": 10529 + }, + { + "epoch": 1.816613473647891, + "grad_norm": 0.60546875, + "learning_rate": 4.14887623551099e-07, + "loss": 1.3793, + "step": 10530 + }, + { + "epoch": 1.816785991546623, + "grad_norm": 0.61328125, + "learning_rate": 4.141120941223231e-07, + "loss": 1.441, + "step": 10531 + }, + { + "epoch": 1.8169585094453549, + "grad_norm": 0.578125, + "learning_rate": 4.133372748760178e-07, + "loss": 1.4461, + "step": 10532 + }, + { + "epoch": 1.8171310273440868, + "grad_norm": 0.796875, + "learning_rate": 4.1256316586958835e-07, + "loss": 1.3624, + "step": 10533 + }, + { + "epoch": 1.8173035452428188, + "grad_norm": 0.59375, + "learning_rate": 4.117897671603843e-07, + "loss": 1.3699, + "step": 10534 + }, + { + "epoch": 1.817476063141551, + "grad_norm": 0.60546875, + "learning_rate": 4.110170788057044e-07, + "loss": 1.3984, + "step": 10535 + }, + { + "epoch": 1.817648581040283, + "grad_norm": 0.60546875, + "learning_rate": 4.10245100862795e-07, + "loss": 1.4331, + "step": 10536 + }, + { + "epoch": 1.817821098939015, + "grad_norm": 0.58984375, + "learning_rate": 4.0947383338884594e-07, + "loss": 1.4868, + "step": 10537 + }, + { + "epoch": 1.817993616837747, + "grad_norm": 0.62890625, + "learning_rate": 4.087032764410015e-07, + "loss": 1.423, + "step": 10538 + }, + { + "epoch": 1.818166134736479, + "grad_norm": 0.82421875, + "learning_rate": 4.0793343007634713e-07, + "loss": 1.4135, + "step": 10539 + }, + { + "epoch": 1.818338652635211, + "grad_norm": 0.60546875, + "learning_rate": 4.071642943519183e-07, + "loss": 1.404, + "step": 10540 + }, + { + "epoch": 1.818511170533943, + "grad_norm": 0.62109375, + "learning_rate": 4.063958693246961e-07, + "loss": 1.4487, + "step": 10541 + }, + { + "epoch": 1.818683688432675, + "grad_norm": 0.58984375, + "learning_rate": 4.056281550516128e-07, + "loss": 1.4732, + "step": 10542 + }, + { + "epoch": 1.8188562063314069, + "grad_norm": 0.62890625, + "learning_rate": 4.0486115158954396e-07, + "loss": 1.4363, + "step": 10543 + }, + { + "epoch": 1.8190287242301388, + "grad_norm": 0.60546875, + "learning_rate": 4.040948589953153e-07, + "loss": 1.4726, + "step": 10544 + }, + { + "epoch": 1.8192012421288708, + "grad_norm": 0.5546875, + "learning_rate": 4.0332927732569473e-07, + "loss": 1.4246, + "step": 10545 + }, + { + "epoch": 1.8193737600276028, + "grad_norm": 0.57421875, + "learning_rate": 4.025644066374068e-07, + "loss": 1.2917, + "step": 10546 + }, + { + "epoch": 1.8195462779263347, + "grad_norm": 0.55078125, + "learning_rate": 4.0180024698711404e-07, + "loss": 1.3544, + "step": 10547 + }, + { + "epoch": 1.819718795825067, + "grad_norm": 0.59765625, + "learning_rate": 4.0103679843142895e-07, + "loss": 1.4888, + "step": 10548 + }, + { + "epoch": 1.8198913137237989, + "grad_norm": 0.58984375, + "learning_rate": 4.002740610269185e-07, + "loss": 1.3407, + "step": 10549 + }, + { + "epoch": 1.8200638316225308, + "grad_norm": 0.60546875, + "learning_rate": 3.995120348300863e-07, + "loss": 1.3905, + "step": 10550 + }, + { + "epoch": 1.8202363495212628, + "grad_norm": 0.65625, + "learning_rate": 3.9875071989738943e-07, + "loss": 1.432, + "step": 10551 + }, + { + "epoch": 1.820408867419995, + "grad_norm": 0.56640625, + "learning_rate": 3.9799011628522953e-07, + "loss": 1.4171, + "step": 10552 + }, + { + "epoch": 1.820581385318727, + "grad_norm": 0.6484375, + "learning_rate": 3.9723022404995926e-07, + "loss": 1.5395, + "step": 10553 + }, + { + "epoch": 1.820753903217459, + "grad_norm": 0.5546875, + "learning_rate": 3.964710432478736e-07, + "loss": 1.436, + "step": 10554 + }, + { + "epoch": 1.8209264211161909, + "grad_norm": 0.6171875, + "learning_rate": 3.957125739352208e-07, + "loss": 1.5376, + "step": 10555 + }, + { + "epoch": 1.8210989390149228, + "grad_norm": 0.60546875, + "learning_rate": 3.949548161681882e-07, + "loss": 1.4383, + "step": 10556 + }, + { + "epoch": 1.8212714569136548, + "grad_norm": 0.57421875, + "learning_rate": 3.941977700029198e-07, + "loss": 1.5523, + "step": 10557 + }, + { + "epoch": 1.8214439748123867, + "grad_norm": 0.58984375, + "learning_rate": 3.934414354954985e-07, + "loss": 1.4397, + "step": 10558 + }, + { + "epoch": 1.8216164927111187, + "grad_norm": 0.62890625, + "learning_rate": 3.9268581270196284e-07, + "loss": 1.4584, + "step": 10559 + }, + { + "epoch": 1.8217890106098507, + "grad_norm": 0.609375, + "learning_rate": 3.919309016782902e-07, + "loss": 1.3746, + "step": 10560 + }, + { + "epoch": 1.8219615285085826, + "grad_norm": 0.625, + "learning_rate": 3.911767024804092e-07, + "loss": 1.3883, + "step": 10561 + }, + { + "epoch": 1.8221340464073148, + "grad_norm": 0.5390625, + "learning_rate": 3.9042321516419844e-07, + "loss": 1.4587, + "step": 10562 + }, + { + "epoch": 1.8223065643060468, + "grad_norm": 0.54296875, + "learning_rate": 3.896704397854778e-07, + "loss": 1.2624, + "step": 10563 + }, + { + "epoch": 1.8224790822047787, + "grad_norm": 0.60546875, + "learning_rate": 3.889183764000204e-07, + "loss": 1.3084, + "step": 10564 + }, + { + "epoch": 1.822651600103511, + "grad_norm": 0.56640625, + "learning_rate": 3.8816702506354163e-07, + "loss": 1.3891, + "step": 10565 + }, + { + "epoch": 1.8228241180022429, + "grad_norm": 0.62890625, + "learning_rate": 3.8741638583170814e-07, + "loss": 1.4988, + "step": 10566 + }, + { + "epoch": 1.8229966359009748, + "grad_norm": 0.55859375, + "learning_rate": 3.866664587601299e-07, + "loss": 1.4436, + "step": 10567 + }, + { + "epoch": 1.8231691537997068, + "grad_norm": 0.58203125, + "learning_rate": 3.8591724390436904e-07, + "loss": 1.4547, + "step": 10568 + }, + { + "epoch": 1.8233416716984387, + "grad_norm": 0.55859375, + "learning_rate": 3.851687413199279e-07, + "loss": 1.3426, + "step": 10569 + }, + { + "epoch": 1.8235141895971707, + "grad_norm": 0.5625, + "learning_rate": 3.844209510622643e-07, + "loss": 1.4252, + "step": 10570 + }, + { + "epoch": 1.8236867074959027, + "grad_norm": 0.59765625, + "learning_rate": 3.836738731867773e-07, + "loss": 1.4587, + "step": 10571 + }, + { + "epoch": 1.8238592253946346, + "grad_norm": 0.609375, + "learning_rate": 3.8292750774881483e-07, + "loss": 1.5709, + "step": 10572 + }, + { + "epoch": 1.8240317432933666, + "grad_norm": 0.58203125, + "learning_rate": 3.8218185480367264e-07, + "loss": 1.3629, + "step": 10573 + }, + { + "epoch": 1.8242042611920986, + "grad_norm": 0.53515625, + "learning_rate": 3.814369144065944e-07, + "loss": 1.3875, + "step": 10574 + }, + { + "epoch": 1.8243767790908305, + "grad_norm": 0.59375, + "learning_rate": 3.8069268661276916e-07, + "loss": 1.452, + "step": 10575 + }, + { + "epoch": 1.8245492969895627, + "grad_norm": 0.56640625, + "learning_rate": 3.79949171477334e-07, + "loss": 1.4527, + "step": 10576 + }, + { + "epoch": 1.8247218148882947, + "grad_norm": 0.56640625, + "learning_rate": 3.792063690553716e-07, + "loss": 1.4349, + "step": 10577 + }, + { + "epoch": 1.8248943327870266, + "grad_norm": 0.65625, + "learning_rate": 3.7846427940191663e-07, + "loss": 1.4183, + "step": 10578 + }, + { + "epoch": 1.8250668506857588, + "grad_norm": 0.56640625, + "learning_rate": 3.777229025719453e-07, + "loss": 1.467, + "step": 10579 + }, + { + "epoch": 1.8252393685844908, + "grad_norm": 0.6328125, + "learning_rate": 3.7698223862038254e-07, + "loss": 1.3971, + "step": 10580 + }, + { + "epoch": 1.8254118864832227, + "grad_norm": 0.55859375, + "learning_rate": 3.7624228760210545e-07, + "loss": 1.3448, + "step": 10581 + }, + { + "epoch": 1.8255844043819547, + "grad_norm": 0.5625, + "learning_rate": 3.755030495719303e-07, + "loss": 1.3441, + "step": 10582 + }, + { + "epoch": 1.8257569222806866, + "grad_norm": 0.69921875, + "learning_rate": 3.7476452458462654e-07, + "loss": 1.328, + "step": 10583 + }, + { + "epoch": 1.8259294401794186, + "grad_norm": 0.5546875, + "learning_rate": 3.740267126949071e-07, + "loss": 1.3725, + "step": 10584 + }, + { + "epoch": 1.8261019580781506, + "grad_norm": 0.5625, + "learning_rate": 3.732896139574349e-07, + "loss": 1.4206, + "step": 10585 + }, + { + "epoch": 1.8262744759768825, + "grad_norm": 0.58984375, + "learning_rate": 3.7255322842681963e-07, + "loss": 1.5134, + "step": 10586 + }, + { + "epoch": 1.8264469938756145, + "grad_norm": 0.609375, + "learning_rate": 3.718175561576154e-07, + "loss": 1.4835, + "step": 10587 + }, + { + "epoch": 1.8266195117743464, + "grad_norm": 0.59765625, + "learning_rate": 3.710825972043264e-07, + "loss": 1.4358, + "step": 10588 + }, + { + "epoch": 1.8267920296730786, + "grad_norm": 0.59375, + "learning_rate": 3.7034835162140347e-07, + "loss": 1.5118, + "step": 10589 + }, + { + "epoch": 1.8269645475718106, + "grad_norm": 0.578125, + "learning_rate": 3.6961481946324205e-07, + "loss": 1.5324, + "step": 10590 + }, + { + "epoch": 1.8271370654705426, + "grad_norm": 0.5703125, + "learning_rate": 3.688820007841898e-07, + "loss": 1.4644, + "step": 10591 + }, + { + "epoch": 1.8273095833692745, + "grad_norm": 0.59765625, + "learning_rate": 3.6814989563853654e-07, + "loss": 1.4278, + "step": 10592 + }, + { + "epoch": 1.8274821012680067, + "grad_norm": 0.5703125, + "learning_rate": 3.6741850408052114e-07, + "loss": 1.4077, + "step": 10593 + }, + { + "epoch": 1.8276546191667387, + "grad_norm": 0.65625, + "learning_rate": 3.666878261643303e-07, + "loss": 1.4508, + "step": 10594 + }, + { + "epoch": 1.8278271370654706, + "grad_norm": 0.625, + "learning_rate": 3.659578619440962e-07, + "loss": 1.4561, + "step": 10595 + }, + { + "epoch": 1.8279996549642026, + "grad_norm": 0.65234375, + "learning_rate": 3.652286114739012e-07, + "loss": 1.376, + "step": 10596 + }, + { + "epoch": 1.8281721728629345, + "grad_norm": 0.56640625, + "learning_rate": 3.645000748077709e-07, + "loss": 1.4252, + "step": 10597 + }, + { + "epoch": 1.8283446907616665, + "grad_norm": 0.60546875, + "learning_rate": 3.6377225199968003e-07, + "loss": 1.4235, + "step": 10598 + }, + { + "epoch": 1.8285172086603985, + "grad_norm": 0.57421875, + "learning_rate": 3.6304514310355086e-07, + "loss": 1.4719, + "step": 10599 + }, + { + "epoch": 1.8286897265591304, + "grad_norm": 0.5390625, + "learning_rate": 3.6231874817325375e-07, + "loss": 1.4056, + "step": 10600 + }, + { + "epoch": 1.8286897265591304, + "eval_loss": 1.4070792198181152, + "eval_runtime": 10.8107, + "eval_samples_per_second": 94.721, + "eval_steps_per_second": 23.68, + "step": 10600 + }, + { + "epoch": 1.8288622444578624, + "grad_norm": 0.65625, + "learning_rate": 3.615930672626e-07, + "loss": 1.4744, + "step": 10601 + }, + { + "epoch": 1.8290347623565943, + "grad_norm": 0.55859375, + "learning_rate": 3.608681004253578e-07, + "loss": 1.3409, + "step": 10602 + }, + { + "epoch": 1.8292072802553265, + "grad_norm": 0.5859375, + "learning_rate": 3.60143847715233e-07, + "loss": 1.4295, + "step": 10603 + }, + { + "epoch": 1.8293797981540585, + "grad_norm": 0.57421875, + "learning_rate": 3.594203091858861e-07, + "loss": 1.4565, + "step": 10604 + }, + { + "epoch": 1.8295523160527905, + "grad_norm": 0.5546875, + "learning_rate": 3.586974848909186e-07, + "loss": 1.3717, + "step": 10605 + }, + { + "epoch": 1.8297248339515226, + "grad_norm": 0.60546875, + "learning_rate": 3.5797537488388326e-07, + "loss": 1.5116, + "step": 10606 + }, + { + "epoch": 1.8298973518502546, + "grad_norm": 0.59375, + "learning_rate": 3.572539792182783e-07, + "loss": 1.4081, + "step": 10607 + }, + { + "epoch": 1.8300698697489866, + "grad_norm": 0.55859375, + "learning_rate": 3.5653329794755e-07, + "loss": 1.4567, + "step": 10608 + }, + { + "epoch": 1.8302423876477185, + "grad_norm": 0.57421875, + "learning_rate": 3.558133311250889e-07, + "loss": 1.3465, + "step": 10609 + }, + { + "epoch": 1.8304149055464505, + "grad_norm": 0.609375, + "learning_rate": 3.5509407880423783e-07, + "loss": 1.4702, + "step": 10610 + }, + { + "epoch": 1.8305874234451824, + "grad_norm": 0.58984375, + "learning_rate": 3.5437554103827985e-07, + "loss": 1.4834, + "step": 10611 + }, + { + "epoch": 1.8307599413439144, + "grad_norm": 0.609375, + "learning_rate": 3.53657717880449e-07, + "loss": 1.4094, + "step": 10612 + }, + { + "epoch": 1.8309324592426464, + "grad_norm": 0.5859375, + "learning_rate": 3.5294060938393046e-07, + "loss": 1.4454, + "step": 10613 + }, + { + "epoch": 1.8311049771413783, + "grad_norm": 0.56640625, + "learning_rate": 3.522242156018474e-07, + "loss": 1.48, + "step": 10614 + }, + { + "epoch": 1.8312774950401103, + "grad_norm": 0.73046875, + "learning_rate": 3.5150853658727837e-07, + "loss": 1.3688, + "step": 10615 + }, + { + "epoch": 1.8314500129388422, + "grad_norm": 0.67578125, + "learning_rate": 3.5079357239324205e-07, + "loss": 1.3982, + "step": 10616 + }, + { + "epoch": 1.8316225308375744, + "grad_norm": 0.58984375, + "learning_rate": 3.500793230727095e-07, + "loss": 1.4082, + "step": 10617 + }, + { + "epoch": 1.8317950487363064, + "grad_norm": 0.58203125, + "learning_rate": 3.493657886785962e-07, + "loss": 1.3345, + "step": 10618 + }, + { + "epoch": 1.8319675666350383, + "grad_norm": 0.54296875, + "learning_rate": 3.486529692637641e-07, + "loss": 1.3584, + "step": 10619 + }, + { + "epoch": 1.8321400845337705, + "grad_norm": 0.73828125, + "learning_rate": 3.4794086488102564e-07, + "loss": 1.3406, + "step": 10620 + }, + { + "epoch": 1.8323126024325025, + "grad_norm": 0.57421875, + "learning_rate": 3.4722947558313737e-07, + "loss": 1.4438, + "step": 10621 + }, + { + "epoch": 1.8324851203312345, + "grad_norm": 0.59375, + "learning_rate": 3.4651880142280047e-07, + "loss": 1.3952, + "step": 10622 + }, + { + "epoch": 1.8326576382299664, + "grad_norm": 0.5859375, + "learning_rate": 3.4580884245267064e-07, + "loss": 1.3912, + "step": 10623 + }, + { + "epoch": 1.8328301561286984, + "grad_norm": 0.5625, + "learning_rate": 3.450995987253436e-07, + "loss": 1.4326, + "step": 10624 + }, + { + "epoch": 1.8330026740274303, + "grad_norm": 0.61328125, + "learning_rate": 3.443910702933639e-07, + "loss": 1.4591, + "step": 10625 + }, + { + "epoch": 1.8331751919261623, + "grad_norm": 0.67578125, + "learning_rate": 3.436832572092264e-07, + "loss": 1.357, + "step": 10626 + }, + { + "epoch": 1.8333477098248943, + "grad_norm": 0.57421875, + "learning_rate": 3.429761595253667e-07, + "loss": 1.3664, + "step": 10627 + }, + { + "epoch": 1.8335202277236262, + "grad_norm": 0.5546875, + "learning_rate": 3.422697772941741e-07, + "loss": 1.433, + "step": 10628 + }, + { + "epoch": 1.8336927456223582, + "grad_norm": 0.6875, + "learning_rate": 3.415641105679801e-07, + "loss": 1.4772, + "step": 10629 + }, + { + "epoch": 1.8338652635210904, + "grad_norm": 0.6328125, + "learning_rate": 3.408591593990662e-07, + "loss": 1.5794, + "step": 10630 + }, + { + "epoch": 1.8340377814198223, + "grad_norm": 0.5625, + "learning_rate": 3.401549238396584e-07, + "loss": 1.3741, + "step": 10631 + }, + { + "epoch": 1.8342102993185543, + "grad_norm": 0.546875, + "learning_rate": 3.3945140394193276e-07, + "loss": 1.4539, + "step": 10632 + }, + { + "epoch": 1.8343828172172862, + "grad_norm": 0.61328125, + "learning_rate": 3.3874859975800644e-07, + "loss": 1.3585, + "step": 10633 + }, + { + "epoch": 1.8345553351160184, + "grad_norm": 0.57421875, + "learning_rate": 3.3804651133995226e-07, + "loss": 1.4456, + "step": 10634 + }, + { + "epoch": 1.8347278530147504, + "grad_norm": 0.671875, + "learning_rate": 3.3734513873978193e-07, + "loss": 1.4126, + "step": 10635 + }, + { + "epoch": 1.8349003709134823, + "grad_norm": 0.58984375, + "learning_rate": 3.3664448200945943e-07, + "loss": 1.3672, + "step": 10636 + }, + { + "epoch": 1.8350728888122143, + "grad_norm": 0.62109375, + "learning_rate": 3.3594454120089216e-07, + "loss": 1.4352, + "step": 10637 + }, + { + "epoch": 1.8352454067109463, + "grad_norm": 0.8046875, + "learning_rate": 3.352453163659386e-07, + "loss": 1.3739, + "step": 10638 + }, + { + "epoch": 1.8354179246096782, + "grad_norm": 0.58984375, + "learning_rate": 3.3454680755639847e-07, + "loss": 1.5121, + "step": 10639 + }, + { + "epoch": 1.8355904425084102, + "grad_norm": 0.62890625, + "learning_rate": 3.3384901482402585e-07, + "loss": 1.3696, + "step": 10640 + }, + { + "epoch": 1.8357629604071422, + "grad_norm": 0.578125, + "learning_rate": 3.3315193822051283e-07, + "loss": 1.3752, + "step": 10641 + }, + { + "epoch": 1.8359354783058741, + "grad_norm": 0.59375, + "learning_rate": 3.3245557779750693e-07, + "loss": 1.4764, + "step": 10642 + }, + { + "epoch": 1.836107996204606, + "grad_norm": 0.59765625, + "learning_rate": 3.3175993360659684e-07, + "loss": 1.4615, + "step": 10643 + }, + { + "epoch": 1.8362805141033383, + "grad_norm": 0.5703125, + "learning_rate": 3.310650056993192e-07, + "loss": 1.4907, + "step": 10644 + }, + { + "epoch": 1.8364530320020702, + "grad_norm": 0.5859375, + "learning_rate": 3.3037079412716276e-07, + "loss": 1.4502, + "step": 10645 + }, + { + "epoch": 1.8366255499008022, + "grad_norm": 0.6171875, + "learning_rate": 3.296772989415542e-07, + "loss": 1.461, + "step": 10646 + }, + { + "epoch": 1.8367980677995344, + "grad_norm": 0.61328125, + "learning_rate": 3.2898452019387685e-07, + "loss": 1.4372, + "step": 10647 + }, + { + "epoch": 1.8369705856982663, + "grad_norm": 0.63671875, + "learning_rate": 3.2829245793545186e-07, + "loss": 1.5597, + "step": 10648 + }, + { + "epoch": 1.8371431035969983, + "grad_norm": 6.84375, + "learning_rate": 3.2760111221755375e-07, + "loss": 1.4422, + "step": 10649 + }, + { + "epoch": 1.8373156214957302, + "grad_norm": 0.578125, + "learning_rate": 3.269104830914016e-07, + "loss": 1.4101, + "step": 10650 + }, + { + "epoch": 1.8374881393944622, + "grad_norm": 0.5703125, + "learning_rate": 3.2622057060816004e-07, + "loss": 1.4499, + "step": 10651 + }, + { + "epoch": 1.8376606572931942, + "grad_norm": 0.609375, + "learning_rate": 3.255313748189437e-07, + "loss": 1.4314, + "step": 10652 + }, + { + "epoch": 1.8378331751919261, + "grad_norm": 1.2578125, + "learning_rate": 3.2484289577481286e-07, + "loss": 1.34, + "step": 10653 + }, + { + "epoch": 1.838005693090658, + "grad_norm": 0.62109375, + "learning_rate": 3.241551335267712e-07, + "loss": 1.3133, + "step": 10654 + }, + { + "epoch": 1.83817821098939, + "grad_norm": 0.5703125, + "learning_rate": 3.234680881257779e-07, + "loss": 1.438, + "step": 10655 + }, + { + "epoch": 1.838350728888122, + "grad_norm": 1.1953125, + "learning_rate": 3.2278175962272783e-07, + "loss": 1.4366, + "step": 10656 + }, + { + "epoch": 1.838523246786854, + "grad_norm": 0.56640625, + "learning_rate": 3.2209614806847256e-07, + "loss": 1.4324, + "step": 10657 + }, + { + "epoch": 1.8386957646855862, + "grad_norm": 0.609375, + "learning_rate": 3.2141125351380363e-07, + "loss": 1.4163, + "step": 10658 + }, + { + "epoch": 1.8388682825843181, + "grad_norm": 0.5859375, + "learning_rate": 3.20727076009465e-07, + "loss": 1.4913, + "step": 10659 + }, + { + "epoch": 1.83904080048305, + "grad_norm": 0.55078125, + "learning_rate": 3.200436156061426e-07, + "loss": 1.4095, + "step": 10660 + }, + { + "epoch": 1.8392133183817823, + "grad_norm": 0.55078125, + "learning_rate": 3.1936087235447165e-07, + "loss": 1.3978, + "step": 10661 + }, + { + "epoch": 1.8393858362805142, + "grad_norm": 0.578125, + "learning_rate": 3.186788463050361e-07, + "loss": 1.4577, + "step": 10662 + }, + { + "epoch": 1.8395583541792462, + "grad_norm": 0.60546875, + "learning_rate": 3.1799753750836215e-07, + "loss": 1.4572, + "step": 10663 + }, + { + "epoch": 1.8397308720779781, + "grad_norm": 0.60546875, + "learning_rate": 3.1731694601492834e-07, + "loss": 1.464, + "step": 10664 + }, + { + "epoch": 1.83990338997671, + "grad_norm": 0.609375, + "learning_rate": 3.1663707187515325e-07, + "loss": 1.4048, + "step": 10665 + }, + { + "epoch": 1.840075907875442, + "grad_norm": 0.55078125, + "learning_rate": 3.1595791513941097e-07, + "loss": 1.5117, + "step": 10666 + }, + { + "epoch": 1.840248425774174, + "grad_norm": 0.671875, + "learning_rate": 3.1527947585801246e-07, + "loss": 1.394, + "step": 10667 + }, + { + "epoch": 1.840420943672906, + "grad_norm": 0.5703125, + "learning_rate": 3.146017540812241e-07, + "loss": 1.4003, + "step": 10668 + }, + { + "epoch": 1.840593461571638, + "grad_norm": 0.640625, + "learning_rate": 3.139247498592557e-07, + "loss": 1.4011, + "step": 10669 + }, + { + "epoch": 1.84076597947037, + "grad_norm": 0.60546875, + "learning_rate": 3.1324846324226165e-07, + "loss": 1.3679, + "step": 10670 + }, + { + "epoch": 1.840938497369102, + "grad_norm": 0.59375, + "learning_rate": 3.1257289428034854e-07, + "loss": 1.437, + "step": 10671 + }, + { + "epoch": 1.841111015267834, + "grad_norm": 0.61328125, + "learning_rate": 3.1189804302356517e-07, + "loss": 1.4845, + "step": 10672 + }, + { + "epoch": 1.841283533166566, + "grad_norm": 0.58203125, + "learning_rate": 3.112239095219072e-07, + "loss": 1.4396, + "step": 10673 + }, + { + "epoch": 1.841456051065298, + "grad_norm": 0.6484375, + "learning_rate": 3.105504938253223e-07, + "loss": 1.4616, + "step": 10674 + }, + { + "epoch": 1.8416285689640302, + "grad_norm": 0.56640625, + "learning_rate": 3.098777959836974e-07, + "loss": 1.4047, + "step": 10675 + }, + { + "epoch": 1.8418010868627621, + "grad_norm": 0.5625, + "learning_rate": 3.092058160468736e-07, + "loss": 1.3966, + "step": 10676 + }, + { + "epoch": 1.841973604761494, + "grad_norm": 0.546875, + "learning_rate": 3.085345540646345e-07, + "loss": 1.4355, + "step": 10677 + }, + { + "epoch": 1.842146122660226, + "grad_norm": 0.62109375, + "learning_rate": 3.0786401008670806e-07, + "loss": 1.4064, + "step": 10678 + }, + { + "epoch": 1.842318640558958, + "grad_norm": 0.578125, + "learning_rate": 3.071941841627779e-07, + "loss": 1.3451, + "step": 10679 + }, + { + "epoch": 1.84249115845769, + "grad_norm": 0.5390625, + "learning_rate": 3.065250763424643e-07, + "loss": 1.4058, + "step": 10680 + }, + { + "epoch": 1.842663676356422, + "grad_norm": 0.546875, + "learning_rate": 3.0585668667534097e-07, + "loss": 1.3844, + "step": 10681 + }, + { + "epoch": 1.8428361942551539, + "grad_norm": 0.57421875, + "learning_rate": 3.0518901521092605e-07, + "loss": 1.4213, + "step": 10682 + }, + { + "epoch": 1.8430087121538858, + "grad_norm": 0.59375, + "learning_rate": 3.045220619986844e-07, + "loss": 1.5024, + "step": 10683 + }, + { + "epoch": 1.8431812300526178, + "grad_norm": 0.6171875, + "learning_rate": 3.038558270880287e-07, + "loss": 1.3808, + "step": 10684 + }, + { + "epoch": 1.84335374795135, + "grad_norm": 0.57421875, + "learning_rate": 3.031903105283196e-07, + "loss": 1.3525, + "step": 10685 + }, + { + "epoch": 1.843526265850082, + "grad_norm": 0.5703125, + "learning_rate": 3.025255123688575e-07, + "loss": 1.4255, + "step": 10686 + }, + { + "epoch": 1.843698783748814, + "grad_norm": 0.5703125, + "learning_rate": 3.018614326589009e-07, + "loss": 1.424, + "step": 10687 + }, + { + "epoch": 1.843871301647546, + "grad_norm": 0.5859375, + "learning_rate": 3.011980714476448e-07, + "loss": 1.4183, + "step": 10688 + }, + { + "epoch": 1.844043819546278, + "grad_norm": 0.6640625, + "learning_rate": 3.0053542878423657e-07, + "loss": 1.4809, + "step": 10689 + }, + { + "epoch": 1.84421633744501, + "grad_norm": 0.58203125, + "learning_rate": 2.998735047177692e-07, + "loss": 1.4014, + "step": 10690 + }, + { + "epoch": 1.844388855343742, + "grad_norm": 0.60546875, + "learning_rate": 2.992122992972812e-07, + "loss": 1.3376, + "step": 10691 + }, + { + "epoch": 1.844561373242474, + "grad_norm": 0.60546875, + "learning_rate": 2.9855181257176015e-07, + "loss": 1.4756, + "step": 10692 + }, + { + "epoch": 1.844733891141206, + "grad_norm": 0.5859375, + "learning_rate": 2.978920445901379e-07, + "loss": 1.3947, + "step": 10693 + }, + { + "epoch": 1.8449064090399379, + "grad_norm": 0.5390625, + "learning_rate": 2.9723299540129423e-07, + "loss": 1.4171, + "step": 10694 + }, + { + "epoch": 1.8450789269386698, + "grad_norm": 0.59765625, + "learning_rate": 2.9657466505405573e-07, + "loss": 1.4056, + "step": 10695 + }, + { + "epoch": 1.8452514448374018, + "grad_norm": 0.58203125, + "learning_rate": 2.959170535971978e-07, + "loss": 1.4627, + "step": 10696 + }, + { + "epoch": 1.8454239627361337, + "grad_norm": 0.6328125, + "learning_rate": 2.952601610794359e-07, + "loss": 1.4116, + "step": 10697 + }, + { + "epoch": 1.845596480634866, + "grad_norm": 0.5625, + "learning_rate": 2.9460398754944106e-07, + "loss": 1.4963, + "step": 10698 + }, + { + "epoch": 1.8457689985335979, + "grad_norm": 0.5546875, + "learning_rate": 2.9394853305582337e-07, + "loss": 1.4373, + "step": 10699 + }, + { + "epoch": 1.8459415164323298, + "grad_norm": 0.578125, + "learning_rate": 2.9329379764714615e-07, + "loss": 1.3966, + "step": 10700 + }, + { + "epoch": 1.8459415164323298, + "eval_loss": 1.4070720672607422, + "eval_runtime": 12.0233, + "eval_samples_per_second": 85.168, + "eval_steps_per_second": 21.292, + "step": 10700 + }, + { + "epoch": 1.8461140343310618, + "grad_norm": 0.63671875, + "learning_rate": 2.9263978137191396e-07, + "loss": 1.5311, + "step": 10701 + }, + { + "epoch": 1.846286552229794, + "grad_norm": 0.61328125, + "learning_rate": 2.919864842785802e-07, + "loss": 1.4487, + "step": 10702 + }, + { + "epoch": 1.846459070128526, + "grad_norm": 0.6484375, + "learning_rate": 2.9133390641554736e-07, + "loss": 1.4173, + "step": 10703 + }, + { + "epoch": 1.846631588027258, + "grad_norm": 0.66015625, + "learning_rate": 2.9068204783116227e-07, + "loss": 1.4152, + "step": 10704 + }, + { + "epoch": 1.8468041059259899, + "grad_norm": 0.55859375, + "learning_rate": 2.900309085737152e-07, + "loss": 1.4875, + "step": 10705 + }, + { + "epoch": 1.8469766238247218, + "grad_norm": 0.59765625, + "learning_rate": 2.8938048869145087e-07, + "loss": 1.4071, + "step": 10706 + }, + { + "epoch": 1.8471491417234538, + "grad_norm": 0.57421875, + "learning_rate": 2.8873078823255297e-07, + "loss": 1.4231, + "step": 10707 + }, + { + "epoch": 1.8473216596221858, + "grad_norm": 0.57421875, + "learning_rate": 2.8808180724515856e-07, + "loss": 1.3817, + "step": 10708 + }, + { + "epoch": 1.8474941775209177, + "grad_norm": 0.55859375, + "learning_rate": 2.8743354577734805e-07, + "loss": 1.3916, + "step": 10709 + }, + { + "epoch": 1.8476666954196497, + "grad_norm": 0.671875, + "learning_rate": 2.8678600387714417e-07, + "loss": 1.3462, + "step": 10710 + }, + { + "epoch": 1.8478392133183816, + "grad_norm": 0.60546875, + "learning_rate": 2.8613918159252627e-07, + "loss": 1.5002, + "step": 10711 + }, + { + "epoch": 1.8480117312171138, + "grad_norm": 0.5859375, + "learning_rate": 2.8549307897141274e-07, + "loss": 1.3669, + "step": 10712 + }, + { + "epoch": 1.8481842491158458, + "grad_norm": 0.5625, + "learning_rate": 2.8484769606167085e-07, + "loss": 1.4325, + "step": 10713 + }, + { + "epoch": 1.8483567670145777, + "grad_norm": 0.71484375, + "learning_rate": 2.842030329111134e-07, + "loss": 1.4155, + "step": 10714 + }, + { + "epoch": 1.8485292849133097, + "grad_norm": 0.5703125, + "learning_rate": 2.8355908956750335e-07, + "loss": 1.4419, + "step": 10715 + }, + { + "epoch": 1.8487018028120419, + "grad_norm": 0.609375, + "learning_rate": 2.8291586607854693e-07, + "loss": 1.3997, + "step": 10716 + }, + { + "epoch": 1.8488743207107738, + "grad_norm": 0.55859375, + "learning_rate": 2.822733624918994e-07, + "loss": 1.3909, + "step": 10717 + }, + { + "epoch": 1.8490468386095058, + "grad_norm": 0.609375, + "learning_rate": 2.8163157885515824e-07, + "loss": 1.4904, + "step": 10718 + }, + { + "epoch": 1.8492193565082378, + "grad_norm": 0.77734375, + "learning_rate": 2.809905152158754e-07, + "loss": 1.435, + "step": 10719 + }, + { + "epoch": 1.8493918744069697, + "grad_norm": 0.6953125, + "learning_rate": 2.8035017162154063e-07, + "loss": 1.4916, + "step": 10720 + }, + { + "epoch": 1.8495643923057017, + "grad_norm": 0.62890625, + "learning_rate": 2.7971054811959717e-07, + "loss": 1.481, + "step": 10721 + }, + { + "epoch": 1.8497369102044336, + "grad_norm": 0.56640625, + "learning_rate": 2.790716447574304e-07, + "loss": 1.4353, + "step": 10722 + }, + { + "epoch": 1.8499094281031656, + "grad_norm": 0.58984375, + "learning_rate": 2.7843346158237586e-07, + "loss": 1.3761, + "step": 10723 + }, + { + "epoch": 1.8500819460018976, + "grad_norm": 0.56640625, + "learning_rate": 2.777959986417134e-07, + "loss": 1.4107, + "step": 10724 + }, + { + "epoch": 1.8502544639006295, + "grad_norm": 0.6015625, + "learning_rate": 2.771592559826708e-07, + "loss": 1.4496, + "step": 10725 + }, + { + "epoch": 1.8504269817993617, + "grad_norm": 0.58984375, + "learning_rate": 2.765232336524215e-07, + "loss": 1.4275, + "step": 10726 + }, + { + "epoch": 1.8505994996980937, + "grad_norm": 0.5859375, + "learning_rate": 2.758879316980867e-07, + "loss": 1.4414, + "step": 10727 + }, + { + "epoch": 1.8507720175968256, + "grad_norm": 0.5625, + "learning_rate": 2.7525335016673315e-07, + "loss": 1.4265, + "step": 10728 + }, + { + "epoch": 1.8509445354955578, + "grad_norm": 0.6171875, + "learning_rate": 2.746194891053733e-07, + "loss": 1.4261, + "step": 10729 + }, + { + "epoch": 1.8511170533942898, + "grad_norm": 0.5703125, + "learning_rate": 2.739863485609695e-07, + "loss": 1.4396, + "step": 10730 + }, + { + "epoch": 1.8512895712930217, + "grad_norm": 0.59375, + "learning_rate": 2.7335392858042764e-07, + "loss": 1.4106, + "step": 10731 + }, + { + "epoch": 1.8514620891917537, + "grad_norm": 0.59375, + "learning_rate": 2.727222292106024e-07, + "loss": 1.3709, + "step": 10732 + }, + { + "epoch": 1.8516346070904857, + "grad_norm": 0.57421875, + "learning_rate": 2.72091250498292e-07, + "loss": 1.4935, + "step": 10733 + }, + { + "epoch": 1.8518071249892176, + "grad_norm": 0.55078125, + "learning_rate": 2.714609924902445e-07, + "loss": 1.4064, + "step": 10734 + }, + { + "epoch": 1.8519796428879496, + "grad_norm": 0.58984375, + "learning_rate": 2.708314552331548e-07, + "loss": 1.5138, + "step": 10735 + }, + { + "epoch": 1.8521521607866815, + "grad_norm": 0.5546875, + "learning_rate": 2.7020263877366005e-07, + "loss": 1.3863, + "step": 10736 + }, + { + "epoch": 1.8523246786854135, + "grad_norm": 0.6015625, + "learning_rate": 2.6957454315834965e-07, + "loss": 1.4406, + "step": 10737 + }, + { + "epoch": 1.8524971965841455, + "grad_norm": 0.6015625, + "learning_rate": 2.6894716843375523e-07, + "loss": 1.5074, + "step": 10738 + }, + { + "epoch": 1.8526697144828776, + "grad_norm": 0.5703125, + "learning_rate": 2.6832051464635636e-07, + "loss": 1.3569, + "step": 10739 + }, + { + "epoch": 1.8528422323816096, + "grad_norm": 0.59375, + "learning_rate": 2.6769458184258134e-07, + "loss": 1.4224, + "step": 10740 + }, + { + "epoch": 1.8530147502803416, + "grad_norm": 0.6015625, + "learning_rate": 2.6706937006880095e-07, + "loss": 1.4153, + "step": 10741 + }, + { + "epoch": 1.8531872681790735, + "grad_norm": 0.578125, + "learning_rate": 2.664448793713348e-07, + "loss": 1.5044, + "step": 10742 + }, + { + "epoch": 1.8533597860778057, + "grad_norm": 0.58203125, + "learning_rate": 2.6582110979645246e-07, + "loss": 1.5223, + "step": 10743 + }, + { + "epoch": 1.8535323039765377, + "grad_norm": 0.59375, + "learning_rate": 2.651980613903626e-07, + "loss": 1.4345, + "step": 10744 + }, + { + "epoch": 1.8537048218752696, + "grad_norm": 0.62890625, + "learning_rate": 2.645757341992261e-07, + "loss": 1.3272, + "step": 10745 + }, + { + "epoch": 1.8538773397740016, + "grad_norm": 0.58203125, + "learning_rate": 2.6395412826915046e-07, + "loss": 1.3986, + "step": 10746 + }, + { + "epoch": 1.8540498576727336, + "grad_norm": 0.578125, + "learning_rate": 2.6333324364618553e-07, + "loss": 1.3335, + "step": 10747 + }, + { + "epoch": 1.8542223755714655, + "grad_norm": 0.5625, + "learning_rate": 2.6271308037633113e-07, + "loss": 1.3878, + "step": 10748 + }, + { + "epoch": 1.8543948934701975, + "grad_norm": 0.56640625, + "learning_rate": 2.6209363850553393e-07, + "loss": 1.3827, + "step": 10749 + }, + { + "epoch": 1.8545674113689294, + "grad_norm": 0.64453125, + "learning_rate": 2.6147491807968385e-07, + "loss": 1.4321, + "step": 10750 + }, + { + "epoch": 1.8547399292676614, + "grad_norm": 0.578125, + "learning_rate": 2.6085691914462306e-07, + "loss": 1.4484, + "step": 10751 + }, + { + "epoch": 1.8549124471663934, + "grad_norm": 0.61328125, + "learning_rate": 2.6023964174613393e-07, + "loss": 1.3907, + "step": 10752 + }, + { + "epoch": 1.8550849650651255, + "grad_norm": 0.609375, + "learning_rate": 2.596230859299487e-07, + "loss": 1.3456, + "step": 10753 + }, + { + "epoch": 1.8552574829638575, + "grad_norm": 0.69921875, + "learning_rate": 2.5900725174174524e-07, + "loss": 1.347, + "step": 10754 + }, + { + "epoch": 1.8554300008625895, + "grad_norm": 0.58203125, + "learning_rate": 2.5839213922714936e-07, + "loss": 1.4167, + "step": 10755 + }, + { + "epoch": 1.8556025187613217, + "grad_norm": 0.5546875, + "learning_rate": 2.5777774843173233e-07, + "loss": 1.3874, + "step": 10756 + }, + { + "epoch": 1.8557750366600536, + "grad_norm": 0.55078125, + "learning_rate": 2.5716407940101217e-07, + "loss": 1.4425, + "step": 10757 + }, + { + "epoch": 1.8559475545587856, + "grad_norm": 0.69140625, + "learning_rate": 2.565511321804537e-07, + "loss": 1.2792, + "step": 10758 + }, + { + "epoch": 1.8561200724575175, + "grad_norm": 0.5703125, + "learning_rate": 2.559389068154661e-07, + "loss": 1.4789, + "step": 10759 + }, + { + "epoch": 1.8562925903562495, + "grad_norm": 0.61328125, + "learning_rate": 2.553274033514097e-07, + "loss": 1.5425, + "step": 10760 + }, + { + "epoch": 1.8564651082549815, + "grad_norm": 0.62109375, + "learning_rate": 2.5471662183358394e-07, + "loss": 1.4059, + "step": 10761 + }, + { + "epoch": 1.8566376261537134, + "grad_norm": 0.56640625, + "learning_rate": 2.5410656230724475e-07, + "loss": 1.3846, + "step": 10762 + }, + { + "epoch": 1.8568101440524454, + "grad_norm": 0.5859375, + "learning_rate": 2.5349722481758487e-07, + "loss": 1.427, + "step": 10763 + }, + { + "epoch": 1.8569826619511773, + "grad_norm": 0.6015625, + "learning_rate": 2.5288860940975046e-07, + "loss": 1.476, + "step": 10764 + }, + { + "epoch": 1.8571551798499093, + "grad_norm": 0.53515625, + "learning_rate": 2.52280716128831e-07, + "loss": 1.2507, + "step": 10765 + }, + { + "epoch": 1.8573276977486413, + "grad_norm": 0.55078125, + "learning_rate": 2.516735450198615e-07, + "loss": 1.4192, + "step": 10766 + }, + { + "epoch": 1.8575002156473734, + "grad_norm": 0.59765625, + "learning_rate": 2.510670961278272e-07, + "loss": 1.477, + "step": 10767 + }, + { + "epoch": 1.8576727335461054, + "grad_norm": 0.7265625, + "learning_rate": 2.5046136949765544e-07, + "loss": 1.5125, + "step": 10768 + }, + { + "epoch": 1.8578452514448374, + "grad_norm": 0.5703125, + "learning_rate": 2.4985636517422365e-07, + "loss": 1.3664, + "step": 10769 + }, + { + "epoch": 1.8580177693435695, + "grad_norm": 0.55859375, + "learning_rate": 2.4925208320235593e-07, + "loss": 1.428, + "step": 10770 + }, + { + "epoch": 1.8581902872423015, + "grad_norm": 0.5859375, + "learning_rate": 2.486485236268166e-07, + "loss": 1.4213, + "step": 10771 + }, + { + "epoch": 1.8583628051410335, + "grad_norm": 0.609375, + "learning_rate": 2.4804568649232643e-07, + "loss": 1.4684, + "step": 10772 + }, + { + "epoch": 1.8585353230397654, + "grad_norm": 0.5625, + "learning_rate": 2.4744357184354305e-07, + "loss": 1.3504, + "step": 10773 + }, + { + "epoch": 1.8587078409384974, + "grad_norm": 0.58984375, + "learning_rate": 2.468421797250764e-07, + "loss": 1.4598, + "step": 10774 + }, + { + "epoch": 1.8588803588372294, + "grad_norm": 0.69921875, + "learning_rate": 2.46241510181483e-07, + "loss": 1.4404, + "step": 10775 + }, + { + "epoch": 1.8590528767359613, + "grad_norm": 0.58984375, + "learning_rate": 2.456415632572617e-07, + "loss": 1.4462, + "step": 10776 + }, + { + "epoch": 1.8592253946346933, + "grad_norm": 0.62109375, + "learning_rate": 2.450423389968626e-07, + "loss": 1.4246, + "step": 10777 + }, + { + "epoch": 1.8593979125334252, + "grad_norm": 0.57421875, + "learning_rate": 2.444438374446778e-07, + "loss": 1.4082, + "step": 10778 + }, + { + "epoch": 1.8595704304321572, + "grad_norm": 0.55078125, + "learning_rate": 2.4384605864504973e-07, + "loss": 1.4425, + "step": 10779 + }, + { + "epoch": 1.8597429483308894, + "grad_norm": 0.6015625, + "learning_rate": 2.4324900264226405e-07, + "loss": 1.4271, + "step": 10780 + }, + { + "epoch": 1.8599154662296213, + "grad_norm": 0.56640625, + "learning_rate": 2.426526694805564e-07, + "loss": 1.5036, + "step": 10781 + }, + { + "epoch": 1.8600879841283533, + "grad_norm": 0.5859375, + "learning_rate": 2.420570592041038e-07, + "loss": 1.3449, + "step": 10782 + }, + { + "epoch": 1.8602605020270853, + "grad_norm": 0.6796875, + "learning_rate": 2.4146217185703755e-07, + "loss": 1.4024, + "step": 10783 + }, + { + "epoch": 1.8604330199258174, + "grad_norm": 0.72265625, + "learning_rate": 2.4086800748342577e-07, + "loss": 1.5128, + "step": 10784 + }, + { + "epoch": 1.8606055378245494, + "grad_norm": 0.59765625, + "learning_rate": 2.4027456612728985e-07, + "loss": 1.537, + "step": 10785 + }, + { + "epoch": 1.8607780557232814, + "grad_norm": 0.58203125, + "learning_rate": 2.396818478325968e-07, + "loss": 1.362, + "step": 10786 + }, + { + "epoch": 1.8609505736220133, + "grad_norm": 0.62109375, + "learning_rate": 2.3908985264325614e-07, + "loss": 1.4572, + "step": 10787 + }, + { + "epoch": 1.8611230915207453, + "grad_norm": 0.71484375, + "learning_rate": 2.3849858060312924e-07, + "loss": 1.4654, + "step": 10788 + }, + { + "epoch": 1.8612956094194772, + "grad_norm": 0.55078125, + "learning_rate": 2.3790803175602007e-07, + "loss": 1.4219, + "step": 10789 + }, + { + "epoch": 1.8614681273182092, + "grad_norm": 0.58203125, + "learning_rate": 2.3731820614568023e-07, + "loss": 1.396, + "step": 10790 + }, + { + "epoch": 1.8616406452169412, + "grad_norm": 0.640625, + "learning_rate": 2.3672910381580817e-07, + "loss": 1.4431, + "step": 10791 + }, + { + "epoch": 1.8618131631156731, + "grad_norm": 0.6015625, + "learning_rate": 2.3614072481004778e-07, + "loss": 1.4451, + "step": 10792 + }, + { + "epoch": 1.861985681014405, + "grad_norm": 0.58203125, + "learning_rate": 2.3555306917198872e-07, + "loss": 1.2986, + "step": 10793 + }, + { + "epoch": 1.8621581989131373, + "grad_norm": 0.5859375, + "learning_rate": 2.3496613694517056e-07, + "loss": 1.4842, + "step": 10794 + }, + { + "epoch": 1.8623307168118692, + "grad_norm": 0.54296875, + "learning_rate": 2.343799281730741e-07, + "loss": 1.3365, + "step": 10795 + }, + { + "epoch": 1.8625032347106012, + "grad_norm": 0.55859375, + "learning_rate": 2.3379444289913344e-07, + "loss": 1.4792, + "step": 10796 + }, + { + "epoch": 1.8626757526093334, + "grad_norm": 0.5546875, + "learning_rate": 2.3320968116672172e-07, + "loss": 1.3473, + "step": 10797 + }, + { + "epoch": 1.8628482705080653, + "grad_norm": 0.5546875, + "learning_rate": 2.32625643019162e-07, + "loss": 1.4253, + "step": 10798 + }, + { + "epoch": 1.8630207884067973, + "grad_norm": 0.61328125, + "learning_rate": 2.320423284997242e-07, + "loss": 1.4237, + "step": 10799 + }, + { + "epoch": 1.8631933063055293, + "grad_norm": 0.54296875, + "learning_rate": 2.3145973765162367e-07, + "loss": 1.4938, + "step": 10800 + }, + { + "epoch": 1.8631933063055293, + "eval_loss": 1.4070444107055664, + "eval_runtime": 10.8998, + "eval_samples_per_second": 93.946, + "eval_steps_per_second": 23.487, + "step": 10800 + }, + { + "epoch": 1.8633658242042612, + "grad_norm": 0.56640625, + "learning_rate": 2.3087787051802146e-07, + "loss": 1.4087, + "step": 10801 + }, + { + "epoch": 1.8635383421029932, + "grad_norm": 0.58203125, + "learning_rate": 2.3029672714202866e-07, + "loss": 1.3573, + "step": 10802 + }, + { + "epoch": 1.8637108600017251, + "grad_norm": 0.56640625, + "learning_rate": 2.2971630756669637e-07, + "loss": 1.3931, + "step": 10803 + }, + { + "epoch": 1.863883377900457, + "grad_norm": 0.55078125, + "learning_rate": 2.291366118350302e-07, + "loss": 1.4099, + "step": 10804 + }, + { + "epoch": 1.864055895799189, + "grad_norm": 0.5703125, + "learning_rate": 2.285576399899736e-07, + "loss": 1.4181, + "step": 10805 + }, + { + "epoch": 1.864228413697921, + "grad_norm": 0.55078125, + "learning_rate": 2.2797939207442e-07, + "loss": 1.3713, + "step": 10806 + }, + { + "epoch": 1.864400931596653, + "grad_norm": 0.54296875, + "learning_rate": 2.2740186813121402e-07, + "loss": 1.46, + "step": 10807 + }, + { + "epoch": 1.8645734494953852, + "grad_norm": 0.5859375, + "learning_rate": 2.268250682031392e-07, + "loss": 1.3744, + "step": 10808 + }, + { + "epoch": 1.8647459673941171, + "grad_norm": 0.5859375, + "learning_rate": 2.2624899233292807e-07, + "loss": 1.3474, + "step": 10809 + }, + { + "epoch": 1.864918485292849, + "grad_norm": 0.5859375, + "learning_rate": 2.256736405632609e-07, + "loss": 1.5073, + "step": 10810 + }, + { + "epoch": 1.8650910031915813, + "grad_norm": 0.59375, + "learning_rate": 2.250990129367636e-07, + "loss": 1.427, + "step": 10811 + }, + { + "epoch": 1.8652635210903132, + "grad_norm": 0.56640625, + "learning_rate": 2.245251094960077e-07, + "loss": 1.3456, + "step": 10812 + }, + { + "epoch": 1.8654360389890452, + "grad_norm": 0.91015625, + "learning_rate": 2.2395193028351247e-07, + "loss": 1.5013, + "step": 10813 + }, + { + "epoch": 1.8656085568877772, + "grad_norm": 0.59765625, + "learning_rate": 2.2337947534174064e-07, + "loss": 1.5743, + "step": 10814 + }, + { + "epoch": 1.8657810747865091, + "grad_norm": 0.5625, + "learning_rate": 2.2280774471310496e-07, + "loss": 1.3436, + "step": 10815 + }, + { + "epoch": 1.865953592685241, + "grad_norm": 0.55859375, + "learning_rate": 2.2223673843996263e-07, + "loss": 1.4284, + "step": 10816 + }, + { + "epoch": 1.866126110583973, + "grad_norm": 0.59765625, + "learning_rate": 2.216664565646165e-07, + "loss": 1.449, + "step": 10817 + }, + { + "epoch": 1.866298628482705, + "grad_norm": 0.578125, + "learning_rate": 2.210968991293172e-07, + "loss": 1.39, + "step": 10818 + }, + { + "epoch": 1.866471146381437, + "grad_norm": 0.5546875, + "learning_rate": 2.20528066176261e-07, + "loss": 1.4709, + "step": 10819 + }, + { + "epoch": 1.866643664280169, + "grad_norm": 0.5703125, + "learning_rate": 2.1995995774759082e-07, + "loss": 1.3929, + "step": 10820 + }, + { + "epoch": 1.866816182178901, + "grad_norm": 0.6015625, + "learning_rate": 2.1939257388539525e-07, + "loss": 1.4292, + "step": 10821 + }, + { + "epoch": 1.866988700077633, + "grad_norm": 0.5703125, + "learning_rate": 2.1882591463170956e-07, + "loss": 1.3822, + "step": 10822 + }, + { + "epoch": 1.867161217976365, + "grad_norm": 0.59375, + "learning_rate": 2.1825998002851566e-07, + "loss": 1.4464, + "step": 10823 + }, + { + "epoch": 1.867333735875097, + "grad_norm": 0.5859375, + "learning_rate": 2.1769477011774232e-07, + "loss": 1.3908, + "step": 10824 + }, + { + "epoch": 1.8675062537738292, + "grad_norm": 0.6015625, + "learning_rate": 2.1713028494126265e-07, + "loss": 1.3778, + "step": 10825 + }, + { + "epoch": 1.8676787716725611, + "grad_norm": 0.5703125, + "learning_rate": 2.1656652454089878e-07, + "loss": 1.4023, + "step": 10826 + }, + { + "epoch": 1.867851289571293, + "grad_norm": 0.640625, + "learning_rate": 2.1600348895841394e-07, + "loss": 1.5256, + "step": 10827 + }, + { + "epoch": 1.868023807470025, + "grad_norm": 0.5390625, + "learning_rate": 2.1544117823552592e-07, + "loss": 1.4666, + "step": 10828 + }, + { + "epoch": 1.868196325368757, + "grad_norm": 0.5703125, + "learning_rate": 2.1487959241389244e-07, + "loss": 1.4394, + "step": 10829 + }, + { + "epoch": 1.868368843267489, + "grad_norm": 0.59375, + "learning_rate": 2.1431873153511807e-07, + "loss": 1.4317, + "step": 10830 + }, + { + "epoch": 1.868541361166221, + "grad_norm": 0.59765625, + "learning_rate": 2.1375859564075508e-07, + "loss": 1.5176, + "step": 10831 + }, + { + "epoch": 1.868713879064953, + "grad_norm": 0.6796875, + "learning_rate": 2.131991847723036e-07, + "loss": 1.3555, + "step": 10832 + }, + { + "epoch": 1.8688863969636849, + "grad_norm": 0.59765625, + "learning_rate": 2.126404989712072e-07, + "loss": 1.4658, + "step": 10833 + }, + { + "epoch": 1.8690589148624168, + "grad_norm": 0.55078125, + "learning_rate": 2.1208253827885828e-07, + "loss": 1.4349, + "step": 10834 + }, + { + "epoch": 1.869231432761149, + "grad_norm": 0.5859375, + "learning_rate": 2.1152530273658932e-07, + "loss": 1.4306, + "step": 10835 + }, + { + "epoch": 1.869403950659881, + "grad_norm": 0.59375, + "learning_rate": 2.1096879238569068e-07, + "loss": 1.4883, + "step": 10836 + }, + { + "epoch": 1.869576468558613, + "grad_norm": 0.5625, + "learning_rate": 2.10413007267386e-07, + "loss": 1.3261, + "step": 10837 + }, + { + "epoch": 1.869748986457345, + "grad_norm": 0.6484375, + "learning_rate": 2.098579474228546e-07, + "loss": 1.4961, + "step": 10838 + }, + { + "epoch": 1.869921504356077, + "grad_norm": 0.53515625, + "learning_rate": 2.093036128932191e-07, + "loss": 1.4159, + "step": 10839 + }, + { + "epoch": 1.870094022254809, + "grad_norm": 0.6015625, + "learning_rate": 2.0875000371954557e-07, + "loss": 1.4955, + "step": 10840 + }, + { + "epoch": 1.870266540153541, + "grad_norm": 0.62109375, + "learning_rate": 2.081971199428512e-07, + "loss": 1.4283, + "step": 10841 + }, + { + "epoch": 1.870439058052273, + "grad_norm": 0.5625, + "learning_rate": 2.0764496160409653e-07, + "loss": 1.3329, + "step": 10842 + }, + { + "epoch": 1.870611575951005, + "grad_norm": 0.640625, + "learning_rate": 2.0709352874418777e-07, + "loss": 1.5467, + "step": 10843 + }, + { + "epoch": 1.8707840938497369, + "grad_norm": 0.60546875, + "learning_rate": 2.0654282140397996e-07, + "loss": 1.4522, + "step": 10844 + }, + { + "epoch": 1.8709566117484688, + "grad_norm": 0.6015625, + "learning_rate": 2.0599283962427274e-07, + "loss": 1.4411, + "step": 10845 + }, + { + "epoch": 1.8711291296472008, + "grad_norm": 0.578125, + "learning_rate": 2.0544358344580905e-07, + "loss": 1.363, + "step": 10846 + }, + { + "epoch": 1.8713016475459328, + "grad_norm": 0.62109375, + "learning_rate": 2.0489505290928747e-07, + "loss": 1.4794, + "step": 10847 + }, + { + "epoch": 1.871474165444665, + "grad_norm": 0.63671875, + "learning_rate": 2.04347248055341e-07, + "loss": 1.4492, + "step": 10848 + }, + { + "epoch": 1.871646683343397, + "grad_norm": 1.1796875, + "learning_rate": 2.0380016892455611e-07, + "loss": 1.4276, + "step": 10849 + }, + { + "epoch": 1.8718192012421289, + "grad_norm": 0.59765625, + "learning_rate": 2.0325381555746483e-07, + "loss": 1.5, + "step": 10850 + }, + { + "epoch": 1.8719917191408608, + "grad_norm": 0.56640625, + "learning_rate": 2.0270818799454472e-07, + "loss": 1.4219, + "step": 10851 + }, + { + "epoch": 1.872164237039593, + "grad_norm": 0.6171875, + "learning_rate": 2.0216328627621685e-07, + "loss": 1.5137, + "step": 10852 + }, + { + "epoch": 1.872336754938325, + "grad_norm": 0.62109375, + "learning_rate": 2.016191104428533e-07, + "loss": 1.3509, + "step": 10853 + }, + { + "epoch": 1.872509272837057, + "grad_norm": 0.56640625, + "learning_rate": 2.0107566053476856e-07, + "loss": 1.4113, + "step": 10854 + }, + { + "epoch": 1.8726817907357889, + "grad_norm": 0.5703125, + "learning_rate": 2.0053293659222595e-07, + "loss": 1.4339, + "step": 10855 + }, + { + "epoch": 1.8728543086345208, + "grad_norm": 0.57421875, + "learning_rate": 1.999909386554333e-07, + "loss": 1.5374, + "step": 10856 + }, + { + "epoch": 1.8730268265332528, + "grad_norm": 0.62109375, + "learning_rate": 1.9944966676454402e-07, + "loss": 1.4512, + "step": 10857 + }, + { + "epoch": 1.8731993444319848, + "grad_norm": 0.62109375, + "learning_rate": 1.9890912095966274e-07, + "loss": 1.3596, + "step": 10858 + }, + { + "epoch": 1.8733718623307167, + "grad_norm": 0.89453125, + "learning_rate": 1.9836930128083076e-07, + "loss": 1.3449, + "step": 10859 + }, + { + "epoch": 1.8735443802294487, + "grad_norm": 0.65625, + "learning_rate": 1.9783020776804718e-07, + "loss": 1.3641, + "step": 10860 + }, + { + "epoch": 1.8737168981281807, + "grad_norm": 0.56640625, + "learning_rate": 1.9729184046124673e-07, + "loss": 1.4446, + "step": 10861 + }, + { + "epoch": 1.8738894160269128, + "grad_norm": 0.625, + "learning_rate": 1.9675419940031748e-07, + "loss": 1.4485, + "step": 10862 + }, + { + "epoch": 1.8740619339256448, + "grad_norm": 0.54296875, + "learning_rate": 1.9621728462508981e-07, + "loss": 1.4549, + "step": 10863 + }, + { + "epoch": 1.8742344518243768, + "grad_norm": 0.56640625, + "learning_rate": 1.9568109617534304e-07, + "loss": 1.4242, + "step": 10864 + }, + { + "epoch": 1.8744069697231087, + "grad_norm": 0.56640625, + "learning_rate": 1.951456340908009e-07, + "loss": 1.4486, + "step": 10865 + }, + { + "epoch": 1.874579487621841, + "grad_norm": 0.60546875, + "learning_rate": 1.9461089841113502e-07, + "loss": 1.4479, + "step": 10866 + }, + { + "epoch": 1.8747520055205729, + "grad_norm": 0.5546875, + "learning_rate": 1.9407688917595923e-07, + "loss": 1.4525, + "step": 10867 + }, + { + "epoch": 1.8749245234193048, + "grad_norm": 0.5625, + "learning_rate": 1.9354360642483862e-07, + "loss": 1.3623, + "step": 10868 + }, + { + "epoch": 1.8750970413180368, + "grad_norm": 0.60546875, + "learning_rate": 1.9301105019728038e-07, + "loss": 1.4158, + "step": 10869 + }, + { + "epoch": 1.8752695592167687, + "grad_norm": 0.546875, + "learning_rate": 1.9247922053273972e-07, + "loss": 1.4427, + "step": 10870 + }, + { + "epoch": 1.8754420771155007, + "grad_norm": 0.5859375, + "learning_rate": 1.9194811747062058e-07, + "loss": 1.3908, + "step": 10871 + }, + { + "epoch": 1.8756145950142327, + "grad_norm": 0.5546875, + "learning_rate": 1.914177410502671e-07, + "loss": 1.4076, + "step": 10872 + }, + { + "epoch": 1.8757871129129646, + "grad_norm": 1.0859375, + "learning_rate": 1.9088809131097562e-07, + "loss": 1.4296, + "step": 10873 + }, + { + "epoch": 1.8759596308116966, + "grad_norm": 0.6484375, + "learning_rate": 1.9035916829198255e-07, + "loss": 1.4531, + "step": 10874 + }, + { + "epoch": 1.8761321487104285, + "grad_norm": 0.578125, + "learning_rate": 1.8983097203247647e-07, + "loss": 1.4507, + "step": 10875 + }, + { + "epoch": 1.8763046666091607, + "grad_norm": 0.54296875, + "learning_rate": 1.8930350257158946e-07, + "loss": 1.4777, + "step": 10876 + }, + { + "epoch": 1.8764771845078927, + "grad_norm": 0.58984375, + "learning_rate": 1.8877675994839918e-07, + "loss": 1.513, + "step": 10877 + }, + { + "epoch": 1.8766497024066247, + "grad_norm": 0.5859375, + "learning_rate": 1.882507442019288e-07, + "loss": 1.4396, + "step": 10878 + }, + { + "epoch": 1.8768222203053568, + "grad_norm": 0.60546875, + "learning_rate": 1.877254553711505e-07, + "loss": 1.4054, + "step": 10879 + }, + { + "epoch": 1.8769947382040888, + "grad_norm": 0.578125, + "learning_rate": 1.8720089349498093e-07, + "loss": 1.4699, + "step": 10880 + }, + { + "epoch": 1.8771672561028208, + "grad_norm": 0.56640625, + "learning_rate": 1.8667705861228126e-07, + "loss": 1.3528, + "step": 10881 + }, + { + "epoch": 1.8773397740015527, + "grad_norm": 0.6015625, + "learning_rate": 1.861539507618626e-07, + "loss": 1.412, + "step": 10882 + }, + { + "epoch": 1.8775122919002847, + "grad_norm": 0.58203125, + "learning_rate": 1.8563156998247844e-07, + "loss": 1.3684, + "step": 10883 + }, + { + "epoch": 1.8776848097990166, + "grad_norm": 0.578125, + "learning_rate": 1.8510991631283003e-07, + "loss": 1.3471, + "step": 10884 + }, + { + "epoch": 1.8778573276977486, + "grad_norm": 0.578125, + "learning_rate": 1.8458898979156536e-07, + "loss": 1.5138, + "step": 10885 + }, + { + "epoch": 1.8780298455964806, + "grad_norm": 0.56640625, + "learning_rate": 1.840687904572791e-07, + "loss": 1.422, + "step": 10886 + }, + { + "epoch": 1.8782023634952125, + "grad_norm": 0.62890625, + "learning_rate": 1.835493183485082e-07, + "loss": 1.4165, + "step": 10887 + }, + { + "epoch": 1.8783748813939445, + "grad_norm": 0.55859375, + "learning_rate": 1.8303057350374077e-07, + "loss": 1.4187, + "step": 10888 + }, + { + "epoch": 1.8785473992926767, + "grad_norm": 0.609375, + "learning_rate": 1.8251255596140716e-07, + "loss": 1.3732, + "step": 10889 + }, + { + "epoch": 1.8787199171914086, + "grad_norm": 0.578125, + "learning_rate": 1.819952657598867e-07, + "loss": 1.3429, + "step": 10890 + }, + { + "epoch": 1.8788924350901406, + "grad_norm": 0.5703125, + "learning_rate": 1.8147870293750092e-07, + "loss": 1.3351, + "step": 10891 + }, + { + "epoch": 1.8790649529888725, + "grad_norm": 0.5546875, + "learning_rate": 1.8096286753252368e-07, + "loss": 1.3576, + "step": 10892 + }, + { + "epoch": 1.8792374708876047, + "grad_norm": 0.640625, + "learning_rate": 1.8044775958316884e-07, + "loss": 1.4096, + "step": 10893 + }, + { + "epoch": 1.8794099887863367, + "grad_norm": 0.5703125, + "learning_rate": 1.7993337912759924e-07, + "loss": 1.4544, + "step": 10894 + }, + { + "epoch": 1.8795825066850687, + "grad_norm": 0.5703125, + "learning_rate": 1.7941972620392322e-07, + "loss": 1.4095, + "step": 10895 + }, + { + "epoch": 1.8797550245838006, + "grad_norm": 0.5859375, + "learning_rate": 1.7890680085019597e-07, + "loss": 1.4747, + "step": 10896 + }, + { + "epoch": 1.8799275424825326, + "grad_norm": 0.6171875, + "learning_rate": 1.7839460310441814e-07, + "loss": 1.3743, + "step": 10897 + }, + { + "epoch": 1.8801000603812645, + "grad_norm": 0.5625, + "learning_rate": 1.778831330045372e-07, + "loss": 1.3766, + "step": 10898 + }, + { + "epoch": 1.8802725782799965, + "grad_norm": 0.56640625, + "learning_rate": 1.773723905884428e-07, + "loss": 1.4824, + "step": 10899 + }, + { + "epoch": 1.8804450961787285, + "grad_norm": 0.67578125, + "learning_rate": 1.7686237589397914e-07, + "loss": 1.5453, + "step": 10900 + }, + { + "epoch": 1.8804450961787285, + "eval_loss": 1.4070632457733154, + "eval_runtime": 10.8553, + "eval_samples_per_second": 94.332, + "eval_steps_per_second": 23.583, + "step": 10900 + }, + { + "epoch": 1.8806176140774604, + "grad_norm": 0.56640625, + "learning_rate": 1.76353088958926e-07, + "loss": 1.4266, + "step": 10901 + }, + { + "epoch": 1.8807901319761924, + "grad_norm": 0.7109375, + "learning_rate": 1.7584452982101764e-07, + "loss": 1.5084, + "step": 10902 + }, + { + "epoch": 1.8809626498749246, + "grad_norm": 0.5546875, + "learning_rate": 1.7533669851793166e-07, + "loss": 1.3585, + "step": 10903 + }, + { + "epoch": 1.8811351677736565, + "grad_norm": 0.57421875, + "learning_rate": 1.7482959508729024e-07, + "loss": 1.4781, + "step": 10904 + }, + { + "epoch": 1.8813076856723885, + "grad_norm": 0.65234375, + "learning_rate": 1.743232195666622e-07, + "loss": 1.4678, + "step": 10905 + }, + { + "epoch": 1.8814802035711207, + "grad_norm": 0.5703125, + "learning_rate": 1.738175719935642e-07, + "loss": 1.4632, + "step": 10906 + }, + { + "epoch": 1.8816527214698526, + "grad_norm": 0.65625, + "learning_rate": 1.7331265240545624e-07, + "loss": 1.3286, + "step": 10907 + }, + { + "epoch": 1.8818252393685846, + "grad_norm": 0.6015625, + "learning_rate": 1.7280846083974735e-07, + "loss": 1.4574, + "step": 10908 + }, + { + "epoch": 1.8819977572673166, + "grad_norm": 0.55078125, + "learning_rate": 1.7230499733379202e-07, + "loss": 1.4725, + "step": 10909 + }, + { + "epoch": 1.8821702751660485, + "grad_norm": 0.58984375, + "learning_rate": 1.7180226192488715e-07, + "loss": 1.4448, + "step": 10910 + }, + { + "epoch": 1.8823427930647805, + "grad_norm": 0.609375, + "learning_rate": 1.7130025465028178e-07, + "loss": 1.4886, + "step": 10911 + }, + { + "epoch": 1.8825153109635124, + "grad_norm": 0.55859375, + "learning_rate": 1.7079897554716508e-07, + "loss": 1.4193, + "step": 10912 + }, + { + "epoch": 1.8826878288622444, + "grad_norm": 0.5703125, + "learning_rate": 1.7029842465267622e-07, + "loss": 1.4011, + "step": 10913 + }, + { + "epoch": 1.8828603467609764, + "grad_norm": 0.5703125, + "learning_rate": 1.6979860200389885e-07, + "loss": 1.4604, + "step": 10914 + }, + { + "epoch": 1.8830328646597083, + "grad_norm": 0.60546875, + "learning_rate": 1.692995076378634e-07, + "loss": 1.4078, + "step": 10915 + }, + { + "epoch": 1.8832053825584403, + "grad_norm": 0.59765625, + "learning_rate": 1.6880114159154471e-07, + "loss": 1.4332, + "step": 10916 + }, + { + "epoch": 1.8833779004571725, + "grad_norm": 0.546875, + "learning_rate": 1.6830350390186546e-07, + "loss": 1.3531, + "step": 10917 + }, + { + "epoch": 1.8835504183559044, + "grad_norm": 0.546875, + "learning_rate": 1.6780659460569505e-07, + "loss": 1.3574, + "step": 10918 + }, + { + "epoch": 1.8837229362546364, + "grad_norm": 0.55859375, + "learning_rate": 1.6731041373984513e-07, + "loss": 1.308, + "step": 10919 + }, + { + "epoch": 1.8838954541533686, + "grad_norm": 0.609375, + "learning_rate": 1.6681496134107856e-07, + "loss": 1.4647, + "step": 10920 + }, + { + "epoch": 1.8840679720521005, + "grad_norm": 0.58984375, + "learning_rate": 1.6632023744609925e-07, + "loss": 1.5234, + "step": 10921 + }, + { + "epoch": 1.8842404899508325, + "grad_norm": 0.65625, + "learning_rate": 1.658262420915613e-07, + "loss": 1.4465, + "step": 10922 + }, + { + "epoch": 1.8844130078495644, + "grad_norm": 0.63671875, + "learning_rate": 1.653329753140609e-07, + "loss": 1.3919, + "step": 10923 + }, + { + "epoch": 1.8845855257482964, + "grad_norm": 0.8046875, + "learning_rate": 1.648404371501444e-07, + "loss": 1.3689, + "step": 10924 + }, + { + "epoch": 1.8847580436470284, + "grad_norm": 0.68359375, + "learning_rate": 1.6434862763630156e-07, + "loss": 1.4015, + "step": 10925 + }, + { + "epoch": 1.8849305615457603, + "grad_norm": 0.5703125, + "learning_rate": 1.6385754680896758e-07, + "loss": 1.452, + "step": 10926 + }, + { + "epoch": 1.8851030794444923, + "grad_norm": 0.60546875, + "learning_rate": 1.6336719470452566e-07, + "loss": 1.4592, + "step": 10927 + }, + { + "epoch": 1.8852755973432243, + "grad_norm": 0.578125, + "learning_rate": 1.6287757135930448e-07, + "loss": 1.5216, + "step": 10928 + }, + { + "epoch": 1.8854481152419562, + "grad_norm": 0.5703125, + "learning_rate": 1.6238867680957726e-07, + "loss": 1.3362, + "step": 10929 + }, + { + "epoch": 1.8856206331406884, + "grad_norm": 0.60546875, + "learning_rate": 1.6190051109156613e-07, + "loss": 1.4932, + "step": 10930 + }, + { + "epoch": 1.8857931510394204, + "grad_norm": 0.578125, + "learning_rate": 1.6141307424143549e-07, + "loss": 1.3702, + "step": 10931 + }, + { + "epoch": 1.8859656689381523, + "grad_norm": 0.6015625, + "learning_rate": 1.609263662952998e-07, + "loss": 1.383, + "step": 10932 + }, + { + "epoch": 1.8861381868368843, + "grad_norm": 0.55859375, + "learning_rate": 1.6044038728921575e-07, + "loss": 1.3771, + "step": 10933 + }, + { + "epoch": 1.8863107047356165, + "grad_norm": 0.61328125, + "learning_rate": 1.5995513725918676e-07, + "loss": 1.4453, + "step": 10934 + }, + { + "epoch": 1.8864832226343484, + "grad_norm": 0.58984375, + "learning_rate": 1.5947061624116634e-07, + "loss": 1.4363, + "step": 10935 + }, + { + "epoch": 1.8866557405330804, + "grad_norm": 0.59765625, + "learning_rate": 1.5898682427104905e-07, + "loss": 1.4706, + "step": 10936 + }, + { + "epoch": 1.8868282584318123, + "grad_norm": 0.61328125, + "learning_rate": 1.5850376138467626e-07, + "loss": 1.2775, + "step": 10937 + }, + { + "epoch": 1.8870007763305443, + "grad_norm": 0.5703125, + "learning_rate": 1.5802142761783824e-07, + "loss": 1.5225, + "step": 10938 + }, + { + "epoch": 1.8871732942292763, + "grad_norm": 0.7421875, + "learning_rate": 1.5753982300626859e-07, + "loss": 1.3915, + "step": 10939 + }, + { + "epoch": 1.8873458121280082, + "grad_norm": 0.55078125, + "learning_rate": 1.5705894758564654e-07, + "loss": 1.4447, + "step": 10940 + }, + { + "epoch": 1.8875183300267402, + "grad_norm": 0.56640625, + "learning_rate": 1.565788013916003e-07, + "loss": 1.3441, + "step": 10941 + }, + { + "epoch": 1.8876908479254721, + "grad_norm": 0.546875, + "learning_rate": 1.560993844596992e-07, + "loss": 1.3908, + "step": 10942 + }, + { + "epoch": 1.887863365824204, + "grad_norm": 0.6015625, + "learning_rate": 1.5562069682546587e-07, + "loss": 1.4749, + "step": 10943 + }, + { + "epoch": 1.8880358837229363, + "grad_norm": 0.578125, + "learning_rate": 1.551427385243587e-07, + "loss": 1.3988, + "step": 10944 + }, + { + "epoch": 1.8882084016216683, + "grad_norm": 0.6015625, + "learning_rate": 1.546655095917937e-07, + "loss": 1.5236, + "step": 10945 + }, + { + "epoch": 1.8883809195204002, + "grad_norm": 0.60546875, + "learning_rate": 1.541890100631227e-07, + "loss": 1.4393, + "step": 10946 + }, + { + "epoch": 1.8885534374191324, + "grad_norm": 0.58203125, + "learning_rate": 1.5371323997364962e-07, + "loss": 1.4745, + "step": 10947 + }, + { + "epoch": 1.8887259553178644, + "grad_norm": 0.57421875, + "learning_rate": 1.5323819935862183e-07, + "loss": 1.491, + "step": 10948 + }, + { + "epoch": 1.8888984732165963, + "grad_norm": 0.72265625, + "learning_rate": 1.5276388825323452e-07, + "loss": 1.3892, + "step": 10949 + }, + { + "epoch": 1.8890709911153283, + "grad_norm": 0.5546875, + "learning_rate": 1.5229030669262622e-07, + "loss": 1.3417, + "step": 10950 + }, + { + "epoch": 1.8892435090140602, + "grad_norm": 0.5625, + "learning_rate": 1.5181745471188336e-07, + "loss": 1.4163, + "step": 10951 + }, + { + "epoch": 1.8894160269127922, + "grad_norm": 0.578125, + "learning_rate": 1.5134533234603786e-07, + "loss": 1.4501, + "step": 10952 + }, + { + "epoch": 1.8895885448115242, + "grad_norm": 0.56640625, + "learning_rate": 1.5087393963006736e-07, + "loss": 1.4131, + "step": 10953 + }, + { + "epoch": 1.8897610627102561, + "grad_norm": 0.61328125, + "learning_rate": 1.504032765988961e-07, + "loss": 1.4359, + "step": 10954 + }, + { + "epoch": 1.889933580608988, + "grad_norm": 0.5859375, + "learning_rate": 1.4993334328739174e-07, + "loss": 1.3766, + "step": 10955 + }, + { + "epoch": 1.89010609850772, + "grad_norm": 0.71484375, + "learning_rate": 1.494641397303731e-07, + "loss": 1.4124, + "step": 10956 + }, + { + "epoch": 1.890278616406452, + "grad_norm": 0.56640625, + "learning_rate": 1.4899566596259907e-07, + "loss": 1.4417, + "step": 10957 + }, + { + "epoch": 1.8904511343051842, + "grad_norm": 0.62890625, + "learning_rate": 1.4852792201877742e-07, + "loss": 1.3079, + "step": 10958 + }, + { + "epoch": 1.8906236522039161, + "grad_norm": 0.58203125, + "learning_rate": 1.4806090793356266e-07, + "loss": 1.5389, + "step": 10959 + }, + { + "epoch": 1.890796170102648, + "grad_norm": 0.56640625, + "learning_rate": 1.4759462374155376e-07, + "loss": 1.4332, + "step": 10960 + }, + { + "epoch": 1.8909686880013803, + "grad_norm": 0.6328125, + "learning_rate": 1.4712906947729643e-07, + "loss": 1.3728, + "step": 10961 + }, + { + "epoch": 1.8911412059001123, + "grad_norm": 0.62109375, + "learning_rate": 1.4666424517528088e-07, + "loss": 1.3431, + "step": 10962 + }, + { + "epoch": 1.8913137237988442, + "grad_norm": 0.546875, + "learning_rate": 1.4620015086994398e-07, + "loss": 1.3682, + "step": 10963 + }, + { + "epoch": 1.8914862416975762, + "grad_norm": 0.5625, + "learning_rate": 1.457367865956705e-07, + "loss": 1.4846, + "step": 10964 + }, + { + "epoch": 1.8916587595963081, + "grad_norm": 0.578125, + "learning_rate": 1.4527415238678622e-07, + "loss": 1.4709, + "step": 10965 + }, + { + "epoch": 1.89183127749504, + "grad_norm": 0.58984375, + "learning_rate": 1.4481224827756824e-07, + "loss": 1.3932, + "step": 10966 + }, + { + "epoch": 1.892003795393772, + "grad_norm": 0.8359375, + "learning_rate": 1.4435107430223806e-07, + "loss": 1.3974, + "step": 10967 + }, + { + "epoch": 1.892176313292504, + "grad_norm": 0.5703125, + "learning_rate": 1.4389063049496165e-07, + "loss": 1.4036, + "step": 10968 + }, + { + "epoch": 1.892348831191236, + "grad_norm": 0.6015625, + "learning_rate": 1.4343091688984956e-07, + "loss": 1.406, + "step": 10969 + }, + { + "epoch": 1.892521349089968, + "grad_norm": 0.61328125, + "learning_rate": 1.4297193352096228e-07, + "loss": 1.3486, + "step": 10970 + }, + { + "epoch": 1.8926938669887001, + "grad_norm": 0.68359375, + "learning_rate": 1.4251368042230485e-07, + "loss": 1.4483, + "step": 10971 + }, + { + "epoch": 1.892866384887432, + "grad_norm": 0.55859375, + "learning_rate": 1.4205615762782566e-07, + "loss": 1.4674, + "step": 10972 + }, + { + "epoch": 1.893038902786164, + "grad_norm": 0.56640625, + "learning_rate": 1.415993651714209e-07, + "loss": 1.3881, + "step": 10973 + }, + { + "epoch": 1.893211420684896, + "grad_norm": 0.578125, + "learning_rate": 1.4114330308693358e-07, + "loss": 1.3689, + "step": 10974 + }, + { + "epoch": 1.8933839385836282, + "grad_norm": 0.57421875, + "learning_rate": 1.4068797140815217e-07, + "loss": 1.4367, + "step": 10975 + }, + { + "epoch": 1.8935564564823602, + "grad_norm": 0.546875, + "learning_rate": 1.4023337016880856e-07, + "loss": 1.3465, + "step": 10976 + }, + { + "epoch": 1.8937289743810921, + "grad_norm": 0.56640625, + "learning_rate": 1.3977949940258474e-07, + "loss": 1.4499, + "step": 10977 + }, + { + "epoch": 1.893901492279824, + "grad_norm": 0.58984375, + "learning_rate": 1.3932635914310488e-07, + "loss": 1.4315, + "step": 10978 + }, + { + "epoch": 1.894074010178556, + "grad_norm": 0.59765625, + "learning_rate": 1.3887394942393994e-07, + "loss": 1.5568, + "step": 10979 + }, + { + "epoch": 1.894246528077288, + "grad_norm": 0.58984375, + "learning_rate": 1.3842227027860866e-07, + "loss": 1.4174, + "step": 10980 + }, + { + "epoch": 1.89441904597602, + "grad_norm": 0.5859375, + "learning_rate": 1.3797132174057315e-07, + "loss": 1.3474, + "step": 10981 + }, + { + "epoch": 1.894591563874752, + "grad_norm": 0.59375, + "learning_rate": 1.3752110384324336e-07, + "loss": 1.4749, + "step": 10982 + }, + { + "epoch": 1.8947640817734839, + "grad_norm": 0.75390625, + "learning_rate": 1.370716166199726e-07, + "loss": 1.4026, + "step": 10983 + }, + { + "epoch": 1.8949365996722158, + "grad_norm": 0.58203125, + "learning_rate": 1.3662286010406423e-07, + "loss": 1.3766, + "step": 10984 + }, + { + "epoch": 1.895109117570948, + "grad_norm": 0.66796875, + "learning_rate": 1.3617483432876278e-07, + "loss": 1.4039, + "step": 10985 + }, + { + "epoch": 1.89528163546968, + "grad_norm": 0.5859375, + "learning_rate": 1.357275393272628e-07, + "loss": 1.3893, + "step": 10986 + }, + { + "epoch": 1.895454153368412, + "grad_norm": 0.609375, + "learning_rate": 1.352809751327e-07, + "loss": 1.4591, + "step": 10987 + }, + { + "epoch": 1.8956266712671441, + "grad_norm": 0.55859375, + "learning_rate": 1.3483514177816127e-07, + "loss": 1.4423, + "step": 10988 + }, + { + "epoch": 1.895799189165876, + "grad_norm": 0.65234375, + "learning_rate": 1.343900392966757e-07, + "loss": 1.3756, + "step": 10989 + }, + { + "epoch": 1.895971707064608, + "grad_norm": 0.73828125, + "learning_rate": 1.339456677212192e-07, + "loss": 1.4249, + "step": 10990 + }, + { + "epoch": 1.89614422496334, + "grad_norm": 0.69921875, + "learning_rate": 1.3350202708471316e-07, + "loss": 1.4022, + "step": 10991 + }, + { + "epoch": 1.896316742862072, + "grad_norm": 0.65625, + "learning_rate": 1.3305911742002575e-07, + "loss": 1.3189, + "step": 10992 + }, + { + "epoch": 1.896489260760804, + "grad_norm": 0.56640625, + "learning_rate": 1.3261693875996962e-07, + "loss": 1.3691, + "step": 10993 + }, + { + "epoch": 1.896661778659536, + "grad_norm": 0.5703125, + "learning_rate": 1.3217549113730633e-07, + "loss": 1.4177, + "step": 10994 + }, + { + "epoch": 1.8968342965582679, + "grad_norm": 0.5703125, + "learning_rate": 1.317347745847386e-07, + "loss": 1.4069, + "step": 10995 + }, + { + "epoch": 1.8970068144569998, + "grad_norm": 0.58203125, + "learning_rate": 1.3129478913491923e-07, + "loss": 1.4167, + "step": 10996 + }, + { + "epoch": 1.8971793323557318, + "grad_norm": 0.5859375, + "learning_rate": 1.3085553482044434e-07, + "loss": 1.4617, + "step": 10997 + }, + { + "epoch": 1.8973518502544637, + "grad_norm": 0.6328125, + "learning_rate": 1.3041701167385567e-07, + "loss": 1.4375, + "step": 10998 + }, + { + "epoch": 1.897524368153196, + "grad_norm": 0.59375, + "learning_rate": 1.2997921972764395e-07, + "loss": 1.4244, + "step": 10999 + }, + { + "epoch": 1.8976968860519279, + "grad_norm": 0.62890625, + "learning_rate": 1.2954215901424204e-07, + "loss": 1.4471, + "step": 11000 + }, + { + "epoch": 1.8976968860519279, + "eval_loss": 1.4070650339126587, + "eval_runtime": 10.8027, + "eval_samples_per_second": 94.791, + "eval_steps_per_second": 23.698, + "step": 11000 + }, + { + "epoch": 1.8978694039506598, + "grad_norm": 0.59375, + "learning_rate": 1.291058295660308e-07, + "loss": 1.426, + "step": 11001 + }, + { + "epoch": 1.898041921849392, + "grad_norm": 0.6015625, + "learning_rate": 1.2867023141533542e-07, + "loss": 1.4994, + "step": 11002 + }, + { + "epoch": 1.898214439748124, + "grad_norm": 0.58203125, + "learning_rate": 1.2823536459442788e-07, + "loss": 1.4619, + "step": 11003 + }, + { + "epoch": 1.898386957646856, + "grad_norm": 0.62109375, + "learning_rate": 1.2780122913552684e-07, + "loss": 1.4617, + "step": 11004 + }, + { + "epoch": 1.898559475545588, + "grad_norm": 0.6875, + "learning_rate": 1.2736782507079436e-07, + "loss": 1.3247, + "step": 11005 + }, + { + "epoch": 1.8987319934443199, + "grad_norm": 0.59765625, + "learning_rate": 1.2693515243234146e-07, + "loss": 1.5786, + "step": 11006 + }, + { + "epoch": 1.8989045113430518, + "grad_norm": 0.59375, + "learning_rate": 1.2650321125222243e-07, + "loss": 1.4376, + "step": 11007 + }, + { + "epoch": 1.8990770292417838, + "grad_norm": 0.5546875, + "learning_rate": 1.2607200156243615e-07, + "loss": 1.4922, + "step": 11008 + }, + { + "epoch": 1.8992495471405157, + "grad_norm": 0.53125, + "learning_rate": 1.2564152339493263e-07, + "loss": 1.3807, + "step": 11009 + }, + { + "epoch": 1.8994220650392477, + "grad_norm": 2.203125, + "learning_rate": 1.2521177678160302e-07, + "loss": 1.3754, + "step": 11010 + }, + { + "epoch": 1.8995945829379797, + "grad_norm": 0.58203125, + "learning_rate": 1.2478276175428516e-07, + "loss": 1.451, + "step": 11011 + }, + { + "epoch": 1.8997671008367119, + "grad_norm": 0.6171875, + "learning_rate": 1.2435447834476254e-07, + "loss": 1.472, + "step": 11012 + }, + { + "epoch": 1.8999396187354438, + "grad_norm": 0.55859375, + "learning_rate": 1.2392692658476758e-07, + "loss": 1.4268, + "step": 11013 + }, + { + "epoch": 1.9001121366341758, + "grad_norm": 0.58203125, + "learning_rate": 1.2350010650597378e-07, + "loss": 1.4491, + "step": 11014 + }, + { + "epoch": 1.9002846545329077, + "grad_norm": 0.57421875, + "learning_rate": 1.2307401814000252e-07, + "loss": 1.4408, + "step": 11015 + }, + { + "epoch": 1.90045717243164, + "grad_norm": 0.58203125, + "learning_rate": 1.2264866151842304e-07, + "loss": 1.3987, + "step": 11016 + }, + { + "epoch": 1.9006296903303719, + "grad_norm": 0.56640625, + "learning_rate": 1.2222403667274673e-07, + "loss": 1.3893, + "step": 11017 + }, + { + "epoch": 1.9008022082291038, + "grad_norm": 0.5703125, + "learning_rate": 1.2180014363443404e-07, + "loss": 1.3289, + "step": 11018 + }, + { + "epoch": 1.9009747261278358, + "grad_norm": 0.5859375, + "learning_rate": 1.213769824348865e-07, + "loss": 1.3839, + "step": 11019 + }, + { + "epoch": 1.9011472440265678, + "grad_norm": 0.578125, + "learning_rate": 1.2095455310545788e-07, + "loss": 1.3243, + "step": 11020 + }, + { + "epoch": 1.9013197619252997, + "grad_norm": 0.5703125, + "learning_rate": 1.2053285567744323e-07, + "loss": 1.4218, + "step": 11021 + }, + { + "epoch": 1.9014922798240317, + "grad_norm": 0.5859375, + "learning_rate": 1.2011189018208414e-07, + "loss": 1.4655, + "step": 11022 + }, + { + "epoch": 1.9016647977227636, + "grad_norm": 1.453125, + "learning_rate": 1.1969165665056904e-07, + "loss": 1.4107, + "step": 11023 + }, + { + "epoch": 1.9018373156214956, + "grad_norm": 0.56640625, + "learning_rate": 1.1927215511402968e-07, + "loss": 1.4179, + "step": 11024 + }, + { + "epoch": 1.9020098335202276, + "grad_norm": 0.578125, + "learning_rate": 1.1885338560354787e-07, + "loss": 1.3512, + "step": 11025 + }, + { + "epoch": 1.9021823514189597, + "grad_norm": 0.62890625, + "learning_rate": 1.1843534815014767e-07, + "loss": 1.4255, + "step": 11026 + }, + { + "epoch": 1.9023548693176917, + "grad_norm": 0.57421875, + "learning_rate": 1.1801804278479877e-07, + "loss": 1.3959, + "step": 11027 + }, + { + "epoch": 1.9025273872164237, + "grad_norm": 0.57421875, + "learning_rate": 1.1760146953841978e-07, + "loss": 1.4338, + "step": 11028 + }, + { + "epoch": 1.9026999051151559, + "grad_norm": 0.59375, + "learning_rate": 1.1718562844187153e-07, + "loss": 1.4376, + "step": 11029 + }, + { + "epoch": 1.9028724230138878, + "grad_norm": 0.58203125, + "learning_rate": 1.1677051952596164e-07, + "loss": 1.4145, + "step": 11030 + }, + { + "epoch": 1.9030449409126198, + "grad_norm": 0.58203125, + "learning_rate": 1.1635614282144658e-07, + "loss": 1.3891, + "step": 11031 + }, + { + "epoch": 1.9032174588113517, + "grad_norm": 0.64453125, + "learning_rate": 1.1594249835902294e-07, + "loss": 1.4681, + "step": 11032 + }, + { + "epoch": 1.9033899767100837, + "grad_norm": 0.5703125, + "learning_rate": 1.155295861693384e-07, + "loss": 1.4878, + "step": 11033 + }, + { + "epoch": 1.9035624946088157, + "grad_norm": 0.58984375, + "learning_rate": 1.1511740628298296e-07, + "loss": 1.343, + "step": 11034 + }, + { + "epoch": 1.9037350125075476, + "grad_norm": 0.625, + "learning_rate": 1.147059587304944e-07, + "loss": 1.3924, + "step": 11035 + }, + { + "epoch": 1.9039075304062796, + "grad_norm": 0.56640625, + "learning_rate": 1.1429524354235277e-07, + "loss": 1.4782, + "step": 11036 + }, + { + "epoch": 1.9040800483050115, + "grad_norm": 0.61328125, + "learning_rate": 1.1388526074898931e-07, + "loss": 1.4173, + "step": 11037 + }, + { + "epoch": 1.9042525662037435, + "grad_norm": 0.57421875, + "learning_rate": 1.134760103807775e-07, + "loss": 1.3841, + "step": 11038 + }, + { + "epoch": 1.9044250841024757, + "grad_norm": 0.6015625, + "learning_rate": 1.130674924680364e-07, + "loss": 1.4633, + "step": 11039 + }, + { + "epoch": 1.9045976020012076, + "grad_norm": 0.56640625, + "learning_rate": 1.1265970704103069e-07, + "loss": 1.3245, + "step": 11040 + }, + { + "epoch": 1.9047701198999396, + "grad_norm": 0.58203125, + "learning_rate": 1.122526541299751e-07, + "loss": 1.4491, + "step": 11041 + }, + { + "epoch": 1.9049426377986716, + "grad_norm": 0.60546875, + "learning_rate": 1.1184633376502218e-07, + "loss": 1.4918, + "step": 11042 + }, + { + "epoch": 1.9051151556974038, + "grad_norm": 0.625, + "learning_rate": 1.1144074597627785e-07, + "loss": 1.4963, + "step": 11043 + }, + { + "epoch": 1.9052876735961357, + "grad_norm": 0.6328125, + "learning_rate": 1.1103589079378918e-07, + "loss": 1.3569, + "step": 11044 + }, + { + "epoch": 1.9054601914948677, + "grad_norm": 0.58203125, + "learning_rate": 1.1063176824755107e-07, + "loss": 1.4771, + "step": 11045 + }, + { + "epoch": 1.9056327093935996, + "grad_norm": 0.61328125, + "learning_rate": 1.1022837836750399e-07, + "loss": 1.3965, + "step": 11046 + }, + { + "epoch": 1.9058052272923316, + "grad_norm": 0.61328125, + "learning_rate": 1.0982572118353186e-07, + "loss": 1.5245, + "step": 11047 + }, + { + "epoch": 1.9059777451910636, + "grad_norm": 0.58203125, + "learning_rate": 1.0942379672546743e-07, + "loss": 1.3126, + "step": 11048 + }, + { + "epoch": 1.9061502630897955, + "grad_norm": 0.6484375, + "learning_rate": 1.0902260502308692e-07, + "loss": 1.4799, + "step": 11049 + }, + { + "epoch": 1.9063227809885275, + "grad_norm": 0.60546875, + "learning_rate": 1.0862214610611432e-07, + "loss": 1.3847, + "step": 11050 + }, + { + "epoch": 1.9064952988872594, + "grad_norm": 0.6171875, + "learning_rate": 1.082224200042159e-07, + "loss": 1.5114, + "step": 11051 + }, + { + "epoch": 1.9066678167859914, + "grad_norm": 0.6875, + "learning_rate": 1.0782342674700907e-07, + "loss": 1.3262, + "step": 11052 + }, + { + "epoch": 1.9068403346847236, + "grad_norm": 4.6875, + "learning_rate": 1.074251663640502e-07, + "loss": 1.3784, + "step": 11053 + }, + { + "epoch": 1.9070128525834555, + "grad_norm": 0.63671875, + "learning_rate": 1.0702763888484791e-07, + "loss": 1.4332, + "step": 11054 + }, + { + "epoch": 1.9071853704821875, + "grad_norm": 0.83203125, + "learning_rate": 1.0663084433885196e-07, + "loss": 1.3191, + "step": 11055 + }, + { + "epoch": 1.9073578883809197, + "grad_norm": 0.59765625, + "learning_rate": 1.0623478275545884e-07, + "loss": 1.4464, + "step": 11056 + }, + { + "epoch": 1.9075304062796516, + "grad_norm": 0.6484375, + "learning_rate": 1.0583945416401286e-07, + "loss": 1.4451, + "step": 11057 + }, + { + "epoch": 1.9077029241783836, + "grad_norm": 0.62890625, + "learning_rate": 1.0544485859380172e-07, + "loss": 1.4179, + "step": 11058 + }, + { + "epoch": 1.9078754420771156, + "grad_norm": 0.9375, + "learning_rate": 1.0505099607405866e-07, + "loss": 1.4735, + "step": 11059 + }, + { + "epoch": 1.9080479599758475, + "grad_norm": 0.5390625, + "learning_rate": 1.0465786663396593e-07, + "loss": 1.4257, + "step": 11060 + }, + { + "epoch": 1.9082204778745795, + "grad_norm": 0.58203125, + "learning_rate": 1.0426547030264577e-07, + "loss": 1.4477, + "step": 11061 + }, + { + "epoch": 1.9083929957733115, + "grad_norm": 0.55859375, + "learning_rate": 1.0387380710917161e-07, + "loss": 1.4187, + "step": 11062 + }, + { + "epoch": 1.9085655136720434, + "grad_norm": 0.578125, + "learning_rate": 1.0348287708256021e-07, + "loss": 1.4474, + "step": 11063 + }, + { + "epoch": 1.9087380315707754, + "grad_norm": 0.58203125, + "learning_rate": 1.0309268025177288e-07, + "loss": 1.4031, + "step": 11064 + }, + { + "epoch": 1.9089105494695073, + "grad_norm": 0.66796875, + "learning_rate": 1.0270321664571981e-07, + "loss": 1.4469, + "step": 11065 + }, + { + "epoch": 1.9090830673682393, + "grad_norm": 0.55859375, + "learning_rate": 1.0231448629325236e-07, + "loss": 1.2811, + "step": 11066 + }, + { + "epoch": 1.9092555852669715, + "grad_norm": 0.61328125, + "learning_rate": 1.0192648922317084e-07, + "loss": 1.4864, + "step": 11067 + }, + { + "epoch": 1.9094281031657034, + "grad_norm": 0.6015625, + "learning_rate": 1.0153922546422223e-07, + "loss": 1.4214, + "step": 11068 + }, + { + "epoch": 1.9096006210644354, + "grad_norm": 0.6171875, + "learning_rate": 1.0115269504509583e-07, + "loss": 1.4991, + "step": 11069 + }, + { + "epoch": 1.9097731389631676, + "grad_norm": 0.6015625, + "learning_rate": 1.0076689799442874e-07, + "loss": 1.3097, + "step": 11070 + }, + { + "epoch": 1.9099456568618995, + "grad_norm": 0.6328125, + "learning_rate": 1.0038183434080363e-07, + "loss": 1.3634, + "step": 11071 + }, + { + "epoch": 1.9101181747606315, + "grad_norm": 0.5859375, + "learning_rate": 9.999750411274655e-08, + "loss": 1.3467, + "step": 11072 + }, + { + "epoch": 1.9102906926593635, + "grad_norm": 0.57421875, + "learning_rate": 9.961390733873366e-08, + "loss": 1.456, + "step": 11073 + }, + { + "epoch": 1.9104632105580954, + "grad_norm": 0.578125, + "learning_rate": 9.92310440471822e-08, + "loss": 1.3618, + "step": 11074 + }, + { + "epoch": 1.9106357284568274, + "grad_norm": 0.61328125, + "learning_rate": 9.884891426645837e-08, + "loss": 1.4833, + "step": 11075 + }, + { + "epoch": 1.9108082463555593, + "grad_norm": 0.65625, + "learning_rate": 9.846751802487175e-08, + "loss": 1.4058, + "step": 11076 + }, + { + "epoch": 1.9109807642542913, + "grad_norm": 0.5546875, + "learning_rate": 9.808685535067863e-08, + "loss": 1.5136, + "step": 11077 + }, + { + "epoch": 1.9111532821530233, + "grad_norm": 0.58203125, + "learning_rate": 9.7706926272082e-08, + "loss": 1.4108, + "step": 11078 + }, + { + "epoch": 1.9113258000517552, + "grad_norm": 0.59375, + "learning_rate": 9.732773081722824e-08, + "loss": 1.4449, + "step": 11079 + }, + { + "epoch": 1.9114983179504874, + "grad_norm": 0.60546875, + "learning_rate": 9.694926901421153e-08, + "loss": 1.4676, + "step": 11080 + }, + { + "epoch": 1.9116708358492194, + "grad_norm": 0.60546875, + "learning_rate": 9.657154089106946e-08, + "loss": 1.4309, + "step": 11081 + }, + { + "epoch": 1.9118433537479513, + "grad_norm": 0.546875, + "learning_rate": 9.619454647578852e-08, + "loss": 1.3618, + "step": 11082 + }, + { + "epoch": 1.9120158716466833, + "grad_norm": 0.5390625, + "learning_rate": 9.581828579629526e-08, + "loss": 1.3586, + "step": 11083 + }, + { + "epoch": 1.9121883895454155, + "grad_norm": 0.5625, + "learning_rate": 9.544275888046961e-08, + "loss": 1.3806, + "step": 11084 + }, + { + "epoch": 1.9123609074441474, + "grad_norm": 0.6015625, + "learning_rate": 9.506796575612931e-08, + "loss": 1.3433, + "step": 11085 + }, + { + "epoch": 1.9125334253428794, + "grad_norm": 0.59375, + "learning_rate": 9.469390645104437e-08, + "loss": 1.4082, + "step": 11086 + }, + { + "epoch": 1.9127059432416114, + "grad_norm": 0.6484375, + "learning_rate": 9.432058099292484e-08, + "loss": 1.3976, + "step": 11087 + }, + { + "epoch": 1.9128784611403433, + "grad_norm": 0.5546875, + "learning_rate": 9.394798940943083e-08, + "loss": 1.353, + "step": 11088 + }, + { + "epoch": 1.9130509790390753, + "grad_norm": 0.57421875, + "learning_rate": 9.357613172816471e-08, + "loss": 1.377, + "step": 11089 + }, + { + "epoch": 1.9132234969378072, + "grad_norm": 0.59765625, + "learning_rate": 9.320500797667886e-08, + "loss": 1.4407, + "step": 11090 + }, + { + "epoch": 1.9133960148365392, + "grad_norm": 0.57421875, + "learning_rate": 9.283461818246464e-08, + "loss": 1.3338, + "step": 11091 + }, + { + "epoch": 1.9135685327352712, + "grad_norm": 0.6015625, + "learning_rate": 9.246496237296565e-08, + "loss": 1.5407, + "step": 11092 + }, + { + "epoch": 1.9137410506340031, + "grad_norm": 0.62890625, + "learning_rate": 9.209604057556665e-08, + "loss": 1.448, + "step": 11093 + }, + { + "epoch": 1.9139135685327353, + "grad_norm": 0.5546875, + "learning_rate": 9.172785281760132e-08, + "loss": 1.381, + "step": 11094 + }, + { + "epoch": 1.9140860864314673, + "grad_norm": 0.640625, + "learning_rate": 9.136039912634675e-08, + "loss": 1.4523, + "step": 11095 + }, + { + "epoch": 1.9142586043301992, + "grad_norm": 0.5703125, + "learning_rate": 9.099367952902449e-08, + "loss": 1.4942, + "step": 11096 + }, + { + "epoch": 1.9144311222289314, + "grad_norm": 0.60546875, + "learning_rate": 9.062769405280614e-08, + "loss": 1.369, + "step": 11097 + }, + { + "epoch": 1.9146036401276634, + "grad_norm": 0.609375, + "learning_rate": 9.026244272480445e-08, + "loss": 1.3735, + "step": 11098 + }, + { + "epoch": 1.9147761580263953, + "grad_norm": 0.59765625, + "learning_rate": 8.989792557207889e-08, + "loss": 1.4239, + "step": 11099 + }, + { + "epoch": 1.9149486759251273, + "grad_norm": 0.62109375, + "learning_rate": 8.953414262163674e-08, + "loss": 1.3988, + "step": 11100 + }, + { + "epoch": 1.9149486759251273, + "eval_loss": 1.4070982933044434, + "eval_runtime": 10.8142, + "eval_samples_per_second": 94.69, + "eval_steps_per_second": 23.673, + "step": 11100 + }, + { + "epoch": 1.9151211938238593, + "grad_norm": 0.5703125, + "learning_rate": 8.917109390042866e-08, + "loss": 1.4235, + "step": 11101 + }, + { + "epoch": 1.9152937117225912, + "grad_norm": 0.53125, + "learning_rate": 8.880877943535204e-08, + "loss": 1.3556, + "step": 11102 + }, + { + "epoch": 1.9154662296213232, + "grad_norm": 0.59765625, + "learning_rate": 8.844719925324985e-08, + "loss": 1.46, + "step": 11103 + }, + { + "epoch": 1.9156387475200551, + "grad_norm": 0.58984375, + "learning_rate": 8.808635338090732e-08, + "loss": 1.4678, + "step": 11104 + }, + { + "epoch": 1.915811265418787, + "grad_norm": 0.578125, + "learning_rate": 8.772624184506196e-08, + "loss": 1.4166, + "step": 11105 + }, + { + "epoch": 1.915983783317519, + "grad_norm": 0.6328125, + "learning_rate": 8.736686467239131e-08, + "loss": 1.435, + "step": 11106 + }, + { + "epoch": 1.916156301216251, + "grad_norm": 0.5703125, + "learning_rate": 8.700822188951963e-08, + "loss": 1.5098, + "step": 11107 + }, + { + "epoch": 1.9163288191149832, + "grad_norm": 0.66796875, + "learning_rate": 8.665031352301789e-08, + "loss": 1.3868, + "step": 11108 + }, + { + "epoch": 1.9165013370137152, + "grad_norm": 0.5703125, + "learning_rate": 8.629313959940266e-08, + "loss": 1.3686, + "step": 11109 + }, + { + "epoch": 1.9166738549124471, + "grad_norm": 0.57421875, + "learning_rate": 8.5936700145135e-08, + "loss": 1.4824, + "step": 11110 + }, + { + "epoch": 1.9168463728111793, + "grad_norm": 0.8203125, + "learning_rate": 8.558099518662378e-08, + "loss": 1.4923, + "step": 11111 + }, + { + "epoch": 1.9170188907099113, + "grad_norm": 0.5625, + "learning_rate": 8.522602475021902e-08, + "loss": 1.3338, + "step": 11112 + }, + { + "epoch": 1.9171914086086432, + "grad_norm": 0.59765625, + "learning_rate": 8.487178886222192e-08, + "loss": 1.3928, + "step": 11113 + }, + { + "epoch": 1.9173639265073752, + "grad_norm": 0.5625, + "learning_rate": 8.451828754887481e-08, + "loss": 1.4036, + "step": 11114 + }, + { + "epoch": 1.9175364444061072, + "grad_norm": 0.6171875, + "learning_rate": 8.416552083636676e-08, + "loss": 1.4697, + "step": 11115 + }, + { + "epoch": 1.9177089623048391, + "grad_norm": 0.5859375, + "learning_rate": 8.381348875083573e-08, + "loss": 1.3408, + "step": 11116 + }, + { + "epoch": 1.917881480203571, + "grad_norm": 0.57421875, + "learning_rate": 8.346219131835976e-08, + "loss": 1.5091, + "step": 11117 + }, + { + "epoch": 1.918053998102303, + "grad_norm": 0.6171875, + "learning_rate": 8.31116285649658e-08, + "loss": 1.3755, + "step": 11118 + }, + { + "epoch": 1.918226516001035, + "grad_norm": 0.5859375, + "learning_rate": 8.276180051662641e-08, + "loss": 1.4617, + "step": 11119 + }, + { + "epoch": 1.918399033899767, + "grad_norm": 0.57421875, + "learning_rate": 8.241270719925865e-08, + "loss": 1.414, + "step": 11120 + }, + { + "epoch": 1.9185715517984991, + "grad_norm": 0.5703125, + "learning_rate": 8.206434863872514e-08, + "loss": 1.4393, + "step": 11121 + }, + { + "epoch": 1.918744069697231, + "grad_norm": 0.5703125, + "learning_rate": 8.171672486083526e-08, + "loss": 1.3922, + "step": 11122 + }, + { + "epoch": 1.918916587595963, + "grad_norm": 0.5703125, + "learning_rate": 8.136983589134173e-08, + "loss": 1.4418, + "step": 11123 + }, + { + "epoch": 1.919089105494695, + "grad_norm": 0.5703125, + "learning_rate": 8.102368175594733e-08, + "loss": 1.4394, + "step": 11124 + }, + { + "epoch": 1.9192616233934272, + "grad_norm": 0.56640625, + "learning_rate": 8.067826248029264e-08, + "loss": 1.3664, + "step": 11125 + }, + { + "epoch": 1.9194341412921592, + "grad_norm": 0.5625, + "learning_rate": 8.033357808997278e-08, + "loss": 1.4185, + "step": 11126 + }, + { + "epoch": 1.9196066591908911, + "grad_norm": 0.61328125, + "learning_rate": 7.998962861052173e-08, + "loss": 1.3817, + "step": 11127 + }, + { + "epoch": 1.919779177089623, + "grad_norm": 0.58203125, + "learning_rate": 7.964641406742135e-08, + "loss": 1.3643, + "step": 11128 + }, + { + "epoch": 1.919951694988355, + "grad_norm": 0.64453125, + "learning_rate": 7.93039344861013e-08, + "loss": 1.4467, + "step": 11129 + }, + { + "epoch": 1.920124212887087, + "grad_norm": 0.59765625, + "learning_rate": 7.896218989193239e-08, + "loss": 1.4712, + "step": 11130 + }, + { + "epoch": 1.920296730785819, + "grad_norm": 0.57421875, + "learning_rate": 7.862118031023436e-08, + "loss": 1.4224, + "step": 11131 + }, + { + "epoch": 1.920469248684551, + "grad_norm": 0.5859375, + "learning_rate": 7.828090576627034e-08, + "loss": 1.4686, + "step": 11132 + }, + { + "epoch": 1.920641766583283, + "grad_norm": 0.55859375, + "learning_rate": 7.794136628525129e-08, + "loss": 1.4157, + "step": 11133 + }, + { + "epoch": 1.9208142844820149, + "grad_norm": 0.6171875, + "learning_rate": 7.760256189233151e-08, + "loss": 1.4661, + "step": 11134 + }, + { + "epoch": 1.920986802380747, + "grad_norm": 0.6015625, + "learning_rate": 7.726449261261205e-08, + "loss": 1.4879, + "step": 11135 + }, + { + "epoch": 1.921159320279479, + "grad_norm": 0.58984375, + "learning_rate": 7.69271584711384e-08, + "loss": 1.424, + "step": 11136 + }, + { + "epoch": 1.921331838178211, + "grad_norm": 0.6328125, + "learning_rate": 7.659055949290395e-08, + "loss": 1.4365, + "step": 11137 + }, + { + "epoch": 1.9215043560769431, + "grad_norm": 0.76171875, + "learning_rate": 7.625469570284427e-08, + "loss": 1.4613, + "step": 11138 + }, + { + "epoch": 1.921676873975675, + "grad_norm": 0.60546875, + "learning_rate": 7.591956712584392e-08, + "loss": 1.5346, + "step": 11139 + }, + { + "epoch": 1.921849391874407, + "grad_norm": 1.28125, + "learning_rate": 7.55851737867297e-08, + "loss": 1.4781, + "step": 11140 + }, + { + "epoch": 1.922021909773139, + "grad_norm": 0.57421875, + "learning_rate": 7.525151571027734e-08, + "loss": 1.4212, + "step": 11141 + }, + { + "epoch": 1.922194427671871, + "grad_norm": 0.56640625, + "learning_rate": 7.491859292120484e-08, + "loss": 1.4737, + "step": 11142 + }, + { + "epoch": 1.922366945570603, + "grad_norm": 0.578125, + "learning_rate": 7.458640544417806e-08, + "loss": 1.4486, + "step": 11143 + }, + { + "epoch": 1.922539463469335, + "grad_norm": 0.56640625, + "learning_rate": 7.425495330380617e-08, + "loss": 1.4661, + "step": 11144 + }, + { + "epoch": 1.9227119813680669, + "grad_norm": 0.60546875, + "learning_rate": 7.392423652464731e-08, + "loss": 1.4051, + "step": 11145 + }, + { + "epoch": 1.9228844992667988, + "grad_norm": 0.6015625, + "learning_rate": 7.3594255131203e-08, + "loss": 1.4672, + "step": 11146 + }, + { + "epoch": 1.9230570171655308, + "grad_norm": 0.625, + "learning_rate": 7.326500914791701e-08, + "loss": 1.3742, + "step": 11147 + }, + { + "epoch": 1.9232295350642628, + "grad_norm": 0.66015625, + "learning_rate": 7.29364985991865e-08, + "loss": 1.3044, + "step": 11148 + }, + { + "epoch": 1.923402052962995, + "grad_norm": 0.59765625, + "learning_rate": 7.260872350934533e-08, + "loss": 1.3507, + "step": 11149 + }, + { + "epoch": 1.923574570861727, + "grad_norm": 0.59765625, + "learning_rate": 7.228168390268075e-08, + "loss": 1.3799, + "step": 11150 + }, + { + "epoch": 1.9237470887604589, + "grad_norm": 0.58203125, + "learning_rate": 7.195537980341894e-08, + "loss": 1.4878, + "step": 11151 + }, + { + "epoch": 1.923919606659191, + "grad_norm": 0.5703125, + "learning_rate": 7.162981123573609e-08, + "loss": 1.4538, + "step": 11152 + }, + { + "epoch": 1.924092124557923, + "grad_norm": 0.59375, + "learning_rate": 7.130497822375293e-08, + "loss": 1.5007, + "step": 11153 + }, + { + "epoch": 1.924264642456655, + "grad_norm": 0.63671875, + "learning_rate": 7.098088079153353e-08, + "loss": 1.5421, + "step": 11154 + }, + { + "epoch": 1.924437160355387, + "grad_norm": 1.125, + "learning_rate": 7.065751896309092e-08, + "loss": 1.3665, + "step": 11155 + }, + { + "epoch": 1.9246096782541189, + "grad_norm": 0.625, + "learning_rate": 7.033489276238037e-08, + "loss": 1.4511, + "step": 11156 + }, + { + "epoch": 1.9247821961528508, + "grad_norm": 0.609375, + "learning_rate": 7.001300221330387e-08, + "loss": 1.398, + "step": 11157 + }, + { + "epoch": 1.9249547140515828, + "grad_norm": 0.5859375, + "learning_rate": 6.969184733970902e-08, + "loss": 1.3415, + "step": 11158 + }, + { + "epoch": 1.9251272319503148, + "grad_norm": 0.63671875, + "learning_rate": 6.937142816539121e-08, + "loss": 1.3978, + "step": 11159 + }, + { + "epoch": 1.9252997498490467, + "grad_norm": 0.56640625, + "learning_rate": 6.905174471408594e-08, + "loss": 1.4171, + "step": 11160 + }, + { + "epoch": 1.9254722677477787, + "grad_norm": 0.58203125, + "learning_rate": 6.873279700947977e-08, + "loss": 1.4101, + "step": 11161 + }, + { + "epoch": 1.9256447856465109, + "grad_norm": 0.578125, + "learning_rate": 6.84145850752016e-08, + "loss": 1.4081, + "step": 11162 + }, + { + "epoch": 1.9258173035452428, + "grad_norm": 0.5625, + "learning_rate": 6.809710893482591e-08, + "loss": 1.367, + "step": 11163 + }, + { + "epoch": 1.9259898214439748, + "grad_norm": 0.578125, + "learning_rate": 6.778036861187277e-08, + "loss": 1.4203, + "step": 11164 + }, + { + "epoch": 1.9261623393427068, + "grad_norm": 0.6328125, + "learning_rate": 6.746436412981117e-08, + "loss": 1.5098, + "step": 11165 + }, + { + "epoch": 1.926334857241439, + "grad_norm": 0.5859375, + "learning_rate": 6.714909551204907e-08, + "loss": 1.3379, + "step": 11166 + }, + { + "epoch": 1.926507375140171, + "grad_norm": 0.578125, + "learning_rate": 6.683456278194666e-08, + "loss": 1.4345, + "step": 11167 + }, + { + "epoch": 1.9266798930389029, + "grad_norm": 0.6171875, + "learning_rate": 6.652076596280422e-08, + "loss": 1.4645, + "step": 11168 + }, + { + "epoch": 1.9268524109376348, + "grad_norm": 0.5859375, + "learning_rate": 6.6207705077872e-08, + "loss": 1.4089, + "step": 11169 + }, + { + "epoch": 1.9270249288363668, + "grad_norm": 0.5859375, + "learning_rate": 6.589538015034148e-08, + "loss": 1.3834, + "step": 11170 + }, + { + "epoch": 1.9271974467350987, + "grad_norm": 0.6015625, + "learning_rate": 6.55837912033519e-08, + "loss": 1.4337, + "step": 11171 + }, + { + "epoch": 1.9273699646338307, + "grad_norm": 0.59765625, + "learning_rate": 6.527293825998815e-08, + "loss": 1.4587, + "step": 11172 + }, + { + "epoch": 1.9275424825325627, + "grad_norm": 0.60546875, + "learning_rate": 6.496282134328069e-08, + "loss": 1.4673, + "step": 11173 + }, + { + "epoch": 1.9277150004312946, + "grad_norm": 0.61328125, + "learning_rate": 6.465344047620336e-08, + "loss": 1.416, + "step": 11174 + }, + { + "epoch": 1.9278875183300266, + "grad_norm": 0.61328125, + "learning_rate": 6.434479568167896e-08, + "loss": 1.4711, + "step": 11175 + }, + { + "epoch": 1.9280600362287588, + "grad_norm": 0.5859375, + "learning_rate": 6.40368869825736e-08, + "loss": 1.4945, + "step": 11176 + }, + { + "epoch": 1.9282325541274907, + "grad_norm": 0.58984375, + "learning_rate": 6.372971440169684e-08, + "loss": 1.3634, + "step": 11177 + }, + { + "epoch": 1.9284050720262227, + "grad_norm": 0.58203125, + "learning_rate": 6.342327796180936e-08, + "loss": 1.4011, + "step": 11178 + }, + { + "epoch": 1.9285775899249549, + "grad_norm": 0.56640625, + "learning_rate": 6.311757768560967e-08, + "loss": 1.4638, + "step": 11179 + }, + { + "epoch": 1.9287501078236868, + "grad_norm": 0.57421875, + "learning_rate": 6.281261359575074e-08, + "loss": 1.4616, + "step": 11180 + }, + { + "epoch": 1.9289226257224188, + "grad_norm": 0.59765625, + "learning_rate": 6.250838571482231e-08, + "loss": 1.5161, + "step": 11181 + }, + { + "epoch": 1.9290951436211508, + "grad_norm": 0.5390625, + "learning_rate": 6.220489406536523e-08, + "loss": 1.3275, + "step": 11182 + }, + { + "epoch": 1.9292676615198827, + "grad_norm": 0.5703125, + "learning_rate": 6.190213866986483e-08, + "loss": 1.4386, + "step": 11183 + }, + { + "epoch": 1.9294401794186147, + "grad_norm": 0.6171875, + "learning_rate": 6.160011955074874e-08, + "loss": 1.4496, + "step": 11184 + }, + { + "epoch": 1.9296126973173466, + "grad_norm": 0.5859375, + "learning_rate": 6.12988367303946e-08, + "loss": 1.3581, + "step": 11185 + }, + { + "epoch": 1.9297852152160786, + "grad_norm": 0.5703125, + "learning_rate": 6.099829023112236e-08, + "loss": 1.4737, + "step": 11186 + }, + { + "epoch": 1.9299577331148106, + "grad_norm": 0.5625, + "learning_rate": 6.069848007519863e-08, + "loss": 1.3849, + "step": 11187 + }, + { + "epoch": 1.9301302510135425, + "grad_norm": 0.58203125, + "learning_rate": 6.039940628483454e-08, + "loss": 1.4742, + "step": 11188 + }, + { + "epoch": 1.9303027689122747, + "grad_norm": 0.6875, + "learning_rate": 6.010106888218792e-08, + "loss": 1.4461, + "step": 11189 + }, + { + "epoch": 1.9304752868110067, + "grad_norm": 0.62890625, + "learning_rate": 5.98034678893622e-08, + "loss": 1.4375, + "step": 11190 + }, + { + "epoch": 1.9306478047097386, + "grad_norm": 0.5625, + "learning_rate": 5.950660332840419e-08, + "loss": 1.3679, + "step": 11191 + }, + { + "epoch": 1.9308203226084706, + "grad_norm": 0.81640625, + "learning_rate": 5.9210475221308515e-08, + "loss": 1.4385, + "step": 11192 + }, + { + "epoch": 1.9309928405072028, + "grad_norm": 0.609375, + "learning_rate": 5.8915083590013186e-08, + "loss": 1.4601, + "step": 11193 + }, + { + "epoch": 1.9311653584059347, + "grad_norm": 0.5703125, + "learning_rate": 5.862042845640403e-08, + "loss": 1.3369, + "step": 11194 + }, + { + "epoch": 1.9313378763046667, + "grad_norm": 0.55859375, + "learning_rate": 5.832650984231025e-08, + "loss": 1.4013, + "step": 11195 + }, + { + "epoch": 1.9315103942033987, + "grad_norm": 0.56640625, + "learning_rate": 5.8033327769505546e-08, + "loss": 1.4503, + "step": 11196 + }, + { + "epoch": 1.9316829121021306, + "grad_norm": 0.546875, + "learning_rate": 5.774088225971364e-08, + "loss": 1.264, + "step": 11197 + }, + { + "epoch": 1.9318554300008626, + "grad_norm": 0.83984375, + "learning_rate": 5.7449173334598316e-08, + "loss": 1.4681, + "step": 11198 + }, + { + "epoch": 1.9320279478995945, + "grad_norm": 0.609375, + "learning_rate": 5.7158201015773404e-08, + "loss": 1.5123, + "step": 11199 + }, + { + "epoch": 1.9322004657983265, + "grad_norm": 0.578125, + "learning_rate": 5.6867965324793886e-08, + "loss": 1.3687, + "step": 11200 + }, + { + "epoch": 1.9322004657983265, + "eval_loss": 1.4070696830749512, + "eval_runtime": 10.8729, + "eval_samples_per_second": 94.179, + "eval_steps_per_second": 23.545, + "step": 11200 + }, + { + "epoch": 1.9323729836970585, + "grad_norm": 0.65234375, + "learning_rate": 5.657846628316366e-08, + "loss": 1.3929, + "step": 11201 + }, + { + "epoch": 1.9325455015957904, + "grad_norm": 0.58203125, + "learning_rate": 5.628970391232891e-08, + "loss": 1.5419, + "step": 11202 + }, + { + "epoch": 1.9327180194945226, + "grad_norm": 0.55859375, + "learning_rate": 5.600167823368474e-08, + "loss": 1.3657, + "step": 11203 + }, + { + "epoch": 1.9328905373932546, + "grad_norm": 0.609375, + "learning_rate": 5.571438926856964e-08, + "loss": 1.4119, + "step": 11204 + }, + { + "epoch": 1.9330630552919865, + "grad_norm": 0.6484375, + "learning_rate": 5.5427837038266595e-08, + "loss": 1.409, + "step": 11205 + }, + { + "epoch": 1.9332355731907187, + "grad_norm": 0.6640625, + "learning_rate": 5.5142021564006386e-08, + "loss": 1.6224, + "step": 11206 + }, + { + "epoch": 1.9334080910894507, + "grad_norm": 0.6015625, + "learning_rate": 5.485694286696319e-08, + "loss": 1.4667, + "step": 11207 + }, + { + "epoch": 1.9335806089881826, + "grad_norm": 0.58203125, + "learning_rate": 5.4572600968257894e-08, + "loss": 1.3575, + "step": 11208 + }, + { + "epoch": 1.9337531268869146, + "grad_norm": 0.609375, + "learning_rate": 5.428899588895586e-08, + "loss": 1.4499, + "step": 11209 + }, + { + "epoch": 1.9339256447856465, + "grad_norm": 0.59375, + "learning_rate": 5.4006127650069185e-08, + "loss": 1.3383, + "step": 11210 + }, + { + "epoch": 1.9340981626843785, + "grad_norm": 0.58984375, + "learning_rate": 5.372399627255442e-08, + "loss": 1.5185, + "step": 11211 + }, + { + "epoch": 1.9342706805831105, + "grad_norm": 0.6015625, + "learning_rate": 5.344260177731264e-08, + "loss": 1.4495, + "step": 11212 + }, + { + "epoch": 1.9344431984818424, + "grad_norm": 0.58203125, + "learning_rate": 5.31619441851916e-08, + "loss": 1.4181, + "step": 11213 + }, + { + "epoch": 1.9346157163805744, + "grad_norm": 0.5859375, + "learning_rate": 5.288202351698468e-08, + "loss": 1.4742, + "step": 11214 + }, + { + "epoch": 1.9347882342793064, + "grad_norm": 0.59765625, + "learning_rate": 5.260283979343084e-08, + "loss": 1.4408, + "step": 11215 + }, + { + "epoch": 1.9349607521780383, + "grad_norm": 0.5625, + "learning_rate": 5.232439303521131e-08, + "loss": 1.5229, + "step": 11216 + }, + { + "epoch": 1.9351332700767705, + "grad_norm": 0.64453125, + "learning_rate": 5.2046683262957366e-08, + "loss": 1.441, + "step": 11217 + }, + { + "epoch": 1.9353057879755025, + "grad_norm": 0.59765625, + "learning_rate": 5.1769710497243664e-08, + "loss": 1.4484, + "step": 11218 + }, + { + "epoch": 1.9354783058742344, + "grad_norm": 0.61328125, + "learning_rate": 5.149347475858824e-08, + "loss": 1.4355, + "step": 11219 + }, + { + "epoch": 1.9356508237729666, + "grad_norm": 0.5859375, + "learning_rate": 5.121797606745804e-08, + "loss": 1.3858, + "step": 11220 + }, + { + "epoch": 1.9358233416716986, + "grad_norm": 0.59375, + "learning_rate": 5.094321444426231e-08, + "loss": 1.4759, + "step": 11221 + }, + { + "epoch": 1.9359958595704305, + "grad_norm": 0.58984375, + "learning_rate": 5.0669189909358094e-08, + "loss": 1.4333, + "step": 11222 + }, + { + "epoch": 1.9361683774691625, + "grad_norm": 0.6171875, + "learning_rate": 5.0395902483046934e-08, + "loss": 1.4954, + "step": 11223 + }, + { + "epoch": 1.9363408953678944, + "grad_norm": 0.60546875, + "learning_rate": 5.012335218557374e-08, + "loss": 1.3914, + "step": 11224 + }, + { + "epoch": 1.9365134132666264, + "grad_norm": 0.6171875, + "learning_rate": 4.985153903713458e-08, + "loss": 1.3743, + "step": 11225 + }, + { + "epoch": 1.9366859311653584, + "grad_norm": 0.578125, + "learning_rate": 4.9580463057863345e-08, + "loss": 1.5157, + "step": 11226 + }, + { + "epoch": 1.9368584490640903, + "grad_norm": 0.65234375, + "learning_rate": 4.9310124267845095e-08, + "loss": 1.4125, + "step": 11227 + }, + { + "epoch": 1.9370309669628223, + "grad_norm": 0.55078125, + "learning_rate": 4.904052268710713e-08, + "loss": 1.4332, + "step": 11228 + }, + { + "epoch": 1.9372034848615542, + "grad_norm": 0.6796875, + "learning_rate": 4.8771658335623476e-08, + "loss": 1.4508, + "step": 11229 + }, + { + "epoch": 1.9373760027602864, + "grad_norm": 0.58984375, + "learning_rate": 4.850353123331486e-08, + "loss": 1.4796, + "step": 11230 + }, + { + "epoch": 1.9375485206590184, + "grad_norm": 0.5546875, + "learning_rate": 4.823614140004429e-08, + "loss": 1.4209, + "step": 11231 + }, + { + "epoch": 1.9377210385577504, + "grad_norm": 0.58984375, + "learning_rate": 4.796948885562036e-08, + "loss": 1.3682, + "step": 11232 + }, + { + "epoch": 1.9378935564564823, + "grad_norm": 0.53125, + "learning_rate": 4.7703573619800604e-08, + "loss": 1.3958, + "step": 11233 + }, + { + "epoch": 1.9380660743552145, + "grad_norm": 0.80859375, + "learning_rate": 4.743839571228592e-08, + "loss": 1.4617, + "step": 11234 + }, + { + "epoch": 1.9382385922539465, + "grad_norm": 0.6015625, + "learning_rate": 4.7173955152719496e-08, + "loss": 1.4021, + "step": 11235 + }, + { + "epoch": 1.9384111101526784, + "grad_norm": 4.59375, + "learning_rate": 4.6910251960695655e-08, + "loss": 1.43, + "step": 11236 + }, + { + "epoch": 1.9385836280514104, + "grad_norm": 0.640625, + "learning_rate": 4.664728615574987e-08, + "loss": 1.5197, + "step": 11237 + }, + { + "epoch": 1.9387561459501423, + "grad_norm": 0.5625, + "learning_rate": 4.638505775736546e-08, + "loss": 1.4641, + "step": 11238 + }, + { + "epoch": 1.9389286638488743, + "grad_norm": 0.64453125, + "learning_rate": 4.612356678496799e-08, + "loss": 1.4546, + "step": 11239 + }, + { + "epoch": 1.9391011817476063, + "grad_norm": 0.56640625, + "learning_rate": 4.5862813257931957e-08, + "loss": 1.4626, + "step": 11240 + }, + { + "epoch": 1.9392736996463382, + "grad_norm": 0.58203125, + "learning_rate": 4.5602797195574143e-08, + "loss": 1.4488, + "step": 11241 + }, + { + "epoch": 1.9394462175450702, + "grad_norm": 0.56640625, + "learning_rate": 4.534351861716024e-08, + "loss": 1.4772, + "step": 11242 + }, + { + "epoch": 1.9396187354438021, + "grad_norm": 0.59375, + "learning_rate": 4.5084977541897116e-08, + "loss": 1.3595, + "step": 11243 + }, + { + "epoch": 1.9397912533425343, + "grad_norm": 0.56640625, + "learning_rate": 4.482717398894165e-08, + "loss": 1.303, + "step": 11244 + }, + { + "epoch": 1.9399637712412663, + "grad_norm": 0.5703125, + "learning_rate": 4.4570107977389696e-08, + "loss": 1.3287, + "step": 11245 + }, + { + "epoch": 1.9401362891399982, + "grad_norm": 0.60546875, + "learning_rate": 4.4313779526290457e-08, + "loss": 1.4473, + "step": 11246 + }, + { + "epoch": 1.9403088070387304, + "grad_norm": 0.61328125, + "learning_rate": 4.4058188654630965e-08, + "loss": 1.3413, + "step": 11247 + }, + { + "epoch": 1.9404813249374624, + "grad_norm": 0.609375, + "learning_rate": 4.3803335381349396e-08, + "loss": 1.4993, + "step": 11248 + }, + { + "epoch": 1.9406538428361944, + "grad_norm": 0.62890625, + "learning_rate": 4.354921972532511e-08, + "loss": 1.3871, + "step": 11249 + }, + { + "epoch": 1.9408263607349263, + "grad_norm": 0.625, + "learning_rate": 4.3295841705386365e-08, + "loss": 1.3752, + "step": 11250 + }, + { + "epoch": 1.9409988786336583, + "grad_norm": 0.59375, + "learning_rate": 4.30432013403026e-08, + "loss": 1.4766, + "step": 11251 + }, + { + "epoch": 1.9411713965323902, + "grad_norm": 0.6015625, + "learning_rate": 4.279129864879439e-08, + "loss": 1.3649, + "step": 11252 + }, + { + "epoch": 1.9413439144311222, + "grad_norm": 0.62890625, + "learning_rate": 4.2540133649520145e-08, + "loss": 1.4277, + "step": 11253 + }, + { + "epoch": 1.9415164323298542, + "grad_norm": 0.58203125, + "learning_rate": 4.2289706361091643e-08, + "loss": 1.4751, + "step": 11254 + }, + { + "epoch": 1.9416889502285861, + "grad_norm": 0.6015625, + "learning_rate": 4.2040016802059604e-08, + "loss": 1.4691, + "step": 11255 + }, + { + "epoch": 1.941861468127318, + "grad_norm": 0.57421875, + "learning_rate": 4.179106499092367e-08, + "loss": 1.4345, + "step": 11256 + }, + { + "epoch": 1.94203398602605, + "grad_norm": 0.57421875, + "learning_rate": 4.1542850946126864e-08, + "loss": 1.359, + "step": 11257 + }, + { + "epoch": 1.9422065039247822, + "grad_norm": 0.578125, + "learning_rate": 4.129537468605893e-08, + "loss": 1.4108, + "step": 11258 + }, + { + "epoch": 1.9423790218235142, + "grad_norm": 0.59375, + "learning_rate": 4.1048636229055194e-08, + "loss": 1.4584, + "step": 11259 + }, + { + "epoch": 1.9425515397222461, + "grad_norm": 0.61328125, + "learning_rate": 4.080263559339437e-08, + "loss": 1.4231, + "step": 11260 + }, + { + "epoch": 1.9427240576209783, + "grad_norm": 1.4453125, + "learning_rate": 4.0557372797302984e-08, + "loss": 1.3745, + "step": 11261 + }, + { + "epoch": 1.9428965755197103, + "grad_norm": 0.71875, + "learning_rate": 4.0312847858949846e-08, + "loss": 1.4831, + "step": 11262 + }, + { + "epoch": 1.9430690934184423, + "grad_norm": 0.53515625, + "learning_rate": 4.006906079645267e-08, + "loss": 1.3649, + "step": 11263 + }, + { + "epoch": 1.9432416113171742, + "grad_norm": 0.60546875, + "learning_rate": 3.982601162787147e-08, + "loss": 1.4526, + "step": 11264 + }, + { + "epoch": 1.9434141292159062, + "grad_norm": 0.609375, + "learning_rate": 3.9583700371214064e-08, + "loss": 1.4766, + "step": 11265 + }, + { + "epoch": 1.9435866471146381, + "grad_norm": 0.6015625, + "learning_rate": 3.9342127044430524e-08, + "loss": 1.4487, + "step": 11266 + }, + { + "epoch": 1.94375916501337, + "grad_norm": 0.5625, + "learning_rate": 3.910129166541987e-08, + "loss": 1.4204, + "step": 11267 + }, + { + "epoch": 1.943931682912102, + "grad_norm": 0.57421875, + "learning_rate": 3.8861194252024504e-08, + "loss": 1.3871, + "step": 11268 + }, + { + "epoch": 1.944104200810834, + "grad_norm": 0.6171875, + "learning_rate": 3.86218348220313e-08, + "loss": 1.4365, + "step": 11269 + }, + { + "epoch": 1.944276718709566, + "grad_norm": 0.578125, + "learning_rate": 3.8383213393174965e-08, + "loss": 1.4831, + "step": 11270 + }, + { + "epoch": 1.9444492366082982, + "grad_norm": 0.57421875, + "learning_rate": 3.814532998313247e-08, + "loss": 1.4162, + "step": 11271 + }, + { + "epoch": 1.9446217545070301, + "grad_norm": 0.6015625, + "learning_rate": 3.790818460952861e-08, + "loss": 1.356, + "step": 11272 + }, + { + "epoch": 1.944794272405762, + "grad_norm": 0.60546875, + "learning_rate": 3.7671777289932654e-08, + "loss": 1.4233, + "step": 11273 + }, + { + "epoch": 1.944966790304494, + "grad_norm": 0.5859375, + "learning_rate": 3.743610804185949e-08, + "loss": 1.4831, + "step": 11274 + }, + { + "epoch": 1.9451393082032262, + "grad_norm": 0.56640625, + "learning_rate": 3.720117688276737e-08, + "loss": 1.3817, + "step": 11275 + }, + { + "epoch": 1.9453118261019582, + "grad_norm": 0.5546875, + "learning_rate": 3.6966983830063477e-08, + "loss": 1.3679, + "step": 11276 + }, + { + "epoch": 1.9454843440006901, + "grad_norm": 0.578125, + "learning_rate": 3.673352890109616e-08, + "loss": 1.4826, + "step": 11277 + }, + { + "epoch": 1.945656861899422, + "grad_norm": 0.5859375, + "learning_rate": 3.650081211316381e-08, + "loss": 1.4326, + "step": 11278 + }, + { + "epoch": 1.945829379798154, + "grad_norm": 0.56640625, + "learning_rate": 3.626883348350485e-08, + "loss": 1.4831, + "step": 11279 + }, + { + "epoch": 1.946001897696886, + "grad_norm": 0.55859375, + "learning_rate": 3.603759302930776e-08, + "loss": 1.4117, + "step": 11280 + }, + { + "epoch": 1.946174415595618, + "grad_norm": 0.69921875, + "learning_rate": 3.5807090767703276e-08, + "loss": 1.4233, + "step": 11281 + }, + { + "epoch": 1.94634693349435, + "grad_norm": 0.59375, + "learning_rate": 3.557732671576885e-08, + "loss": 1.4643, + "step": 11282 + }, + { + "epoch": 1.946519451393082, + "grad_norm": 0.546875, + "learning_rate": 3.534830089052532e-08, + "loss": 1.4645, + "step": 11283 + }, + { + "epoch": 1.9466919692918139, + "grad_norm": 0.57421875, + "learning_rate": 3.512001330894355e-08, + "loss": 1.3985, + "step": 11284 + }, + { + "epoch": 1.946864487190546, + "grad_norm": 0.5546875, + "learning_rate": 3.4892463987933335e-08, + "loss": 1.4634, + "step": 11285 + }, + { + "epoch": 1.947037005089278, + "grad_norm": 0.57421875, + "learning_rate": 3.4665652944355646e-08, + "loss": 1.4396, + "step": 11286 + }, + { + "epoch": 1.94720952298801, + "grad_norm": 0.60546875, + "learning_rate": 3.443958019501148e-08, + "loss": 1.5846, + "step": 11287 + }, + { + "epoch": 1.9473820408867422, + "grad_norm": 0.60546875, + "learning_rate": 3.421424575665078e-08, + "loss": 1.4767, + "step": 11288 + }, + { + "epoch": 1.9475545587854741, + "grad_norm": 0.59765625, + "learning_rate": 3.398964964596907e-08, + "loss": 1.524, + "step": 11289 + }, + { + "epoch": 1.947727076684206, + "grad_norm": 0.58203125, + "learning_rate": 3.376579187960305e-08, + "loss": 1.3998, + "step": 11290 + }, + { + "epoch": 1.947899594582938, + "grad_norm": 0.60546875, + "learning_rate": 3.354267247414056e-08, + "loss": 1.4296, + "step": 11291 + }, + { + "epoch": 1.94807211248167, + "grad_norm": 0.66796875, + "learning_rate": 3.332029144610949e-08, + "loss": 1.4121, + "step": 11292 + }, + { + "epoch": 1.948244630380402, + "grad_norm": 0.6171875, + "learning_rate": 3.309864881198555e-08, + "loss": 1.4809, + "step": 11293 + }, + { + "epoch": 1.948417148279134, + "grad_norm": 0.56640625, + "learning_rate": 3.2877744588190044e-08, + "loss": 1.3625, + "step": 11294 + }, + { + "epoch": 1.9485896661778659, + "grad_norm": 0.578125, + "learning_rate": 3.2657578791088775e-08, + "loss": 1.4392, + "step": 11295 + }, + { + "epoch": 1.9487621840765978, + "grad_norm": 0.6953125, + "learning_rate": 3.243815143699314e-08, + "loss": 1.3932, + "step": 11296 + }, + { + "epoch": 1.9489347019753298, + "grad_norm": 0.58203125, + "learning_rate": 3.2219462542159016e-08, + "loss": 1.4316, + "step": 11297 + }, + { + "epoch": 1.9491072198740618, + "grad_norm": 0.55859375, + "learning_rate": 3.2001512122789014e-08, + "loss": 1.5107, + "step": 11298 + }, + { + "epoch": 1.949279737772794, + "grad_norm": 0.61328125, + "learning_rate": 3.17843001950302e-08, + "loss": 1.417, + "step": 11299 + }, + { + "epoch": 1.949452255671526, + "grad_norm": 0.6328125, + "learning_rate": 3.1567826774974166e-08, + "loss": 1.4208, + "step": 11300 + }, + { + "epoch": 1.949452255671526, + "eval_loss": 1.4070621728897095, + "eval_runtime": 10.985, + "eval_samples_per_second": 93.218, + "eval_steps_per_second": 23.305, + "step": 11300 + }, + { + "epoch": 1.9496247735702579, + "grad_norm": 0.57421875, + "learning_rate": 3.135209187865917e-08, + "loss": 1.4143, + "step": 11301 + }, + { + "epoch": 1.94979729146899, + "grad_norm": 0.56640625, + "learning_rate": 3.1137095522068006e-08, + "loss": 1.4008, + "step": 11302 + }, + { + "epoch": 1.949969809367722, + "grad_norm": 0.56640625, + "learning_rate": 3.092283772113014e-08, + "loss": 1.5115, + "step": 11303 + }, + { + "epoch": 1.950142327266454, + "grad_norm": 0.60546875, + "learning_rate": 3.070931849171732e-08, + "loss": 1.3597, + "step": 11304 + }, + { + "epoch": 1.950314845165186, + "grad_norm": 0.58203125, + "learning_rate": 3.049653784964912e-08, + "loss": 1.505, + "step": 11305 + }, + { + "epoch": 1.950487363063918, + "grad_norm": 0.609375, + "learning_rate": 3.028449581068959e-08, + "loss": 1.4266, + "step": 11306 + }, + { + "epoch": 1.9506598809626499, + "grad_norm": 0.63671875, + "learning_rate": 3.00731923905484e-08, + "loss": 1.4253, + "step": 11307 + }, + { + "epoch": 1.9508323988613818, + "grad_norm": 0.625, + "learning_rate": 2.986262760488079e-08, + "loss": 1.5135, + "step": 11308 + }, + { + "epoch": 1.9510049167601138, + "grad_norm": 0.6015625, + "learning_rate": 2.9652801469285396e-08, + "loss": 1.4114, + "step": 11309 + }, + { + "epoch": 1.9511774346588457, + "grad_norm": 2.265625, + "learning_rate": 2.9443713999308676e-08, + "loss": 1.4138, + "step": 11310 + }, + { + "epoch": 1.9513499525575777, + "grad_norm": 0.59765625, + "learning_rate": 2.923536521044046e-08, + "loss": 1.3957, + "step": 11311 + }, + { + "epoch": 1.9515224704563099, + "grad_norm": 0.5546875, + "learning_rate": 2.9027755118116175e-08, + "loss": 1.447, + "step": 11312 + }, + { + "epoch": 1.9516949883550418, + "grad_norm": 0.54296875, + "learning_rate": 2.8820883737716853e-08, + "loss": 1.3001, + "step": 11313 + }, + { + "epoch": 1.9518675062537738, + "grad_norm": 0.546875, + "learning_rate": 2.8614751084570236e-08, + "loss": 1.4411, + "step": 11314 + }, + { + "epoch": 1.9520400241525058, + "grad_norm": 0.5546875, + "learning_rate": 2.8409357173946327e-08, + "loss": 1.3716, + "step": 11315 + }, + { + "epoch": 1.952212542051238, + "grad_norm": 1.4140625, + "learning_rate": 2.8204702021062958e-08, + "loss": 1.5538, + "step": 11316 + }, + { + "epoch": 1.95238505994997, + "grad_norm": 0.59375, + "learning_rate": 2.800078564108133e-08, + "loss": 1.3402, + "step": 11317 + }, + { + "epoch": 1.9525575778487019, + "grad_norm": 0.578125, + "learning_rate": 2.779760804911047e-08, + "loss": 1.3476, + "step": 11318 + }, + { + "epoch": 1.9527300957474338, + "grad_norm": 0.5859375, + "learning_rate": 2.759516926020056e-08, + "loss": 1.3268, + "step": 11319 + }, + { + "epoch": 1.9529026136461658, + "grad_norm": 0.65625, + "learning_rate": 2.7393469289351825e-08, + "loss": 1.4073, + "step": 11320 + }, + { + "epoch": 1.9530751315448978, + "grad_norm": 0.5546875, + "learning_rate": 2.7192508151506758e-08, + "loss": 1.412, + "step": 11321 + }, + { + "epoch": 1.9532476494436297, + "grad_norm": 0.59765625, + "learning_rate": 2.6992285861553447e-08, + "loss": 1.5015, + "step": 11322 + }, + { + "epoch": 1.9534201673423617, + "grad_norm": 0.64453125, + "learning_rate": 2.6792802434326692e-08, + "loss": 1.5071, + "step": 11323 + }, + { + "epoch": 1.9535926852410936, + "grad_norm": 0.55078125, + "learning_rate": 2.6594057884603565e-08, + "loss": 1.3677, + "step": 11324 + }, + { + "epoch": 1.9537652031398256, + "grad_norm": 0.62109375, + "learning_rate": 2.639605222710895e-08, + "loss": 1.4214, + "step": 11325 + }, + { + "epoch": 1.9539377210385578, + "grad_norm": 0.6171875, + "learning_rate": 2.6198785476513333e-08, + "loss": 1.3839, + "step": 11326 + }, + { + "epoch": 1.9541102389372897, + "grad_norm": 0.62109375, + "learning_rate": 2.6002257647431694e-08, + "loss": 1.5533, + "step": 11327 + }, + { + "epoch": 1.9542827568360217, + "grad_norm": 0.5703125, + "learning_rate": 2.5806468754422388e-08, + "loss": 1.3973, + "step": 11328 + }, + { + "epoch": 1.954455274734754, + "grad_norm": 0.6015625, + "learning_rate": 2.5611418811991586e-08, + "loss": 1.4482, + "step": 11329 + }, + { + "epoch": 1.9546277926334858, + "grad_norm": 0.58203125, + "learning_rate": 2.541710783458884e-08, + "loss": 1.4284, + "step": 11330 + }, + { + "epoch": 1.9548003105322178, + "grad_norm": 0.57421875, + "learning_rate": 2.5223535836612634e-08, + "loss": 1.4421, + "step": 11331 + }, + { + "epoch": 1.9549728284309498, + "grad_norm": 0.62890625, + "learning_rate": 2.503070283240039e-08, + "loss": 1.4014, + "step": 11332 + }, + { + "epoch": 1.9551453463296817, + "grad_norm": 0.59375, + "learning_rate": 2.4838608836241783e-08, + "loss": 1.3639, + "step": 11333 + }, + { + "epoch": 1.9553178642284137, + "grad_norm": 0.59375, + "learning_rate": 2.4647253862365438e-08, + "loss": 1.4609, + "step": 11334 + }, + { + "epoch": 1.9554903821271457, + "grad_norm": 0.828125, + "learning_rate": 2.445663792495001e-08, + "loss": 1.3816, + "step": 11335 + }, + { + "epoch": 1.9556629000258776, + "grad_norm": 0.60546875, + "learning_rate": 2.4266761038116428e-08, + "loss": 1.418, + "step": 11336 + }, + { + "epoch": 1.9558354179246096, + "grad_norm": 0.61328125, + "learning_rate": 2.4077623215933432e-08, + "loss": 1.495, + "step": 11337 + }, + { + "epoch": 1.9560079358233415, + "grad_norm": 0.609375, + "learning_rate": 2.388922447241204e-08, + "loss": 1.4245, + "step": 11338 + }, + { + "epoch": 1.9561804537220737, + "grad_norm": 0.5625, + "learning_rate": 2.370156482150998e-08, + "loss": 1.4573, + "step": 11339 + }, + { + "epoch": 1.9563529716208057, + "grad_norm": 0.58984375, + "learning_rate": 2.3514644277131682e-08, + "loss": 1.4742, + "step": 11340 + }, + { + "epoch": 1.9565254895195376, + "grad_norm": 0.57421875, + "learning_rate": 2.3328462853123846e-08, + "loss": 1.5041, + "step": 11341 + }, + { + "epoch": 1.9566980074182696, + "grad_norm": 0.58984375, + "learning_rate": 2.3143020563280993e-08, + "loss": 1.4036, + "step": 11342 + }, + { + "epoch": 1.9568705253170018, + "grad_norm": 0.58984375, + "learning_rate": 2.2958317421341026e-08, + "loss": 1.3942, + "step": 11343 + }, + { + "epoch": 1.9570430432157337, + "grad_norm": 0.578125, + "learning_rate": 2.277435344098855e-08, + "loss": 1.4601, + "step": 11344 + }, + { + "epoch": 1.9572155611144657, + "grad_norm": 0.5703125, + "learning_rate": 2.2591128635852666e-08, + "loss": 1.3396, + "step": 11345 + }, + { + "epoch": 1.9573880790131977, + "grad_norm": 0.56640625, + "learning_rate": 2.240864301950807e-08, + "loss": 1.4657, + "step": 11346 + }, + { + "epoch": 1.9575605969119296, + "grad_norm": 0.58203125, + "learning_rate": 2.2226896605473945e-08, + "loss": 1.4744, + "step": 11347 + }, + { + "epoch": 1.9577331148106616, + "grad_norm": 1.578125, + "learning_rate": 2.2045889407215082e-08, + "loss": 1.4355, + "step": 11348 + }, + { + "epoch": 1.9579056327093936, + "grad_norm": 0.57421875, + "learning_rate": 2.186562143814186e-08, + "loss": 1.3912, + "step": 11349 + }, + { + "epoch": 1.9580781506081255, + "grad_norm": 0.6796875, + "learning_rate": 2.1686092711609154e-08, + "loss": 1.4397, + "step": 11350 + }, + { + "epoch": 1.9582506685068575, + "grad_norm": 0.59765625, + "learning_rate": 2.1507303240918543e-08, + "loss": 1.4747, + "step": 11351 + }, + { + "epoch": 1.9584231864055894, + "grad_norm": 0.625, + "learning_rate": 2.132925303931499e-08, + "loss": 1.4209, + "step": 11352 + }, + { + "epoch": 1.9585957043043216, + "grad_norm": 0.5703125, + "learning_rate": 2.1151942119991274e-08, + "loss": 1.3934, + "step": 11353 + }, + { + "epoch": 1.9587682222030536, + "grad_norm": 0.578125, + "learning_rate": 2.0975370496081336e-08, + "loss": 1.4919, + "step": 11354 + }, + { + "epoch": 1.9589407401017855, + "grad_norm": 0.61328125, + "learning_rate": 2.0799538180668044e-08, + "loss": 1.4422, + "step": 11355 + }, + { + "epoch": 1.9591132580005175, + "grad_norm": 0.76171875, + "learning_rate": 2.0624445186777643e-08, + "loss": 1.4383, + "step": 11356 + }, + { + "epoch": 1.9592857758992497, + "grad_norm": 0.57421875, + "learning_rate": 2.045009152738309e-08, + "loss": 1.4113, + "step": 11357 + }, + { + "epoch": 1.9594582937979816, + "grad_norm": 0.55859375, + "learning_rate": 2.0276477215399604e-08, + "loss": 1.3592, + "step": 11358 + }, + { + "epoch": 1.9596308116967136, + "grad_norm": 0.5625, + "learning_rate": 2.0103602263692455e-08, + "loss": 1.4196, + "step": 11359 + }, + { + "epoch": 1.9598033295954456, + "grad_norm": 0.65234375, + "learning_rate": 1.993146668506585e-08, + "loss": 1.4562, + "step": 11360 + }, + { + "epoch": 1.9599758474941775, + "grad_norm": 0.61328125, + "learning_rate": 1.976007049227624e-08, + "loss": 1.5406, + "step": 11361 + }, + { + "epoch": 1.9601483653929095, + "grad_norm": 0.609375, + "learning_rate": 1.9589413698019034e-08, + "loss": 1.4636, + "step": 11362 + }, + { + "epoch": 1.9603208832916414, + "grad_norm": 0.546875, + "learning_rate": 1.9419496314939667e-08, + "loss": 1.4362, + "step": 11363 + }, + { + "epoch": 1.9604934011903734, + "grad_norm": 0.62890625, + "learning_rate": 1.925031835562474e-08, + "loss": 1.5589, + "step": 11364 + }, + { + "epoch": 1.9606659190891054, + "grad_norm": 0.609375, + "learning_rate": 1.9081879832608674e-08, + "loss": 1.4037, + "step": 11365 + }, + { + "epoch": 1.9608384369878373, + "grad_norm": 0.58984375, + "learning_rate": 1.891418075837037e-08, + "loss": 1.5149, + "step": 11366 + }, + { + "epoch": 1.9610109548865695, + "grad_norm": 0.609375, + "learning_rate": 1.8747221145334337e-08, + "loss": 1.4395, + "step": 11367 + }, + { + "epoch": 1.9611834727853015, + "grad_norm": 0.5546875, + "learning_rate": 1.858100100587068e-08, + "loss": 1.3103, + "step": 11368 + }, + { + "epoch": 1.9613559906840334, + "grad_norm": 0.64453125, + "learning_rate": 1.841552035229288e-08, + "loss": 1.4143, + "step": 11369 + }, + { + "epoch": 1.9615285085827656, + "grad_norm": 0.68359375, + "learning_rate": 1.8250779196861136e-08, + "loss": 1.31, + "step": 11370 + }, + { + "epoch": 1.9617010264814976, + "grad_norm": 0.6015625, + "learning_rate": 1.8086777551780122e-08, + "loss": 1.4315, + "step": 11371 + }, + { + "epoch": 1.9618735443802295, + "grad_norm": 0.6015625, + "learning_rate": 1.7923515429201232e-08, + "loss": 1.4189, + "step": 11372 + }, + { + "epoch": 1.9620460622789615, + "grad_norm": 0.59375, + "learning_rate": 1.7760992841219237e-08, + "loss": 1.3975, + "step": 11373 + }, + { + "epoch": 1.9622185801776935, + "grad_norm": 0.6015625, + "learning_rate": 1.7599209799874505e-08, + "loss": 1.4508, + "step": 11374 + }, + { + "epoch": 1.9623910980764254, + "grad_norm": 0.55859375, + "learning_rate": 1.7438166317153005e-08, + "loss": 1.4656, + "step": 11375 + }, + { + "epoch": 1.9625636159751574, + "grad_norm": 0.546875, + "learning_rate": 1.72778624049863e-08, + "loss": 1.3995, + "step": 11376 + }, + { + "epoch": 1.9627361338738893, + "grad_norm": 0.671875, + "learning_rate": 1.711829807525045e-08, + "loss": 1.3984, + "step": 11377 + }, + { + "epoch": 1.9629086517726213, + "grad_norm": 0.6484375, + "learning_rate": 1.6959473339765997e-08, + "loss": 1.4435, + "step": 11378 + }, + { + "epoch": 1.9630811696713533, + "grad_norm": 0.5625, + "learning_rate": 1.6801388210302416e-08, + "loss": 1.3658, + "step": 11379 + }, + { + "epoch": 1.9632536875700854, + "grad_norm": 0.578125, + "learning_rate": 1.6644042698569228e-08, + "loss": 1.4498, + "step": 11380 + }, + { + "epoch": 1.9634262054688174, + "grad_norm": 0.625, + "learning_rate": 1.648743681622378e-08, + "loss": 1.4733, + "step": 11381 + }, + { + "epoch": 1.9635987233675494, + "grad_norm": 0.59765625, + "learning_rate": 1.6331570574869005e-08, + "loss": 1.4511, + "step": 11382 + }, + { + "epoch": 1.9637712412662813, + "grad_norm": 0.55859375, + "learning_rate": 1.6176443986052337e-08, + "loss": 1.4096, + "step": 11383 + }, + { + "epoch": 1.9639437591650135, + "grad_norm": 0.625, + "learning_rate": 1.6022057061266804e-08, + "loss": 1.4136, + "step": 11384 + }, + { + "epoch": 1.9641162770637455, + "grad_norm": 0.5859375, + "learning_rate": 1.5868409811949926e-08, + "loss": 1.435, + "step": 11385 + }, + { + "epoch": 1.9642887949624774, + "grad_norm": 1.171875, + "learning_rate": 1.5715502249484816e-08, + "loss": 1.3491, + "step": 11386 + }, + { + "epoch": 1.9644613128612094, + "grad_norm": 0.59375, + "learning_rate": 1.55633343852013e-08, + "loss": 1.339, + "step": 11387 + }, + { + "epoch": 1.9646338307599414, + "grad_norm": 0.6328125, + "learning_rate": 1.5411906230370366e-08, + "loss": 1.3539, + "step": 11388 + }, + { + "epoch": 1.9648063486586733, + "grad_norm": 0.5703125, + "learning_rate": 1.5261217796211923e-08, + "loss": 1.4352, + "step": 11389 + }, + { + "epoch": 1.9649788665574053, + "grad_norm": 0.6171875, + "learning_rate": 1.5111269093890378e-08, + "loss": 1.4615, + "step": 11390 + }, + { + "epoch": 1.9651513844561372, + "grad_norm": 0.5859375, + "learning_rate": 1.4962060134513512e-08, + "loss": 1.3839, + "step": 11391 + }, + { + "epoch": 1.9653239023548692, + "grad_norm": 0.578125, + "learning_rate": 1.4813590929138032e-08, + "loss": 1.5242, + "step": 11392 + }, + { + "epoch": 1.9654964202536012, + "grad_norm": 0.6015625, + "learning_rate": 1.4665861488761813e-08, + "loss": 1.3994, + "step": 11393 + }, + { + "epoch": 1.9656689381523333, + "grad_norm": 0.5625, + "learning_rate": 1.4518871824329428e-08, + "loss": 1.354, + "step": 11394 + }, + { + "epoch": 1.9658414560510653, + "grad_norm": 0.65625, + "learning_rate": 1.4372621946731058e-08, + "loss": 1.4321, + "step": 11395 + }, + { + "epoch": 1.9660139739497973, + "grad_norm": 0.58203125, + "learning_rate": 1.4227111866802479e-08, + "loss": 1.307, + "step": 11396 + }, + { + "epoch": 1.9661864918485294, + "grad_norm": 0.5625, + "learning_rate": 1.4082341595322846e-08, + "loss": 1.4024, + "step": 11397 + }, + { + "epoch": 1.9663590097472614, + "grad_norm": 0.54296875, + "learning_rate": 1.3938311143018024e-08, + "loss": 1.3452, + "step": 11398 + }, + { + "epoch": 1.9665315276459934, + "grad_norm": 0.56640625, + "learning_rate": 1.3795020520559477e-08, + "loss": 1.3352, + "step": 11399 + }, + { + "epoch": 1.9667040455447253, + "grad_norm": 0.55859375, + "learning_rate": 1.3652469738562046e-08, + "loss": 1.3326, + "step": 11400 + }, + { + "epoch": 1.9667040455447253, + "eval_loss": 1.407050371170044, + "eval_runtime": 10.8811, + "eval_samples_per_second": 94.108, + "eval_steps_per_second": 23.527, + "step": 11400 + }, + { + "epoch": 1.9668765634434573, + "grad_norm": 0.625, + "learning_rate": 1.3510658807588394e-08, + "loss": 1.3218, + "step": 11401 + }, + { + "epoch": 1.9670490813421893, + "grad_norm": 0.9140625, + "learning_rate": 1.3369587738142343e-08, + "loss": 1.3314, + "step": 11402 + }, + { + "epoch": 1.9672215992409212, + "grad_norm": 0.578125, + "learning_rate": 1.3229256540676638e-08, + "loss": 1.4445, + "step": 11403 + }, + { + "epoch": 1.9673941171396532, + "grad_norm": 0.58203125, + "learning_rate": 1.3089665225588522e-08, + "loss": 1.3972, + "step": 11404 + }, + { + "epoch": 1.9675666350383851, + "grad_norm": 0.87109375, + "learning_rate": 1.295081380321861e-08, + "loss": 1.4608, + "step": 11405 + }, + { + "epoch": 1.967739152937117, + "grad_norm": 0.58984375, + "learning_rate": 1.2812702283855338e-08, + "loss": 1.4053, + "step": 11406 + }, + { + "epoch": 1.967911670835849, + "grad_norm": 0.58984375, + "learning_rate": 1.2675330677729413e-08, + "loss": 1.4719, + "step": 11407 + }, + { + "epoch": 1.9680841887345812, + "grad_norm": 0.5859375, + "learning_rate": 1.253869899501825e-08, + "loss": 1.523, + "step": 11408 + }, + { + "epoch": 1.9682567066333132, + "grad_norm": 0.57421875, + "learning_rate": 1.2402807245844861e-08, + "loss": 1.4111, + "step": 11409 + }, + { + "epoch": 1.9684292245320452, + "grad_norm": 0.59375, + "learning_rate": 1.226765544027675e-08, + "loss": 1.4143, + "step": 11410 + }, + { + "epoch": 1.9686017424307773, + "grad_norm": 0.56640625, + "learning_rate": 1.2133243588327014e-08, + "loss": 1.4456, + "step": 11411 + }, + { + "epoch": 1.9687742603295093, + "grad_norm": 0.609375, + "learning_rate": 1.199957169995436e-08, + "loss": 1.4577, + "step": 11412 + }, + { + "epoch": 1.9689467782282413, + "grad_norm": 0.5546875, + "learning_rate": 1.1866639785060862e-08, + "loss": 1.3764, + "step": 11413 + }, + { + "epoch": 1.9691192961269732, + "grad_norm": 0.55078125, + "learning_rate": 1.1734447853495312e-08, + "loss": 1.3514, + "step": 11414 + }, + { + "epoch": 1.9692918140257052, + "grad_norm": 0.56640625, + "learning_rate": 1.1602995915050985e-08, + "loss": 1.5606, + "step": 11415 + }, + { + "epoch": 1.9694643319244372, + "grad_norm": 0.5625, + "learning_rate": 1.1472283979467868e-08, + "loss": 1.4479, + "step": 11416 + }, + { + "epoch": 1.9696368498231691, + "grad_norm": 0.58984375, + "learning_rate": 1.1342312056429328e-08, + "loss": 1.5718, + "step": 11417 + }, + { + "epoch": 1.969809367721901, + "grad_norm": 0.640625, + "learning_rate": 1.1213080155564327e-08, + "loss": 1.5088, + "step": 11418 + }, + { + "epoch": 1.969981885620633, + "grad_norm": 0.61328125, + "learning_rate": 1.1084588286446319e-08, + "loss": 1.467, + "step": 11419 + }, + { + "epoch": 1.970154403519365, + "grad_norm": 0.5703125, + "learning_rate": 1.0956836458596576e-08, + "loss": 1.3432, + "step": 11420 + }, + { + "epoch": 1.9703269214180972, + "grad_norm": 0.5625, + "learning_rate": 1.082982468147864e-08, + "loss": 1.4005, + "step": 11421 + }, + { + "epoch": 1.9704994393168291, + "grad_norm": 0.6015625, + "learning_rate": 1.070355296450165e-08, + "loss": 1.3914, + "step": 11422 + }, + { + "epoch": 1.970671957215561, + "grad_norm": 0.56640625, + "learning_rate": 1.0578021317022569e-08, + "loss": 1.315, + "step": 11423 + }, + { + "epoch": 1.970844475114293, + "grad_norm": 0.58984375, + "learning_rate": 1.045322974833951e-08, + "loss": 1.4263, + "step": 11424 + }, + { + "epoch": 1.9710169930130252, + "grad_norm": 0.5859375, + "learning_rate": 1.0329178267699525e-08, + "loss": 1.4316, + "step": 11425 + }, + { + "epoch": 1.9711895109117572, + "grad_norm": 0.578125, + "learning_rate": 1.0205866884291926e-08, + "loss": 1.4727, + "step": 11426 + }, + { + "epoch": 1.9713620288104892, + "grad_norm": 0.5625, + "learning_rate": 1.0083295607252741e-08, + "loss": 1.3613, + "step": 11427 + }, + { + "epoch": 1.9715345467092211, + "grad_norm": 0.5859375, + "learning_rate": 9.961464445663594e-09, + "loss": 1.4517, + "step": 11428 + }, + { + "epoch": 1.971707064607953, + "grad_norm": 0.65625, + "learning_rate": 9.840373408548376e-09, + "loss": 1.3807, + "step": 11429 + }, + { + "epoch": 1.971879582506685, + "grad_norm": 0.5859375, + "learning_rate": 9.720022504881022e-09, + "loss": 1.36, + "step": 11430 + }, + { + "epoch": 1.972052100405417, + "grad_norm": 0.59765625, + "learning_rate": 9.600411743576621e-09, + "loss": 1.4093, + "step": 11431 + }, + { + "epoch": 1.972224618304149, + "grad_norm": 0.578125, + "learning_rate": 9.481541133495864e-09, + "loss": 1.4134, + "step": 11432 + }, + { + "epoch": 1.972397136202881, + "grad_norm": 1.5625, + "learning_rate": 9.36341068344615e-09, + "loss": 1.4477, + "step": 11433 + }, + { + "epoch": 1.972569654101613, + "grad_norm": 0.56640625, + "learning_rate": 9.246020402179368e-09, + "loss": 1.4009, + "step": 11434 + }, + { + "epoch": 1.972742172000345, + "grad_norm": 0.58203125, + "learning_rate": 9.129370298393004e-09, + "loss": 1.4254, + "step": 11435 + }, + { + "epoch": 1.972914689899077, + "grad_norm": 0.57421875, + "learning_rate": 9.013460380729033e-09, + "loss": 1.414, + "step": 11436 + }, + { + "epoch": 1.973087207797809, + "grad_norm": 0.609375, + "learning_rate": 8.898290657773923e-09, + "loss": 1.4608, + "step": 11437 + }, + { + "epoch": 1.9732597256965412, + "grad_norm": 0.64453125, + "learning_rate": 8.783861138060845e-09, + "loss": 1.5263, + "step": 11438 + }, + { + "epoch": 1.9734322435952731, + "grad_norm": 0.56640625, + "learning_rate": 8.670171830067464e-09, + "loss": 1.3921, + "step": 11439 + }, + { + "epoch": 1.973604761494005, + "grad_norm": 0.61328125, + "learning_rate": 8.557222742215932e-09, + "loss": 1.3992, + "step": 11440 + }, + { + "epoch": 1.973777279392737, + "grad_norm": 0.578125, + "learning_rate": 8.445013882875108e-09, + "loss": 1.547, + "step": 11441 + }, + { + "epoch": 1.973949797291469, + "grad_norm": 0.58984375, + "learning_rate": 8.333545260357235e-09, + "loss": 1.4586, + "step": 11442 + }, + { + "epoch": 1.974122315190201, + "grad_norm": 0.57421875, + "learning_rate": 8.222816882922368e-09, + "loss": 1.3983, + "step": 11443 + }, + { + "epoch": 1.974294833088933, + "grad_norm": 2.140625, + "learning_rate": 8.11282875877173e-09, + "loss": 1.557, + "step": 11444 + }, + { + "epoch": 1.974467350987665, + "grad_norm": 0.59375, + "learning_rate": 8.003580896055462e-09, + "loss": 1.363, + "step": 11445 + }, + { + "epoch": 1.9746398688863969, + "grad_norm": 0.6171875, + "learning_rate": 7.895073302865985e-09, + "loss": 1.3117, + "step": 11446 + }, + { + "epoch": 1.9748123867851288, + "grad_norm": 0.55859375, + "learning_rate": 7.787305987243532e-09, + "loss": 1.329, + "step": 11447 + }, + { + "epoch": 1.9749849046838608, + "grad_norm": 0.5625, + "learning_rate": 7.680278957171716e-09, + "loss": 1.2998, + "step": 11448 + }, + { + "epoch": 1.975157422582593, + "grad_norm": 0.58203125, + "learning_rate": 7.573992220580862e-09, + "loss": 1.3532, + "step": 11449 + }, + { + "epoch": 1.975329940481325, + "grad_norm": 0.64453125, + "learning_rate": 7.468445785342448e-09, + "loss": 1.341, + "step": 11450 + }, + { + "epoch": 1.975502458380057, + "grad_norm": 0.62890625, + "learning_rate": 7.363639659279109e-09, + "loss": 1.4146, + "step": 11451 + }, + { + "epoch": 1.975674976278789, + "grad_norm": 0.8203125, + "learning_rate": 7.259573850153523e-09, + "loss": 1.5061, + "step": 11452 + }, + { + "epoch": 1.975847494177521, + "grad_norm": 0.58984375, + "learning_rate": 7.156248365676188e-09, + "loss": 1.323, + "step": 11453 + }, + { + "epoch": 1.976020012076253, + "grad_norm": 0.546875, + "learning_rate": 7.053663213502093e-09, + "loss": 1.3581, + "step": 11454 + }, + { + "epoch": 1.976192529974985, + "grad_norm": 0.58203125, + "learning_rate": 6.951818401231825e-09, + "loss": 1.372, + "step": 11455 + }, + { + "epoch": 1.976365047873717, + "grad_norm": 0.6328125, + "learning_rate": 6.850713936410458e-09, + "loss": 1.3757, + "step": 11456 + }, + { + "epoch": 1.9765375657724489, + "grad_norm": 0.5859375, + "learning_rate": 6.750349826527558e-09, + "loss": 1.3695, + "step": 11457 + }, + { + "epoch": 1.9767100836711808, + "grad_norm": 0.57421875, + "learning_rate": 6.650726079019398e-09, + "loss": 1.3879, + "step": 11458 + }, + { + "epoch": 1.9768826015699128, + "grad_norm": 0.59765625, + "learning_rate": 6.551842701267852e-09, + "loss": 1.5441, + "step": 11459 + }, + { + "epoch": 1.9770551194686448, + "grad_norm": 0.6015625, + "learning_rate": 6.45369970059706e-09, + "loss": 1.3529, + "step": 11460 + }, + { + "epoch": 1.9772276373673767, + "grad_norm": 0.68359375, + "learning_rate": 6.356297084278984e-09, + "loss": 1.5121, + "step": 11461 + }, + { + "epoch": 1.977400155266109, + "grad_norm": 0.58984375, + "learning_rate": 6.259634859528962e-09, + "loss": 1.3742, + "step": 11462 + }, + { + "epoch": 1.9775726731648409, + "grad_norm": 0.59765625, + "learning_rate": 6.1637130335090446e-09, + "loss": 1.4024, + "step": 11463 + }, + { + "epoch": 1.9777451910635728, + "grad_norm": 0.5859375, + "learning_rate": 6.068531613326878e-09, + "loss": 1.4148, + "step": 11464 + }, + { + "epoch": 1.9779177089623048, + "grad_norm": 0.5703125, + "learning_rate": 5.9740906060312685e-09, + "loss": 1.4551, + "step": 11465 + }, + { + "epoch": 1.978090226861037, + "grad_norm": 0.63671875, + "learning_rate": 5.880390018621063e-09, + "loss": 1.4255, + "step": 11466 + }, + { + "epoch": 1.978262744759769, + "grad_norm": 0.55859375, + "learning_rate": 5.787429858038485e-09, + "loss": 1.4293, + "step": 11467 + }, + { + "epoch": 1.978435262658501, + "grad_norm": 0.625, + "learning_rate": 5.695210131169137e-09, + "loss": 1.4679, + "step": 11468 + }, + { + "epoch": 1.9786077805572329, + "grad_norm": 0.57421875, + "learning_rate": 5.603730844846444e-09, + "loss": 1.3792, + "step": 11469 + }, + { + "epoch": 1.9787802984559648, + "grad_norm": 0.6328125, + "learning_rate": 5.512992005846096e-09, + "loss": 1.4969, + "step": 11470 + }, + { + "epoch": 1.9789528163546968, + "grad_norm": 0.5390625, + "learning_rate": 5.422993620892713e-09, + "loss": 1.4239, + "step": 11471 + }, + { + "epoch": 1.9791253342534287, + "grad_norm": 0.62890625, + "learning_rate": 5.333735696653186e-09, + "loss": 1.442, + "step": 11472 + }, + { + "epoch": 1.9792978521521607, + "grad_norm": 0.62109375, + "learning_rate": 5.245218239740002e-09, + "loss": 1.4307, + "step": 11473 + }, + { + "epoch": 1.9794703700508927, + "grad_norm": 0.578125, + "learning_rate": 5.157441256710138e-09, + "loss": 1.4257, + "step": 11474 + }, + { + "epoch": 1.9796428879496246, + "grad_norm": 0.6484375, + "learning_rate": 5.070404754068392e-09, + "loss": 1.5279, + "step": 11475 + }, + { + "epoch": 1.9798154058483568, + "grad_norm": 0.58203125, + "learning_rate": 4.984108738261828e-09, + "loss": 1.4082, + "step": 11476 + }, + { + "epoch": 1.9799879237470888, + "grad_norm": 0.6015625, + "learning_rate": 4.898553215685331e-09, + "loss": 1.4421, + "step": 11477 + }, + { + "epoch": 1.9801604416458207, + "grad_norm": 0.5234375, + "learning_rate": 4.813738192676054e-09, + "loss": 1.338, + "step": 11478 + }, + { + "epoch": 1.980332959544553, + "grad_norm": 0.61328125, + "learning_rate": 4.729663675516749e-09, + "loss": 1.5036, + "step": 11479 + }, + { + "epoch": 1.9805054774432849, + "grad_norm": 0.5859375, + "learning_rate": 4.6463296704379876e-09, + "loss": 1.402, + "step": 11480 + }, + { + "epoch": 1.9806779953420168, + "grad_norm": 0.56640625, + "learning_rate": 4.5637361836126106e-09, + "loss": 1.39, + "step": 11481 + }, + { + "epoch": 1.9808505132407488, + "grad_norm": 0.6015625, + "learning_rate": 4.481883221160166e-09, + "loss": 1.3499, + "step": 11482 + }, + { + "epoch": 1.9810230311394808, + "grad_norm": 0.69921875, + "learning_rate": 4.400770789145803e-09, + "loss": 1.5186, + "step": 11483 + }, + { + "epoch": 1.9811955490382127, + "grad_norm": 0.57421875, + "learning_rate": 4.320398893576938e-09, + "loss": 1.4402, + "step": 11484 + }, + { + "epoch": 1.9813680669369447, + "grad_norm": 0.5703125, + "learning_rate": 4.240767540407698e-09, + "loss": 1.5271, + "step": 11485 + }, + { + "epoch": 1.9815405848356766, + "grad_norm": 0.5546875, + "learning_rate": 4.161876735540027e-09, + "loss": 1.4659, + "step": 11486 + }, + { + "epoch": 1.9817131027344086, + "grad_norm": 0.58203125, + "learning_rate": 4.08372648481703e-09, + "loss": 1.4919, + "step": 11487 + }, + { + "epoch": 1.9818856206331406, + "grad_norm": 0.57421875, + "learning_rate": 4.00631679402963e-09, + "loss": 1.3573, + "step": 11488 + }, + { + "epoch": 1.9820581385318725, + "grad_norm": 0.66015625, + "learning_rate": 3.9296476689110185e-09, + "loss": 1.469, + "step": 11489 + }, + { + "epoch": 1.9822306564306047, + "grad_norm": 0.5546875, + "learning_rate": 3.853719115143317e-09, + "loss": 1.4017, + "step": 11490 + }, + { + "epoch": 1.9824031743293367, + "grad_norm": 0.62890625, + "learning_rate": 3.778531138350916e-09, + "loss": 1.4678, + "step": 11491 + }, + { + "epoch": 1.9825756922280686, + "grad_norm": 0.62109375, + "learning_rate": 3.7040837441038035e-09, + "loss": 1.4091, + "step": 11492 + }, + { + "epoch": 1.9827482101268008, + "grad_norm": 0.56640625, + "learning_rate": 3.630376937917568e-09, + "loss": 1.4825, + "step": 11493 + }, + { + "epoch": 1.9829207280255328, + "grad_norm": 0.78515625, + "learning_rate": 3.5574107252533963e-09, + "loss": 1.3433, + "step": 11494 + }, + { + "epoch": 1.9830932459242647, + "grad_norm": 0.59765625, + "learning_rate": 3.4851851115180745e-09, + "loss": 1.4208, + "step": 11495 + }, + { + "epoch": 1.9832657638229967, + "grad_norm": 0.57421875, + "learning_rate": 3.4137001020595473e-09, + "loss": 1.5352, + "step": 11496 + }, + { + "epoch": 1.9834382817217286, + "grad_norm": 0.65234375, + "learning_rate": 3.3429557021769087e-09, + "loss": 1.3472, + "step": 11497 + }, + { + "epoch": 1.9836107996204606, + "grad_norm": 0.578125, + "learning_rate": 3.2729519171093018e-09, + "loss": 1.4102, + "step": 11498 + }, + { + "epoch": 1.9837833175191926, + "grad_norm": 0.578125, + "learning_rate": 3.203688752044798e-09, + "loss": 1.4648, + "step": 11499 + }, + { + "epoch": 1.9839558354179245, + "grad_norm": 0.66015625, + "learning_rate": 3.1351662121137384e-09, + "loss": 1.2301, + "step": 11500 + }, + { + "epoch": 1.9839558354179245, + "eval_loss": 1.4070264101028442, + "eval_runtime": 10.8116, + "eval_samples_per_second": 94.713, + "eval_steps_per_second": 23.678, + "step": 11500 + }, + { + "epoch": 1.9841283533166565, + "grad_norm": 0.5546875, + "learning_rate": 3.0673843023920623e-09, + "loss": 1.3121, + "step": 11501 + }, + { + "epoch": 1.9843008712153885, + "grad_norm": 0.5703125, + "learning_rate": 3.0003430279024193e-09, + "loss": 1.3848, + "step": 11502 + }, + { + "epoch": 1.9844733891141206, + "grad_norm": 0.53125, + "learning_rate": 2.9340423936119466e-09, + "loss": 1.4146, + "step": 11503 + }, + { + "epoch": 1.9846459070128526, + "grad_norm": 0.5703125, + "learning_rate": 2.868482404432271e-09, + "loss": 1.4652, + "step": 11504 + }, + { + "epoch": 1.9848184249115846, + "grad_norm": 0.5703125, + "learning_rate": 2.8036630652206187e-09, + "loss": 1.4103, + "step": 11505 + }, + { + "epoch": 1.9849909428103165, + "grad_norm": 0.63671875, + "learning_rate": 2.7395843807775934e-09, + "loss": 1.3924, + "step": 11506 + }, + { + "epoch": 1.9851634607090487, + "grad_norm": 0.56640625, + "learning_rate": 2.67624635585384e-09, + "loss": 1.4179, + "step": 11507 + }, + { + "epoch": 1.9853359786077807, + "grad_norm": 0.609375, + "learning_rate": 2.6136489951378295e-09, + "loss": 1.3898, + "step": 11508 + }, + { + "epoch": 1.9855084965065126, + "grad_norm": 0.5859375, + "learning_rate": 2.5517923032714053e-09, + "loss": 1.457, + "step": 11509 + }, + { + "epoch": 1.9856810144052446, + "grad_norm": 0.6484375, + "learning_rate": 2.490676284833127e-09, + "loss": 1.3667, + "step": 11510 + }, + { + "epoch": 1.9858535323039765, + "grad_norm": 0.59765625, + "learning_rate": 2.430300944353814e-09, + "loss": 1.4318, + "step": 11511 + }, + { + "epoch": 1.9860260502027085, + "grad_norm": 0.58984375, + "learning_rate": 2.3706662863054452e-09, + "loss": 1.3655, + "step": 11512 + }, + { + "epoch": 1.9861985681014405, + "grad_norm": 0.60546875, + "learning_rate": 2.311772315106708e-09, + "loss": 1.4294, + "step": 11513 + }, + { + "epoch": 1.9863710860001724, + "grad_norm": 0.60546875, + "learning_rate": 2.2536190351196697e-09, + "loss": 1.4547, + "step": 11514 + }, + { + "epoch": 1.9865436038989044, + "grad_norm": 0.59375, + "learning_rate": 2.1962064506542146e-09, + "loss": 1.4139, + "step": 11515 + }, + { + "epoch": 1.9867161217976363, + "grad_norm": 0.578125, + "learning_rate": 2.1395345659613877e-09, + "loss": 1.4734, + "step": 11516 + }, + { + "epoch": 1.9868886396963685, + "grad_norm": 0.63671875, + "learning_rate": 2.0836033852422723e-09, + "loss": 1.3815, + "step": 11517 + }, + { + "epoch": 1.9870611575951005, + "grad_norm": 0.58203125, + "learning_rate": 2.0284129126402207e-09, + "loss": 1.4186, + "step": 11518 + }, + { + "epoch": 1.9872336754938325, + "grad_norm": 0.5859375, + "learning_rate": 1.9739631522430746e-09, + "loss": 1.3891, + "step": 11519 + }, + { + "epoch": 1.9874061933925646, + "grad_norm": 0.5859375, + "learning_rate": 1.9202541080853844e-09, + "loss": 1.489, + "step": 11520 + }, + { + "epoch": 1.9875787112912966, + "grad_norm": 0.5625, + "learning_rate": 1.867285784146189e-09, + "loss": 1.4045, + "step": 11521 + }, + { + "epoch": 1.9877512291900286, + "grad_norm": 0.6171875, + "learning_rate": 1.8150581843490167e-09, + "loss": 1.3625, + "step": 11522 + }, + { + "epoch": 1.9879237470887605, + "grad_norm": 0.55078125, + "learning_rate": 1.7635713125641052e-09, + "loss": 1.473, + "step": 11523 + }, + { + "epoch": 1.9880962649874925, + "grad_norm": 0.8515625, + "learning_rate": 1.7128251726061805e-09, + "loss": 1.4697, + "step": 11524 + }, + { + "epoch": 1.9882687828862244, + "grad_norm": 0.6171875, + "learning_rate": 1.6628197682344583e-09, + "loss": 1.5217, + "step": 11525 + }, + { + "epoch": 1.9884413007849564, + "grad_norm": 0.59765625, + "learning_rate": 1.613555103152642e-09, + "loss": 1.5008, + "step": 11526 + }, + { + "epoch": 1.9886138186836884, + "grad_norm": 0.59375, + "learning_rate": 1.5650311810122555e-09, + "loss": 1.3161, + "step": 11527 + }, + { + "epoch": 1.9887863365824203, + "grad_norm": 0.5859375, + "learning_rate": 1.5172480054070903e-09, + "loss": 1.4783, + "step": 11528 + }, + { + "epoch": 1.9889588544811523, + "grad_norm": 0.5703125, + "learning_rate": 1.4702055798776482e-09, + "loss": 1.5052, + "step": 11529 + }, + { + "epoch": 1.9891313723798845, + "grad_norm": 0.59765625, + "learning_rate": 1.423903907908919e-09, + "loss": 1.4766, + "step": 11530 + }, + { + "epoch": 1.9893038902786164, + "grad_norm": 0.5546875, + "learning_rate": 1.3783429929314918e-09, + "loss": 1.3936, + "step": 11531 + }, + { + "epoch": 1.9894764081773484, + "grad_norm": 0.609375, + "learning_rate": 1.3335228383215548e-09, + "loss": 1.4857, + "step": 11532 + }, + { + "epoch": 1.9896489260760803, + "grad_norm": 0.6171875, + "learning_rate": 1.2894434473975648e-09, + "loss": 1.435, + "step": 11533 + }, + { + "epoch": 1.9898214439748125, + "grad_norm": 0.5859375, + "learning_rate": 1.246104823426908e-09, + "loss": 1.4502, + "step": 11534 + }, + { + "epoch": 1.9899939618735445, + "grad_norm": 0.71875, + "learning_rate": 1.2035069696203494e-09, + "loss": 1.3413, + "step": 11535 + }, + { + "epoch": 1.9901664797722765, + "grad_norm": 0.61328125, + "learning_rate": 1.161649889133143e-09, + "loss": 1.4472, + "step": 11536 + }, + { + "epoch": 1.9903389976710084, + "grad_norm": 0.5625, + "learning_rate": 1.1205335850661414e-09, + "loss": 1.5178, + "step": 11537 + }, + { + "epoch": 1.9905115155697404, + "grad_norm": 0.60546875, + "learning_rate": 1.080158060465797e-09, + "loss": 1.3904, + "step": 11538 + }, + { + "epoch": 1.9906840334684723, + "grad_norm": 0.61328125, + "learning_rate": 1.0405233183241604e-09, + "loss": 1.4666, + "step": 11539 + }, + { + "epoch": 1.9908565513672043, + "grad_norm": 0.57421875, + "learning_rate": 1.0016293615766615e-09, + "loss": 1.4624, + "step": 11540 + }, + { + "epoch": 1.9910290692659363, + "grad_norm": 0.6484375, + "learning_rate": 9.63476193104329e-10, + "loss": 1.3808, + "step": 11541 + }, + { + "epoch": 1.9912015871646682, + "grad_norm": 0.609375, + "learning_rate": 9.260638157360113e-10, + "loss": 1.4517, + "step": 11542 + }, + { + "epoch": 1.9913741050634002, + "grad_norm": 0.578125, + "learning_rate": 8.893922322406046e-10, + "loss": 1.4662, + "step": 11543 + }, + { + "epoch": 1.9915466229621324, + "grad_norm": 0.59375, + "learning_rate": 8.534614453370449e-10, + "loss": 1.4781, + "step": 11544 + }, + { + "epoch": 1.9917191408608643, + "grad_norm": 0.5859375, + "learning_rate": 8.182714576865369e-10, + "loss": 1.3584, + "step": 11545 + }, + { + "epoch": 1.9918916587595963, + "grad_norm": 0.6484375, + "learning_rate": 7.838222718958844e-10, + "loss": 1.4533, + "step": 11546 + }, + { + "epoch": 1.9920641766583285, + "grad_norm": 0.5625, + "learning_rate": 7.501138905186e-10, + "loss": 1.4187, + "step": 11547 + }, + { + "epoch": 1.9922366945570604, + "grad_norm": 0.56640625, + "learning_rate": 7.171463160504655e-10, + "loss": 1.5579, + "step": 11548 + }, + { + "epoch": 1.9924092124557924, + "grad_norm": 0.609375, + "learning_rate": 6.849195509339712e-10, + "loss": 1.5175, + "step": 11549 + }, + { + "epoch": 1.9925817303545243, + "grad_norm": 0.6015625, + "learning_rate": 6.534335975583173e-10, + "loss": 1.3491, + "step": 11550 + }, + { + "epoch": 1.9927542482532563, + "grad_norm": 0.55859375, + "learning_rate": 6.226884582538618e-10, + "loss": 1.3489, + "step": 11551 + }, + { + "epoch": 1.9929267661519883, + "grad_norm": 0.5703125, + "learning_rate": 5.926841353010026e-10, + "loss": 1.4231, + "step": 11552 + }, + { + "epoch": 1.9930992840507202, + "grad_norm": 0.65625, + "learning_rate": 5.634206309201861e-10, + "loss": 1.4619, + "step": 11553 + }, + { + "epoch": 1.9932718019494522, + "grad_norm": 0.56640625, + "learning_rate": 5.34897947280788e-10, + "loss": 1.4616, + "step": 11554 + }, + { + "epoch": 1.9934443198481842, + "grad_norm": 0.60546875, + "learning_rate": 5.071160864966729e-10, + "loss": 1.3633, + "step": 11555 + }, + { + "epoch": 1.9936168377469161, + "grad_norm": 0.65625, + "learning_rate": 4.800750506239737e-10, + "loss": 1.4285, + "step": 11556 + }, + { + "epoch": 1.993789355645648, + "grad_norm": 0.58203125, + "learning_rate": 4.537748416677534e-10, + "loss": 1.432, + "step": 11557 + }, + { + "epoch": 1.9939618735443803, + "grad_norm": 0.609375, + "learning_rate": 4.2821546157534313e-10, + "loss": 1.3494, + "step": 11558 + }, + { + "epoch": 1.9941343914431122, + "grad_norm": 0.5703125, + "learning_rate": 4.0339691224189346e-10, + "loss": 1.457, + "step": 11559 + }, + { + "epoch": 1.9943069093418442, + "grad_norm": 0.59375, + "learning_rate": 3.7931919550482364e-10, + "loss": 1.4494, + "step": 11560 + }, + { + "epoch": 1.9944794272405764, + "grad_norm": 0.6484375, + "learning_rate": 3.559823131471518e-10, + "loss": 1.345, + "step": 11561 + }, + { + "epoch": 1.9946519451393083, + "grad_norm": 0.5546875, + "learning_rate": 3.3338626690082587e-10, + "loss": 1.4648, + "step": 11562 + }, + { + "epoch": 1.9948244630380403, + "grad_norm": 0.56640625, + "learning_rate": 3.115310584367315e-10, + "loss": 1.3445, + "step": 11563 + }, + { + "epoch": 1.9949969809367722, + "grad_norm": 0.59375, + "learning_rate": 2.9041668937579426e-10, + "loss": 1.3534, + "step": 11564 + }, + { + "epoch": 1.9951694988355042, + "grad_norm": 0.61328125, + "learning_rate": 2.7004316128231844e-10, + "loss": 1.428, + "step": 11565 + }, + { + "epoch": 1.9953420167342362, + "grad_norm": 0.5859375, + "learning_rate": 2.504104756639869e-10, + "loss": 1.4076, + "step": 11566 + }, + { + "epoch": 1.9955145346329681, + "grad_norm": 0.609375, + "learning_rate": 2.3151863397741225e-10, + "loss": 1.406, + "step": 11567 + }, + { + "epoch": 1.9956870525317, + "grad_norm": 0.56640625, + "learning_rate": 2.133676376214755e-10, + "loss": 1.4402, + "step": 11568 + }, + { + "epoch": 1.995859570430432, + "grad_norm": 0.5859375, + "learning_rate": 1.9595748794065673e-10, + "loss": 1.5273, + "step": 11569 + }, + { + "epoch": 1.996032088329164, + "grad_norm": 0.6015625, + "learning_rate": 1.792881862250351e-10, + "loss": 1.457, + "step": 11570 + }, + { + "epoch": 1.9962046062278962, + "grad_norm": 0.5390625, + "learning_rate": 1.6335973370917858e-10, + "loss": 1.3983, + "step": 11571 + }, + { + "epoch": 1.9963771241266282, + "grad_norm": 0.57421875, + "learning_rate": 1.481721315743645e-10, + "loss": 1.4884, + "step": 11572 + }, + { + "epoch": 1.9965496420253601, + "grad_norm": 0.58203125, + "learning_rate": 1.3372538094413856e-10, + "loss": 1.4246, + "step": 11573 + }, + { + "epoch": 1.996722159924092, + "grad_norm": 0.609375, + "learning_rate": 1.2001948288986598e-10, + "loss": 1.4212, + "step": 11574 + }, + { + "epoch": 1.9968946778228243, + "grad_norm": 0.5703125, + "learning_rate": 1.070544384262906e-10, + "loss": 1.3712, + "step": 11575 + }, + { + "epoch": 1.9970671957215562, + "grad_norm": 0.59375, + "learning_rate": 9.483024851486556e-11, + "loss": 1.3957, + "step": 11576 + }, + { + "epoch": 1.9972397136202882, + "grad_norm": 0.56640625, + "learning_rate": 8.334691406042261e-11, + "loss": 1.4156, + "step": 11577 + }, + { + "epoch": 1.9974122315190201, + "grad_norm": 0.53515625, + "learning_rate": 7.260443591450283e-11, + "loss": 1.4135, + "step": 11578 + }, + { + "epoch": 1.997584749417752, + "grad_norm": 0.578125, + "learning_rate": 6.26028148720259e-11, + "loss": 1.3832, + "step": 11579 + }, + { + "epoch": 1.997757267316484, + "grad_norm": 0.59765625, + "learning_rate": 5.334205167462081e-11, + "loss": 1.4101, + "step": 11580 + }, + { + "epoch": 1.997929785215216, + "grad_norm": 0.5625, + "learning_rate": 4.482214700729515e-11, + "loss": 1.394, + "step": 11581 + }, + { + "epoch": 1.998102303113948, + "grad_norm": 0.546875, + "learning_rate": 3.704310150287604e-11, + "loss": 1.4096, + "step": 11582 + }, + { + "epoch": 1.99827482101268, + "grad_norm": 0.57421875, + "learning_rate": 3.000491573756925e-11, + "loss": 1.4367, + "step": 11583 + }, + { + "epoch": 1.998447338911412, + "grad_norm": 0.609375, + "learning_rate": 2.3707590230959144e-11, + "loss": 1.4118, + "step": 11584 + }, + { + "epoch": 1.998619856810144, + "grad_norm": 0.57421875, + "learning_rate": 1.8151125451559838e-11, + "loss": 1.3415, + "step": 11585 + }, + { + "epoch": 1.998792374708876, + "grad_norm": 0.58984375, + "learning_rate": 1.3335521810153851e-11, + "loss": 1.4252, + "step": 11586 + }, + { + "epoch": 1.998964892607608, + "grad_norm": 0.58203125, + "learning_rate": 9.260779664233e-12, + "loss": 1.4393, + "step": 11587 + }, + { + "epoch": 1.9991374105063402, + "grad_norm": 0.56640625, + "learning_rate": 5.926899314667723e-12, + "loss": 1.4364, + "step": 11588 + }, + { + "epoch": 1.9993099284050722, + "grad_norm": 0.5625, + "learning_rate": 3.3338810079275307e-12, + "loss": 1.4977, + "step": 11589 + }, + { + "epoch": 1.9994824463038041, + "grad_norm": 0.55859375, + "learning_rate": 1.4817249383014543e-12, + "loss": 1.3664, + "step": 11590 + }, + { + "epoch": 1.999654964202536, + "grad_norm": 0.56640625, + "learning_rate": 3.704312412367017e-13, + "loss": 1.3706, + "step": 11591 + }, + { + "epoch": 1.999827482101268, + "grad_norm": 0.55078125, + "learning_rate": 0.0, + "loss": 1.4666, + "step": 11592 + } + ], + "logging_steps": 1, + "max_steps": 11592, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.613498406381342e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}