{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 475, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005263157894736842, "grad_norm": 1.3774428367614746, "learning_rate": 7.5e-07, "loss": 0.7461, "step": 1 }, { "epoch": 0.0005263157894736842, "eval_loss": 1.3115713596343994, "eval_runtime": 13.3879, "eval_samples_per_second": 7.469, "eval_steps_per_second": 7.469, "step": 1 }, { "epoch": 0.0010526315789473684, "grad_norm": 0.40974903106689453, "learning_rate": 1.5e-06, "loss": 1.2756, "step": 2 }, { "epoch": 0.0015789473684210526, "grad_norm": 0.5177225470542908, "learning_rate": 2.25e-06, "loss": 1.7994, "step": 3 }, { "epoch": 0.002105263157894737, "grad_norm": 0.5397021174430847, "learning_rate": 3e-06, "loss": 1.3129, "step": 4 }, { "epoch": 0.002631578947368421, "grad_norm": 0.37644293904304504, "learning_rate": 3.75e-06, "loss": 1.2963, "step": 5 }, { "epoch": 0.003157894736842105, "grad_norm": 0.6736767292022705, "learning_rate": 4.5e-06, "loss": 1.0839, "step": 6 }, { "epoch": 0.0036842105263157894, "grad_norm": 0.9168409109115601, "learning_rate": 5.25e-06, "loss": 1.2321, "step": 7 }, { "epoch": 0.004210526315789474, "grad_norm": 0.5211862325668335, "learning_rate": 6e-06, "loss": 1.567, "step": 8 }, { "epoch": 0.004736842105263158, "grad_norm": 0.6043412685394287, "learning_rate": 6.750000000000001e-06, "loss": 1.236, "step": 9 }, { "epoch": 0.005263157894736842, "grad_norm": 0.9153957366943359, "learning_rate": 7.5e-06, "loss": 1.9274, "step": 10 }, { "epoch": 0.005789473684210527, "grad_norm": 0.3442760407924652, "learning_rate": 8.25e-06, "loss": 1.7819, "step": 11 }, { "epoch": 0.00631578947368421, "grad_norm": 3.2623486518859863, "learning_rate": 9e-06, "loss": 3.5503, "step": 12 }, { "epoch": 0.006842105263157895, "grad_norm": 0.5294060111045837, "learning_rate": 9.75e-06, "loss": 1.0113, "step": 13 }, { "epoch": 0.007368421052631579, "grad_norm": 0.7050935626029968, "learning_rate": 1.05e-05, "loss": 0.8552, "step": 14 }, { "epoch": 0.007894736842105263, "grad_norm": 0.3988778591156006, "learning_rate": 1.125e-05, "loss": 1.2478, "step": 15 }, { "epoch": 0.008421052631578947, "grad_norm": 1.1330819129943848, "learning_rate": 1.2e-05, "loss": 0.5481, "step": 16 }, { "epoch": 0.008947368421052631, "grad_norm": 0.3928195834159851, "learning_rate": 1.275e-05, "loss": 1.2705, "step": 17 }, { "epoch": 0.009473684210526316, "grad_norm": 0.5050056576728821, "learning_rate": 1.3500000000000001e-05, "loss": 1.0001, "step": 18 }, { "epoch": 0.01, "grad_norm": 0.40563464164733887, "learning_rate": 1.4249999999999999e-05, "loss": 1.3443, "step": 19 }, { "epoch": 0.010526315789473684, "grad_norm": 0.802213728427887, "learning_rate": 1.5e-05, "loss": 1.6883, "step": 20 }, { "epoch": 0.011052631578947368, "grad_norm": 0.6669230461120605, "learning_rate": 1.575e-05, "loss": 1.8126, "step": 21 }, { "epoch": 0.011578947368421053, "grad_norm": 0.7659299373626709, "learning_rate": 1.65e-05, "loss": 0.892, "step": 22 }, { "epoch": 0.012105263157894737, "grad_norm": 0.42965367436408997, "learning_rate": 1.725e-05, "loss": 1.5116, "step": 23 }, { "epoch": 0.01263157894736842, "grad_norm": 0.7102161049842834, "learning_rate": 1.8e-05, "loss": 0.6337, "step": 24 }, { "epoch": 0.013157894736842105, "grad_norm": 1.3091274499893188, "learning_rate": 1.8750000000000002e-05, "loss": 1.0829, "step": 25 }, { "epoch": 0.01368421052631579, "grad_norm": 4.387722969055176, "learning_rate": 1.95e-05, "loss": 0.7469, "step": 26 }, { "epoch": 0.014210526315789474, "grad_norm": 1.5182483196258545, "learning_rate": 2.025e-05, "loss": 0.7235, "step": 27 }, { "epoch": 0.014736842105263158, "grad_norm": 4.085071563720703, "learning_rate": 2.1e-05, "loss": 2.2212, "step": 28 }, { "epoch": 0.015263157894736841, "grad_norm": 0.7777957916259766, "learning_rate": 2.175e-05, "loss": 1.114, "step": 29 }, { "epoch": 0.015789473684210527, "grad_norm": 2.115023136138916, "learning_rate": 2.25e-05, "loss": 0.3802, "step": 30 }, { "epoch": 0.01631578947368421, "grad_norm": 6.118307113647461, "learning_rate": 2.3250000000000003e-05, "loss": 1.2828, "step": 31 }, { "epoch": 0.016842105263157894, "grad_norm": 1.7932595014572144, "learning_rate": 2.4e-05, "loss": 1.1272, "step": 32 }, { "epoch": 0.017368421052631578, "grad_norm": 1.0127062797546387, "learning_rate": 2.475e-05, "loss": 1.3309, "step": 33 }, { "epoch": 0.017894736842105262, "grad_norm": 0.7217763066291809, "learning_rate": 2.55e-05, "loss": 1.2299, "step": 34 }, { "epoch": 0.018421052631578946, "grad_norm": 4.896430492401123, "learning_rate": 2.625e-05, "loss": 0.8715, "step": 35 }, { "epoch": 0.018947368421052633, "grad_norm": 1.978946328163147, "learning_rate": 2.7000000000000002e-05, "loss": 1.078, "step": 36 }, { "epoch": 0.019473684210526317, "grad_norm": 4.97585916519165, "learning_rate": 2.7750000000000004e-05, "loss": 1.4184, "step": 37 }, { "epoch": 0.02, "grad_norm": 2.3499672412872314, "learning_rate": 2.8499999999999998e-05, "loss": 1.373, "step": 38 }, { "epoch": 0.020526315789473684, "grad_norm": 3.067195415496826, "learning_rate": 2.925e-05, "loss": 0.4262, "step": 39 }, { "epoch": 0.021052631578947368, "grad_norm": 11.364575386047363, "learning_rate": 3e-05, "loss": 1.8748, "step": 40 }, { "epoch": 0.02157894736842105, "grad_norm": 1.4466383457183838, "learning_rate": 2.999999979408673e-05, "loss": 1.0143, "step": 41 }, { "epoch": 0.022105263157894735, "grad_norm": 1.8457032442092896, "learning_rate": 2.9999999176346915e-05, "loss": 1.176, "step": 42 }, { "epoch": 0.022631578947368423, "grad_norm": 1.4849117994308472, "learning_rate": 2.9999998146780576e-05, "loss": 1.1838, "step": 43 }, { "epoch": 0.023157894736842106, "grad_norm": 1.6621780395507812, "learning_rate": 2.9999996705387744e-05, "loss": 1.0695, "step": 44 }, { "epoch": 0.02368421052631579, "grad_norm": 2.543567180633545, "learning_rate": 2.9999994852168458e-05, "loss": 1.2559, "step": 45 }, { "epoch": 0.024210526315789474, "grad_norm": 2.46481990814209, "learning_rate": 2.999999258712277e-05, "loss": 1.2556, "step": 46 }, { "epoch": 0.024736842105263158, "grad_norm": 3.249171733856201, "learning_rate": 2.999998991025073e-05, "loss": 0.1304, "step": 47 }, { "epoch": 0.02526315789473684, "grad_norm": 8.809581756591797, "learning_rate": 2.9999986821552427e-05, "loss": 3.5048, "step": 48 }, { "epoch": 0.025789473684210525, "grad_norm": 2.1939406394958496, "learning_rate": 2.999998332102794e-05, "loss": 1.024, "step": 49 }, { "epoch": 0.02631578947368421, "grad_norm": 5.13323450088501, "learning_rate": 2.9999979408677368e-05, "loss": 0.7764, "step": 50 }, { "epoch": 0.026842105263157896, "grad_norm": 1.4928818941116333, "learning_rate": 2.999997508450081e-05, "loss": 1.3143, "step": 51 }, { "epoch": 0.02736842105263158, "grad_norm": 5.20947265625, "learning_rate": 2.999997034849839e-05, "loss": 2.3539, "step": 52 }, { "epoch": 0.027894736842105264, "grad_norm": 1.5518254041671753, "learning_rate": 2.999996520067024e-05, "loss": 1.0206, "step": 53 }, { "epoch": 0.028421052631578948, "grad_norm": 1.5228031873703003, "learning_rate": 2.9999959641016498e-05, "loss": 1.1746, "step": 54 }, { "epoch": 0.02894736842105263, "grad_norm": 4.204188823699951, "learning_rate": 2.999995366953732e-05, "loss": 1.6748, "step": 55 }, { "epoch": 0.029473684210526315, "grad_norm": 12.567036628723145, "learning_rate": 2.999994728623287e-05, "loss": 1.6706, "step": 56 }, { "epoch": 0.03, "grad_norm": 2.302267074584961, "learning_rate": 2.9999940491103316e-05, "loss": 1.2736, "step": 57 }, { "epoch": 0.030526315789473683, "grad_norm": 2.4936375617980957, "learning_rate": 2.999993328414885e-05, "loss": 1.1823, "step": 58 }, { "epoch": 0.03105263157894737, "grad_norm": 2.828848361968994, "learning_rate": 2.9999925665369675e-05, "loss": 2.1706, "step": 59 }, { "epoch": 0.031578947368421054, "grad_norm": 3.864727258682251, "learning_rate": 2.999991763476599e-05, "loss": 1.4049, "step": 60 }, { "epoch": 0.032105263157894734, "grad_norm": 2.246717691421509, "learning_rate": 2.9999909192338023e-05, "loss": 1.7979, "step": 61 }, { "epoch": 0.03263157894736842, "grad_norm": 2.686671257019043, "learning_rate": 2.9999900338086e-05, "loss": 1.2399, "step": 62 }, { "epoch": 0.03315789473684211, "grad_norm": 4.649468898773193, "learning_rate": 2.9999891072010173e-05, "loss": 1.6921, "step": 63 }, { "epoch": 0.03368421052631579, "grad_norm": 6.839999675750732, "learning_rate": 2.9999881394110785e-05, "loss": 1.2485, "step": 64 }, { "epoch": 0.034210526315789476, "grad_norm": 2.245439052581787, "learning_rate": 2.9999871304388115e-05, "loss": 1.8246, "step": 65 }, { "epoch": 0.034736842105263156, "grad_norm": 3.225515127182007, "learning_rate": 2.9999860802842423e-05, "loss": 1.4347, "step": 66 }, { "epoch": 0.035263157894736843, "grad_norm": 5.565622806549072, "learning_rate": 2.9999849889474012e-05, "loss": 1.2189, "step": 67 }, { "epoch": 0.035789473684210524, "grad_norm": 10.95042896270752, "learning_rate": 2.9999838564283172e-05, "loss": 0.4937, "step": 68 }, { "epoch": 0.03631578947368421, "grad_norm": 2.187350034713745, "learning_rate": 2.9999826827270223e-05, "loss": 1.4298, "step": 69 }, { "epoch": 0.03684210526315789, "grad_norm": 2.14497447013855, "learning_rate": 2.999981467843548e-05, "loss": 1.1944, "step": 70 }, { "epoch": 0.03736842105263158, "grad_norm": 1.1492475271224976, "learning_rate": 2.9999802117779277e-05, "loss": 0.6928, "step": 71 }, { "epoch": 0.037894736842105266, "grad_norm": 4.218234062194824, "learning_rate": 2.9999789145301967e-05, "loss": 0.0767, "step": 72 }, { "epoch": 0.038421052631578946, "grad_norm": 7.308555603027344, "learning_rate": 2.9999775761003895e-05, "loss": 1.2755, "step": 73 }, { "epoch": 0.03894736842105263, "grad_norm": 2.922872543334961, "learning_rate": 2.9999761964885436e-05, "loss": 1.1238, "step": 74 }, { "epoch": 0.039473684210526314, "grad_norm": 2.961275339126587, "learning_rate": 2.9999747756946967e-05, "loss": 1.0357, "step": 75 }, { "epoch": 0.04, "grad_norm": 12.174813270568848, "learning_rate": 2.9999733137188872e-05, "loss": 0.7883, "step": 76 }, { "epoch": 0.04052631578947368, "grad_norm": 2.3447883129119873, "learning_rate": 2.9999718105611564e-05, "loss": 3.1787, "step": 77 }, { "epoch": 0.04105263157894737, "grad_norm": 1.4516093730926514, "learning_rate": 2.9999702662215446e-05, "loss": 1.1768, "step": 78 }, { "epoch": 0.041578947368421056, "grad_norm": 2.680931568145752, "learning_rate": 2.9999686807000945e-05, "loss": 1.6191, "step": 79 }, { "epoch": 0.042105263157894736, "grad_norm": 2.4612650871276855, "learning_rate": 2.99996705399685e-05, "loss": 0.9811, "step": 80 }, { "epoch": 0.04263157894736842, "grad_norm": 10.88182258605957, "learning_rate": 2.999965386111855e-05, "loss": 1.5144, "step": 81 }, { "epoch": 0.0431578947368421, "grad_norm": 1.9869924783706665, "learning_rate": 2.9999636770451562e-05, "loss": 0.7623, "step": 82 }, { "epoch": 0.04368421052631579, "grad_norm": 25.559877395629883, "learning_rate": 2.9999619267968e-05, "loss": 0.4775, "step": 83 }, { "epoch": 0.04421052631578947, "grad_norm": 2.2357089519500732, "learning_rate": 2.9999601353668344e-05, "loss": 0.9626, "step": 84 }, { "epoch": 0.04473684210526316, "grad_norm": 2.5922367572784424, "learning_rate": 2.9999583027553084e-05, "loss": 1.161, "step": 85 }, { "epoch": 0.045263157894736845, "grad_norm": 3.0082902908325195, "learning_rate": 2.999956428962273e-05, "loss": 0.6429, "step": 86 }, { "epoch": 0.045789473684210526, "grad_norm": 1.8411225080490112, "learning_rate": 2.9999545139877787e-05, "loss": 1.2799, "step": 87 }, { "epoch": 0.04631578947368421, "grad_norm": 61.66618347167969, "learning_rate": 2.999952557831879e-05, "loss": 3.875, "step": 88 }, { "epoch": 0.04684210526315789, "grad_norm": 1.83926522731781, "learning_rate": 2.9999505604946272e-05, "loss": 1.1743, "step": 89 }, { "epoch": 0.04736842105263158, "grad_norm": 3.020488977432251, "learning_rate": 2.9999485219760786e-05, "loss": 2.1196, "step": 90 }, { "epoch": 0.04789473684210526, "grad_norm": 3.429885149002075, "learning_rate": 2.999946442276288e-05, "loss": 2.4292, "step": 91 }, { "epoch": 0.04842105263157895, "grad_norm": 2.8225109577178955, "learning_rate": 2.9999443213953137e-05, "loss": 1.2811, "step": 92 }, { "epoch": 0.04894736842105263, "grad_norm": 3.0931406021118164, "learning_rate": 2.9999421593332133e-05, "loss": 0.8613, "step": 93 }, { "epoch": 0.049473684210526316, "grad_norm": 1.8076380491256714, "learning_rate": 2.999939956090046e-05, "loss": 0.9606, "step": 94 }, { "epoch": 0.05, "grad_norm": 4.066877365112305, "learning_rate": 2.999937711665873e-05, "loss": 1.0512, "step": 95 }, { "epoch": 0.05052631578947368, "grad_norm": 1.641595482826233, "learning_rate": 2.9999354260607556e-05, "loss": 0.7993, "step": 96 }, { "epoch": 0.05105263157894737, "grad_norm": 9.759380340576172, "learning_rate": 2.9999330992747566e-05, "loss": 0.7986, "step": 97 }, { "epoch": 0.05157894736842105, "grad_norm": 1.395878791809082, "learning_rate": 2.999930731307939e-05, "loss": 1.129, "step": 98 }, { "epoch": 0.05210526315789474, "grad_norm": 5.108509540557861, "learning_rate": 2.9999283221603697e-05, "loss": 0.0843, "step": 99 }, { "epoch": 0.05263157894736842, "grad_norm": 4.187808513641357, "learning_rate": 2.999925871832113e-05, "loss": 1.9898, "step": 100 }, { "epoch": 0.053157894736842105, "grad_norm": 2.7410473823547363, "learning_rate": 2.9999233803232368e-05, "loss": 1.4947, "step": 101 }, { "epoch": 0.05368421052631579, "grad_norm": 3.0927412509918213, "learning_rate": 2.99992084763381e-05, "loss": 0.5582, "step": 102 }, { "epoch": 0.05421052631578947, "grad_norm": 28.332054138183594, "learning_rate": 2.9999182737639015e-05, "loss": 2.1906, "step": 103 }, { "epoch": 0.05473684210526316, "grad_norm": 1.7883048057556152, "learning_rate": 2.9999156587135824e-05, "loss": 1.2287, "step": 104 }, { "epoch": 0.05526315789473684, "grad_norm": 3.060166597366333, "learning_rate": 2.999913002482924e-05, "loss": 1.476, "step": 105 }, { "epoch": 0.05578947368421053, "grad_norm": 1.8946037292480469, "learning_rate": 2.9999103050719998e-05, "loss": 0.8644, "step": 106 }, { "epoch": 0.05631578947368421, "grad_norm": 2.7290570735931396, "learning_rate": 2.9999075664808832e-05, "loss": 0.1352, "step": 107 }, { "epoch": 0.056842105263157895, "grad_norm": 6.147144794464111, "learning_rate": 2.9999047867096502e-05, "loss": 0.2661, "step": 108 }, { "epoch": 0.057368421052631575, "grad_norm": 1.951554298400879, "learning_rate": 2.999901965758377e-05, "loss": 1.4944, "step": 109 }, { "epoch": 0.05789473684210526, "grad_norm": 2.1169261932373047, "learning_rate": 2.99989910362714e-05, "loss": 1.0947, "step": 110 }, { "epoch": 0.05842105263157895, "grad_norm": 1.7119637727737427, "learning_rate": 2.9998962003160186e-05, "loss": 1.2013, "step": 111 }, { "epoch": 0.05894736842105263, "grad_norm": 1.4529640674591064, "learning_rate": 2.9998932558250927e-05, "loss": 1.1557, "step": 112 }, { "epoch": 0.05947368421052632, "grad_norm": 2.9316415786743164, "learning_rate": 2.9998902701544427e-05, "loss": 1.3037, "step": 113 }, { "epoch": 0.06, "grad_norm": 10.239468574523926, "learning_rate": 2.999887243304151e-05, "loss": 1.135, "step": 114 }, { "epoch": 0.060526315789473685, "grad_norm": 2.1971309185028076, "learning_rate": 2.9998841752743002e-05, "loss": 1.227, "step": 115 }, { "epoch": 0.061052631578947365, "grad_norm": 4.265197277069092, "learning_rate": 2.999881066064975e-05, "loss": 0.4307, "step": 116 }, { "epoch": 0.06157894736842105, "grad_norm": 9.106600761413574, "learning_rate": 2.9998779156762604e-05, "loss": 1.7782, "step": 117 }, { "epoch": 0.06210526315789474, "grad_norm": 5.628964424133301, "learning_rate": 2.9998747241082433e-05, "loss": 1.1438, "step": 118 }, { "epoch": 0.06263157894736843, "grad_norm": 33.68876647949219, "learning_rate": 2.9998714913610106e-05, "loss": 1.8069, "step": 119 }, { "epoch": 0.06315789473684211, "grad_norm": 2.011239767074585, "learning_rate": 2.9998682174346518e-05, "loss": 1.168, "step": 120 }, { "epoch": 0.06368421052631579, "grad_norm": 2.534121036529541, "learning_rate": 2.9998649023292564e-05, "loss": 1.0316, "step": 121 }, { "epoch": 0.06421052631578947, "grad_norm": 7.767317295074463, "learning_rate": 2.9998615460449155e-05, "loss": 2.0673, "step": 122 }, { "epoch": 0.06473684210526316, "grad_norm": 2.559743642807007, "learning_rate": 2.9998581485817213e-05, "loss": 1.1438, "step": 123 }, { "epoch": 0.06526315789473684, "grad_norm": 5.9045186042785645, "learning_rate": 2.9998547099397673e-05, "loss": 0.527, "step": 124 }, { "epoch": 0.06578947368421052, "grad_norm": 2.093289375305176, "learning_rate": 2.9998512301191472e-05, "loss": 1.4109, "step": 125 }, { "epoch": 0.06631578947368422, "grad_norm": 3.366138458251953, "learning_rate": 2.9998477091199575e-05, "loss": 1.536, "step": 126 }, { "epoch": 0.0668421052631579, "grad_norm": 13.098956108093262, "learning_rate": 2.9998441469422938e-05, "loss": 1.4469, "step": 127 }, { "epoch": 0.06736842105263158, "grad_norm": 1.7754340171813965, "learning_rate": 2.999840543586255e-05, "loss": 1.2916, "step": 128 }, { "epoch": 0.06789473684210526, "grad_norm": 2.7748563289642334, "learning_rate": 2.9998368990519393e-05, "loss": 1.0124, "step": 129 }, { "epoch": 0.06842105263157895, "grad_norm": 16.237028121948242, "learning_rate": 2.9998332133394467e-05, "loss": 0.7667, "step": 130 }, { "epoch": 0.06894736842105263, "grad_norm": 1.7698700428009033, "learning_rate": 2.9998294864488786e-05, "loss": 1.7484, "step": 131 }, { "epoch": 0.06947368421052631, "grad_norm": 2.1112704277038574, "learning_rate": 2.9998257183803378e-05, "loss": 1.3866, "step": 132 }, { "epoch": 0.07, "grad_norm": 1.5450336933135986, "learning_rate": 2.999821909133927e-05, "loss": 1.1476, "step": 133 }, { "epoch": 0.07052631578947369, "grad_norm": 2.787191390991211, "learning_rate": 2.9998180587097518e-05, "loss": 0.7392, "step": 134 }, { "epoch": 0.07105263157894737, "grad_norm": 7.418491840362549, "learning_rate": 2.999814167107916e-05, "loss": 1.0287, "step": 135 }, { "epoch": 0.07157894736842105, "grad_norm": 2.0604071617126465, "learning_rate": 2.9998102343285288e-05, "loss": 0.9736, "step": 136 }, { "epoch": 0.07210526315789474, "grad_norm": 3.1856720447540283, "learning_rate": 2.9998062603716966e-05, "loss": 1.1029, "step": 137 }, { "epoch": 0.07263157894736842, "grad_norm": 44.8514289855957, "learning_rate": 2.9998022452375286e-05, "loss": 2.1055, "step": 138 }, { "epoch": 0.0731578947368421, "grad_norm": 2.2738101482391357, "learning_rate": 2.999798188926136e-05, "loss": 1.135, "step": 139 }, { "epoch": 0.07368421052631578, "grad_norm": 4.192337512969971, "learning_rate": 2.9997940914376287e-05, "loss": 0.9468, "step": 140 }, { "epoch": 0.07421052631578948, "grad_norm": 1.970901370048523, "learning_rate": 2.9997899527721208e-05, "loss": 1.0999, "step": 141 }, { "epoch": 0.07473684210526316, "grad_norm": 7.665932655334473, "learning_rate": 2.999785772929725e-05, "loss": 0.8222, "step": 142 }, { "epoch": 0.07526315789473684, "grad_norm": 1.8983104228973389, "learning_rate": 2.9997815519105562e-05, "loss": 1.2967, "step": 143 }, { "epoch": 0.07578947368421053, "grad_norm": 1.8003042936325073, "learning_rate": 2.9997772897147302e-05, "loss": 1.4046, "step": 144 }, { "epoch": 0.07631578947368421, "grad_norm": 9.573161125183105, "learning_rate": 2.999772986342364e-05, "loss": 0.5591, "step": 145 }, { "epoch": 0.07684210526315789, "grad_norm": 4.940485954284668, "learning_rate": 2.9997686417935764e-05, "loss": 1.9492, "step": 146 }, { "epoch": 0.07736842105263157, "grad_norm": 3.1047379970550537, "learning_rate": 2.9997642560684854e-05, "loss": 1.631, "step": 147 }, { "epoch": 0.07789473684210527, "grad_norm": 2.6074681282043457, "learning_rate": 2.999759829167213e-05, "loss": 1.4144, "step": 148 }, { "epoch": 0.07842105263157895, "grad_norm": 6.4516448974609375, "learning_rate": 2.9997553610898793e-05, "loss": 0.7626, "step": 149 }, { "epoch": 0.07894736842105263, "grad_norm": 5.060396671295166, "learning_rate": 2.999750851836608e-05, "loss": 1.457, "step": 150 }, { "epoch": 0.07947368421052632, "grad_norm": 2.47971510887146, "learning_rate": 2.9997463014075222e-05, "loss": 1.3472, "step": 151 }, { "epoch": 0.08, "grad_norm": 2.023242473602295, "learning_rate": 2.999741709802747e-05, "loss": 1.045, "step": 152 }, { "epoch": 0.08052631578947368, "grad_norm": 2.3610341548919678, "learning_rate": 2.999737077022409e-05, "loss": 1.0414, "step": 153 }, { "epoch": 0.08105263157894736, "grad_norm": 6.400054931640625, "learning_rate": 2.9997324030666347e-05, "loss": 2.2875, "step": 154 }, { "epoch": 0.08157894736842106, "grad_norm": 3.4902946949005127, "learning_rate": 2.999727687935553e-05, "loss": 0.851, "step": 155 }, { "epoch": 0.08210526315789474, "grad_norm": 17.07858657836914, "learning_rate": 2.9997229316292928e-05, "loss": 1.6888, "step": 156 }, { "epoch": 0.08263157894736842, "grad_norm": 0.5993421077728271, "learning_rate": 2.999718134147985e-05, "loss": 0.0141, "step": 157 }, { "epoch": 0.08315789473684211, "grad_norm": 2.3327324390411377, "learning_rate": 2.9997132954917615e-05, "loss": 1.4322, "step": 158 }, { "epoch": 0.08368421052631579, "grad_norm": 2.2541940212249756, "learning_rate": 2.9997084156607543e-05, "loss": 1.0867, "step": 159 }, { "epoch": 0.08421052631578947, "grad_norm": 5.866251468658447, "learning_rate": 2.9997034946550984e-05, "loss": 1.4221, "step": 160 }, { "epoch": 0.08473684210526315, "grad_norm": 4.001148223876953, "learning_rate": 2.9996985324749288e-05, "loss": 1.6462, "step": 161 }, { "epoch": 0.08526315789473685, "grad_norm": 10.700677871704102, "learning_rate": 2.9996935291203805e-05, "loss": 1.456, "step": 162 }, { "epoch": 0.08578947368421053, "grad_norm": 1.5370553731918335, "learning_rate": 2.9996884845915925e-05, "loss": 0.4811, "step": 163 }, { "epoch": 0.0863157894736842, "grad_norm": 4.019623756408691, "learning_rate": 2.9996833988887026e-05, "loss": 0.0901, "step": 164 }, { "epoch": 0.0868421052631579, "grad_norm": 1.621778130531311, "learning_rate": 2.9996782720118502e-05, "loss": 1.005, "step": 165 }, { "epoch": 0.08736842105263158, "grad_norm": 2.1922006607055664, "learning_rate": 2.999673103961176e-05, "loss": 1.1868, "step": 166 }, { "epoch": 0.08789473684210526, "grad_norm": 2.594514846801758, "learning_rate": 2.999667894736823e-05, "loss": 1.2468, "step": 167 }, { "epoch": 0.08842105263157894, "grad_norm": 1.5204415321350098, "learning_rate": 2.9996626443389325e-05, "loss": 1.098, "step": 168 }, { "epoch": 0.08894736842105264, "grad_norm": 14.031874656677246, "learning_rate": 2.9996573527676498e-05, "loss": 1.7613, "step": 169 }, { "epoch": 0.08947368421052632, "grad_norm": 2.578813076019287, "learning_rate": 2.99965202002312e-05, "loss": 0.8607, "step": 170 }, { "epoch": 0.09, "grad_norm": 4.223609447479248, "learning_rate": 2.9996466461054897e-05, "loss": 0.3025, "step": 171 }, { "epoch": 0.09052631578947369, "grad_norm": 3.067014694213867, "learning_rate": 2.9996412310149058e-05, "loss": 1.1488, "step": 172 }, { "epoch": 0.09105263157894737, "grad_norm": 1.7577857971191406, "learning_rate": 2.9996357747515174e-05, "loss": 1.3616, "step": 173 }, { "epoch": 0.09157894736842105, "grad_norm": 4.0033278465271, "learning_rate": 2.9996302773154742e-05, "loss": 0.5106, "step": 174 }, { "epoch": 0.09210526315789473, "grad_norm": 2.1187245845794678, "learning_rate": 2.999624738706927e-05, "loss": 1.1571, "step": 175 }, { "epoch": 0.09263157894736843, "grad_norm": 10.947097778320312, "learning_rate": 2.9996191589260284e-05, "loss": 1.2541, "step": 176 }, { "epoch": 0.0931578947368421, "grad_norm": 1.7811756134033203, "learning_rate": 2.9996135379729314e-05, "loss": 1.5256, "step": 177 }, { "epoch": 0.09368421052631579, "grad_norm": 2.26837420463562, "learning_rate": 2.9996078758477898e-05, "loss": 0.9947, "step": 178 }, { "epoch": 0.09421052631578947, "grad_norm": 4.844338417053223, "learning_rate": 2.9996021725507595e-05, "loss": 1.7111, "step": 179 }, { "epoch": 0.09473684210526316, "grad_norm": 1.7822426557540894, "learning_rate": 2.9995964280819967e-05, "loss": 0.824, "step": 180 }, { "epoch": 0.09526315789473684, "grad_norm": 2.7125401496887207, "learning_rate": 2.99959064244166e-05, "loss": 1.2433, "step": 181 }, { "epoch": 0.09578947368421052, "grad_norm": 1.5130306482315063, "learning_rate": 2.9995848156299076e-05, "loss": 1.2715, "step": 182 }, { "epoch": 0.09631578947368422, "grad_norm": 8.074607849121094, "learning_rate": 2.999578947646899e-05, "loss": 1.9851, "step": 183 }, { "epoch": 0.0968421052631579, "grad_norm": 5.693285942077637, "learning_rate": 2.999573038492796e-05, "loss": 1.5789, "step": 184 }, { "epoch": 0.09736842105263158, "grad_norm": 1.201459288597107, "learning_rate": 2.9995670881677607e-05, "loss": 1.1029, "step": 185 }, { "epoch": 0.09789473684210526, "grad_norm": 2.4890646934509277, "learning_rate": 2.9995610966719565e-05, "loss": 1.1719, "step": 186 }, { "epoch": 0.09842105263157895, "grad_norm": 6.686047077178955, "learning_rate": 2.999555064005548e-05, "loss": 1.8928, "step": 187 }, { "epoch": 0.09894736842105263, "grad_norm": 1.3713222742080688, "learning_rate": 2.9995489901687004e-05, "loss": 0.7986, "step": 188 }, { "epoch": 0.09947368421052631, "grad_norm": 1.3539314270019531, "learning_rate": 2.999542875161581e-05, "loss": 1.0582, "step": 189 }, { "epoch": 0.1, "grad_norm": 7.031521797180176, "learning_rate": 2.999536718984357e-05, "loss": 1.2458, "step": 190 }, { "epoch": 0.10052631578947369, "grad_norm": 2.317741870880127, "learning_rate": 2.9995305216371986e-05, "loss": 1.7422, "step": 191 }, { "epoch": 0.10105263157894737, "grad_norm": 5.038298606872559, "learning_rate": 2.9995242831202744e-05, "loss": 1.2954, "step": 192 }, { "epoch": 0.10157894736842105, "grad_norm": 16.174734115600586, "learning_rate": 2.9995180034337566e-05, "loss": 1.0241, "step": 193 }, { "epoch": 0.10210526315789474, "grad_norm": 9.38874626159668, "learning_rate": 2.9995116825778177e-05, "loss": 1.0844, "step": 194 }, { "epoch": 0.10263157894736842, "grad_norm": 2.1543664932250977, "learning_rate": 2.9995053205526305e-05, "loss": 1.1873, "step": 195 }, { "epoch": 0.1031578947368421, "grad_norm": 3.171243906021118, "learning_rate": 2.9994989173583712e-05, "loss": 0.5879, "step": 196 }, { "epoch": 0.1036842105263158, "grad_norm": 5.367387294769287, "learning_rate": 2.9994924729952135e-05, "loss": 1.1287, "step": 197 }, { "epoch": 0.10421052631578948, "grad_norm": 6.392814636230469, "learning_rate": 2.9994859874633358e-05, "loss": 0.2617, "step": 198 }, { "epoch": 0.10473684210526316, "grad_norm": 8.996861457824707, "learning_rate": 2.999479460762916e-05, "loss": 0.5161, "step": 199 }, { "epoch": 0.10526315789473684, "grad_norm": 2.239112138748169, "learning_rate": 2.9994728928941327e-05, "loss": 1.4558, "step": 200 }, { "epoch": 0.10578947368421053, "grad_norm": 1.4390355348587036, "learning_rate": 2.9994662838571673e-05, "loss": 1.2274, "step": 201 }, { "epoch": 0.10631578947368421, "grad_norm": 20.712223052978516, "learning_rate": 2.9994596336521997e-05, "loss": 1.6812, "step": 202 }, { "epoch": 0.10684210526315789, "grad_norm": 3.6506576538085938, "learning_rate": 2.9994529422794136e-05, "loss": 1.7615, "step": 203 }, { "epoch": 0.10736842105263159, "grad_norm": 4.244356155395508, "learning_rate": 2.9994462097389922e-05, "loss": 2.1721, "step": 204 }, { "epoch": 0.10789473684210527, "grad_norm": 5.399370193481445, "learning_rate": 2.999439436031121e-05, "loss": 1.2347, "step": 205 }, { "epoch": 0.10842105263157895, "grad_norm": 13.014787673950195, "learning_rate": 2.999432621155985e-05, "loss": 1.0546, "step": 206 }, { "epoch": 0.10894736842105263, "grad_norm": 6.201412200927734, "learning_rate": 2.9994257651137727e-05, "loss": 1.6362, "step": 207 }, { "epoch": 0.10947368421052632, "grad_norm": 8.548736572265625, "learning_rate": 2.999418867904671e-05, "loss": 0.4087, "step": 208 }, { "epoch": 0.11, "grad_norm": 1.0881563425064087, "learning_rate": 2.9994119295288696e-05, "loss": 0.0171, "step": 209 }, { "epoch": 0.11052631578947368, "grad_norm": 5.160693168640137, "learning_rate": 2.999404949986559e-05, "loss": 0.7526, "step": 210 }, { "epoch": 0.11105263157894738, "grad_norm": 3.1135430335998535, "learning_rate": 2.999397929277931e-05, "loss": 1.3041, "step": 211 }, { "epoch": 0.11157894736842106, "grad_norm": 3.0570948123931885, "learning_rate": 2.9993908674031787e-05, "loss": 1.0342, "step": 212 }, { "epoch": 0.11210526315789474, "grad_norm": 2.296029806137085, "learning_rate": 2.9993837643624953e-05, "loss": 1.2628, "step": 213 }, { "epoch": 0.11263157894736842, "grad_norm": 34.04128646850586, "learning_rate": 2.9993766201560764e-05, "loss": 2.0594, "step": 214 }, { "epoch": 0.11315789473684211, "grad_norm": 4.997979164123535, "learning_rate": 2.999369434784118e-05, "loss": 0.1955, "step": 215 }, { "epoch": 0.11368421052631579, "grad_norm": 1.8177436590194702, "learning_rate": 2.9993622082468165e-05, "loss": 1.3383, "step": 216 }, { "epoch": 0.11421052631578947, "grad_norm": 4.622316360473633, "learning_rate": 2.9993549405443715e-05, "loss": 0.4904, "step": 217 }, { "epoch": 0.11473684210526315, "grad_norm": 1.736038327217102, "learning_rate": 2.9993476316769822e-05, "loss": 1.1311, "step": 218 }, { "epoch": 0.11526315789473685, "grad_norm": 1.4792280197143555, "learning_rate": 2.9993402816448487e-05, "loss": 1.6061, "step": 219 }, { "epoch": 0.11578947368421053, "grad_norm": 20.820289611816406, "learning_rate": 2.9993328904481737e-05, "loss": 3.1151, "step": 220 }, { "epoch": 0.1163157894736842, "grad_norm": 3.1863715648651123, "learning_rate": 2.999325458087159e-05, "loss": 0.2512, "step": 221 }, { "epoch": 0.1168421052631579, "grad_norm": 7.053295135498047, "learning_rate": 2.99931798456201e-05, "loss": 2.0884, "step": 222 }, { "epoch": 0.11736842105263158, "grad_norm": 2.6936466693878174, "learning_rate": 2.999310469872931e-05, "loss": 1.0639, "step": 223 }, { "epoch": 0.11789473684210526, "grad_norm": 6.108619213104248, "learning_rate": 2.9993029140201288e-05, "loss": 1.5187, "step": 224 }, { "epoch": 0.11842105263157894, "grad_norm": 1.4771653413772583, "learning_rate": 2.99929531700381e-05, "loss": 0.8693, "step": 225 }, { "epoch": 0.11894736842105263, "grad_norm": 5.0803632736206055, "learning_rate": 2.999287678824184e-05, "loss": 1.6351, "step": 226 }, { "epoch": 0.11947368421052632, "grad_norm": 2.738546848297119, "learning_rate": 2.9992799994814602e-05, "loss": 0.076, "step": 227 }, { "epoch": 0.12, "grad_norm": 1.8112925291061401, "learning_rate": 2.9992722789758496e-05, "loss": 1.2248, "step": 228 }, { "epoch": 0.12052631578947369, "grad_norm": 2.918973684310913, "learning_rate": 2.999264517307564e-05, "loss": 1.0747, "step": 229 }, { "epoch": 0.12105263157894737, "grad_norm": 26.450199127197266, "learning_rate": 2.9992567144768167e-05, "loss": 1.0269, "step": 230 }, { "epoch": 0.12157894736842105, "grad_norm": 6.718221187591553, "learning_rate": 2.9992488704838215e-05, "loss": 1.3762, "step": 231 }, { "epoch": 0.12210526315789473, "grad_norm": 12.591816902160645, "learning_rate": 2.9992409853287942e-05, "loss": 2.1439, "step": 232 }, { "epoch": 0.12263157894736842, "grad_norm": 3.2106196880340576, "learning_rate": 2.9992330590119516e-05, "loss": 1.549, "step": 233 }, { "epoch": 0.1231578947368421, "grad_norm": 3.1923153400421143, "learning_rate": 2.9992250915335096e-05, "loss": 0.052, "step": 234 }, { "epoch": 0.12368421052631579, "grad_norm": 2.389961004257202, "learning_rate": 2.999217082893689e-05, "loss": 1.6529, "step": 235 }, { "epoch": 0.12421052631578948, "grad_norm": 7.870512008666992, "learning_rate": 2.9992090330927092e-05, "loss": 0.9921, "step": 236 }, { "epoch": 0.12473684210526316, "grad_norm": 7.302090167999268, "learning_rate": 2.9992009421307904e-05, "loss": 2.3439, "step": 237 }, { "epoch": 0.12526315789473685, "grad_norm": 4.087291240692139, "learning_rate": 2.9991928100081557e-05, "loss": 0.301, "step": 238 }, { "epoch": 0.12578947368421053, "grad_norm": 1.479772925376892, "learning_rate": 2.9991846367250273e-05, "loss": 1.1892, "step": 239 }, { "epoch": 0.12631578947368421, "grad_norm": 2.7664432525634766, "learning_rate": 2.9991764222816304e-05, "loss": 0.0283, "step": 240 }, { "epoch": 0.1268421052631579, "grad_norm": 8.476157188415527, "learning_rate": 2.9991681666781904e-05, "loss": 1.4744, "step": 241 }, { "epoch": 0.12736842105263158, "grad_norm": 6.929798126220703, "learning_rate": 2.9991598699149337e-05, "loss": 0.122, "step": 242 }, { "epoch": 0.12789473684210526, "grad_norm": 11.882857322692871, "learning_rate": 2.9991515319920885e-05, "loss": 2.2842, "step": 243 }, { "epoch": 0.12842105263157894, "grad_norm": 3.38356351852417, "learning_rate": 2.999143152909883e-05, "loss": 0.5131, "step": 244 }, { "epoch": 0.12894736842105264, "grad_norm": 2.857849359512329, "learning_rate": 2.999134732668548e-05, "loss": 1.4122, "step": 245 }, { "epoch": 0.12947368421052632, "grad_norm": 5.270646572113037, "learning_rate": 2.9991262712683142e-05, "loss": 0.0852, "step": 246 }, { "epoch": 0.13, "grad_norm": 7.410548210144043, "learning_rate": 2.9991177687094145e-05, "loss": 0.3834, "step": 247 }, { "epoch": 0.13052631578947368, "grad_norm": 9.727782249450684, "learning_rate": 2.9991092249920818e-05, "loss": 1.3174, "step": 248 }, { "epoch": 0.13105263157894737, "grad_norm": 2.9647574424743652, "learning_rate": 2.9991006401165505e-05, "loss": 1.0558, "step": 249 }, { "epoch": 0.13157894736842105, "grad_norm": 6.312364101409912, "learning_rate": 2.999092014083057e-05, "loss": 1.5202, "step": 250 }, { "epoch": 0.13210526315789473, "grad_norm": 6.21821928024292, "learning_rate": 2.9990833468918376e-05, "loss": 1.3892, "step": 251 }, { "epoch": 0.13263157894736843, "grad_norm": 1.7169901132583618, "learning_rate": 2.999074638543131e-05, "loss": 1.3552, "step": 252 }, { "epoch": 0.13315789473684211, "grad_norm": 1.8053324222564697, "learning_rate": 2.9990658890371753e-05, "loss": 1.3176, "step": 253 }, { "epoch": 0.1336842105263158, "grad_norm": 2.0673868656158447, "learning_rate": 2.9990570983742105e-05, "loss": 1.1511, "step": 254 }, { "epoch": 0.13421052631578947, "grad_norm": 13.948058128356934, "learning_rate": 2.9990482665544792e-05, "loss": 1.8145, "step": 255 }, { "epoch": 0.13473684210526315, "grad_norm": 14.18269157409668, "learning_rate": 2.999039393578223e-05, "loss": 2.3278, "step": 256 }, { "epoch": 0.13526315789473684, "grad_norm": 6.532087802886963, "learning_rate": 2.9990304794456857e-05, "loss": 1.6241, "step": 257 }, { "epoch": 0.13578947368421052, "grad_norm": 5.4354987144470215, "learning_rate": 2.9990215241571124e-05, "loss": 1.094, "step": 258 }, { "epoch": 0.13631578947368422, "grad_norm": 4.302351474761963, "learning_rate": 2.9990125277127487e-05, "loss": 0.8513, "step": 259 }, { "epoch": 0.1368421052631579, "grad_norm": 5.297351837158203, "learning_rate": 2.9990034901128414e-05, "loss": 2.1124, "step": 260 }, { "epoch": 0.13736842105263158, "grad_norm": 1.7843605279922485, "learning_rate": 2.9989944113576387e-05, "loss": 0.9198, "step": 261 }, { "epoch": 0.13789473684210526, "grad_norm": 3.7895467281341553, "learning_rate": 2.9989852914473898e-05, "loss": 1.4874, "step": 262 }, { "epoch": 0.13842105263157894, "grad_norm": 12.728391647338867, "learning_rate": 2.9989761303823453e-05, "loss": 1.3333, "step": 263 }, { "epoch": 0.13894736842105262, "grad_norm": 4.476593017578125, "learning_rate": 2.9989669281627566e-05, "loss": 0.95, "step": 264 }, { "epoch": 0.1394736842105263, "grad_norm": 2.2026288509368896, "learning_rate": 2.998957684788877e-05, "loss": 0.0549, "step": 265 }, { "epoch": 0.14, "grad_norm": 2.320668935775757, "learning_rate": 2.998948400260959e-05, "loss": 1.6905, "step": 266 }, { "epoch": 0.1405263157894737, "grad_norm": 7.45945930480957, "learning_rate": 2.9989390745792585e-05, "loss": 0.9003, "step": 267 }, { "epoch": 0.14105263157894737, "grad_norm": 1.9993150234222412, "learning_rate": 2.998929707744031e-05, "loss": 1.774, "step": 268 }, { "epoch": 0.14157894736842105, "grad_norm": 2.8484346866607666, "learning_rate": 2.998920299755534e-05, "loss": 0.6502, "step": 269 }, { "epoch": 0.14210526315789473, "grad_norm": 1.472853183746338, "learning_rate": 2.9989108506140254e-05, "loss": 1.0543, "step": 270 }, { "epoch": 0.14263157894736841, "grad_norm": 2.5155389308929443, "learning_rate": 2.9989013603197653e-05, "loss": 1.6436, "step": 271 }, { "epoch": 0.1431578947368421, "grad_norm": 3.893866777420044, "learning_rate": 2.998891828873014e-05, "loss": 0.9275, "step": 272 }, { "epoch": 0.1436842105263158, "grad_norm": 5.068791389465332, "learning_rate": 2.9988822562740325e-05, "loss": 1.3801, "step": 273 }, { "epoch": 0.14421052631578948, "grad_norm": 3.420543909072876, "learning_rate": 2.9988726425230842e-05, "loss": 1.4276, "step": 274 }, { "epoch": 0.14473684210526316, "grad_norm": 3.8543524742126465, "learning_rate": 2.9988629876204337e-05, "loss": 0.1829, "step": 275 }, { "epoch": 0.14526315789473684, "grad_norm": 2.018007755279541, "learning_rate": 2.9988532915663446e-05, "loss": 0.9781, "step": 276 }, { "epoch": 0.14578947368421052, "grad_norm": 7.19284200668335, "learning_rate": 2.9988435543610846e-05, "loss": 1.5077, "step": 277 }, { "epoch": 0.1463157894736842, "grad_norm": 4.468955039978027, "learning_rate": 2.99883377600492e-05, "loss": 1.7061, "step": 278 }, { "epoch": 0.14684210526315788, "grad_norm": 2.5749592781066895, "learning_rate": 2.9988239564981193e-05, "loss": 1.1039, "step": 279 }, { "epoch": 0.14736842105263157, "grad_norm": 1.5888959169387817, "learning_rate": 2.9988140958409528e-05, "loss": 1.132, "step": 280 }, { "epoch": 0.14789473684210527, "grad_norm": 1.6728452444076538, "learning_rate": 2.9988041940336906e-05, "loss": 1.239, "step": 281 }, { "epoch": 0.14842105263157895, "grad_norm": 1.9604003429412842, "learning_rate": 2.9987942510766047e-05, "loss": 1.2393, "step": 282 }, { "epoch": 0.14894736842105263, "grad_norm": 12.022218704223633, "learning_rate": 2.9987842669699687e-05, "loss": 0.808, "step": 283 }, { "epoch": 0.14947368421052631, "grad_norm": 1.27422034740448, "learning_rate": 2.998774241714056e-05, "loss": 1.1194, "step": 284 }, { "epoch": 0.15, "grad_norm": 1.4916560649871826, "learning_rate": 2.9987641753091416e-05, "loss": 0.0263, "step": 285 }, { "epoch": 0.15052631578947367, "grad_norm": 6.565960884094238, "learning_rate": 2.9987540677555027e-05, "loss": 1.2911, "step": 286 }, { "epoch": 0.15105263157894736, "grad_norm": 1.564158320426941, "learning_rate": 2.9987439190534163e-05, "loss": 0.6841, "step": 287 }, { "epoch": 0.15157894736842106, "grad_norm": 2.0669963359832764, "learning_rate": 2.998733729203161e-05, "loss": 0.9868, "step": 288 }, { "epoch": 0.15210526315789474, "grad_norm": 7.10345983505249, "learning_rate": 2.9987234982050168e-05, "loss": 0.6493, "step": 289 }, { "epoch": 0.15263157894736842, "grad_norm": 2.2110228538513184, "learning_rate": 2.9987132260592645e-05, "loss": 1.225, "step": 290 }, { "epoch": 0.1531578947368421, "grad_norm": 2.3035542964935303, "learning_rate": 2.9987029127661857e-05, "loss": 1.3038, "step": 291 }, { "epoch": 0.15368421052631578, "grad_norm": 2.7143070697784424, "learning_rate": 2.9986925583260644e-05, "loss": 1.0185, "step": 292 }, { "epoch": 0.15421052631578946, "grad_norm": 1.633699655532837, "learning_rate": 2.9986821627391845e-05, "loss": 1.1361, "step": 293 }, { "epoch": 0.15473684210526314, "grad_norm": 2.975994825363159, "learning_rate": 2.9986717260058314e-05, "loss": 1.8912, "step": 294 }, { "epoch": 0.15526315789473685, "grad_norm": 2.5356616973876953, "learning_rate": 2.9986612481262916e-05, "loss": 1.2134, "step": 295 }, { "epoch": 0.15578947368421053, "grad_norm": 1.880893588066101, "learning_rate": 2.9986507291008524e-05, "loss": 0.0698, "step": 296 }, { "epoch": 0.1563157894736842, "grad_norm": 8.554359436035156, "learning_rate": 2.9986401689298032e-05, "loss": 1.3118, "step": 297 }, { "epoch": 0.1568421052631579, "grad_norm": 1.8531193733215332, "learning_rate": 2.9986295676134337e-05, "loss": 1.1979, "step": 298 }, { "epoch": 0.15736842105263157, "grad_norm": 44.23398971557617, "learning_rate": 2.998618925152035e-05, "loss": 2.0334, "step": 299 }, { "epoch": 0.15789473684210525, "grad_norm": 4.069817066192627, "learning_rate": 2.9986082415458993e-05, "loss": 1.2089, "step": 300 }, { "epoch": 0.15842105263157893, "grad_norm": 1.7818050384521484, "learning_rate": 2.9985975167953198e-05, "loss": 1.0545, "step": 301 }, { "epoch": 0.15894736842105264, "grad_norm": 2.3045952320098877, "learning_rate": 2.9985867509005906e-05, "loss": 1.1889, "step": 302 }, { "epoch": 0.15947368421052632, "grad_norm": 4.53651237487793, "learning_rate": 2.9985759438620082e-05, "loss": 0.78, "step": 303 }, { "epoch": 0.16, "grad_norm": 2.4460010528564453, "learning_rate": 2.9985650956798686e-05, "loss": 1.3093, "step": 304 }, { "epoch": 0.16052631578947368, "grad_norm": 4.067452907562256, "learning_rate": 2.99855420635447e-05, "loss": 0.7694, "step": 305 }, { "epoch": 0.16105263157894736, "grad_norm": 1.7187111377716064, "learning_rate": 2.9985432758861114e-05, "loss": 1.1114, "step": 306 }, { "epoch": 0.16157894736842104, "grad_norm": 5.563487529754639, "learning_rate": 2.9985323042750924e-05, "loss": 0.2845, "step": 307 }, { "epoch": 0.16210526315789472, "grad_norm": 13.657363891601562, "learning_rate": 2.9985212915217146e-05, "loss": 0.7273, "step": 308 }, { "epoch": 0.16263157894736843, "grad_norm": 44.487571716308594, "learning_rate": 2.9985102376262803e-05, "loss": 1.0723, "step": 309 }, { "epoch": 0.1631578947368421, "grad_norm": 3.729584217071533, "learning_rate": 2.998499142589093e-05, "loss": 1.0203, "step": 310 }, { "epoch": 0.1636842105263158, "grad_norm": 3.7574782371520996, "learning_rate": 2.9984880064104575e-05, "loss": 1.1488, "step": 311 }, { "epoch": 0.16421052631578947, "grad_norm": 10.508902549743652, "learning_rate": 2.998476829090679e-05, "loss": 0.3696, "step": 312 }, { "epoch": 0.16473684210526315, "grad_norm": 7.688244342803955, "learning_rate": 2.9984656106300647e-05, "loss": 1.8374, "step": 313 }, { "epoch": 0.16526315789473683, "grad_norm": 2.0016376972198486, "learning_rate": 2.9984543510289227e-05, "loss": 0.8432, "step": 314 }, { "epoch": 0.16578947368421051, "grad_norm": 5.203074932098389, "learning_rate": 2.998443050287562e-05, "loss": 1.2669, "step": 315 }, { "epoch": 0.16631578947368422, "grad_norm": 1.451202630996704, "learning_rate": 2.9984317084062928e-05, "loss": 1.0893, "step": 316 }, { "epoch": 0.1668421052631579, "grad_norm": 1.7263648509979248, "learning_rate": 2.998420325385427e-05, "loss": 1.219, "step": 317 }, { "epoch": 0.16736842105263158, "grad_norm": 8.742837905883789, "learning_rate": 2.9984089012252765e-05, "loss": 0.3662, "step": 318 }, { "epoch": 0.16789473684210526, "grad_norm": 4.271378040313721, "learning_rate": 2.9983974359261556e-05, "loss": 0.9902, "step": 319 }, { "epoch": 0.16842105263157894, "grad_norm": 1.9590401649475098, "learning_rate": 2.998385929488378e-05, "loss": 1.0124, "step": 320 }, { "epoch": 0.16894736842105262, "grad_norm": 1.6438995599746704, "learning_rate": 2.998374381912261e-05, "loss": 1.0227, "step": 321 }, { "epoch": 0.1694736842105263, "grad_norm": 8.173205375671387, "learning_rate": 2.99836279319812e-05, "loss": 0.3485, "step": 322 }, { "epoch": 0.17, "grad_norm": 3.0618059635162354, "learning_rate": 2.9983511633462746e-05, "loss": 1.213, "step": 323 }, { "epoch": 0.1705263157894737, "grad_norm": 2.2398312091827393, "learning_rate": 2.9983394923570436e-05, "loss": 0.7417, "step": 324 }, { "epoch": 0.17105263157894737, "grad_norm": 2.8603358268737793, "learning_rate": 2.9983277802307475e-05, "loss": 0.7479, "step": 325 }, { "epoch": 0.17157894736842105, "grad_norm": 6.809089660644531, "learning_rate": 2.9983160269677074e-05, "loss": 0.1203, "step": 326 }, { "epoch": 0.17210526315789473, "grad_norm": 1.7018848657608032, "learning_rate": 2.998304232568247e-05, "loss": 0.7828, "step": 327 }, { "epoch": 0.1726315789473684, "grad_norm": 1.4800243377685547, "learning_rate": 2.9982923970326887e-05, "loss": 0.0606, "step": 328 }, { "epoch": 0.1731578947368421, "grad_norm": 3.1752805709838867, "learning_rate": 2.998280520361359e-05, "loss": 0.5135, "step": 329 }, { "epoch": 0.1736842105263158, "grad_norm": 1.6199371814727783, "learning_rate": 2.9982686025545824e-05, "loss": 1.2659, "step": 330 }, { "epoch": 0.17421052631578948, "grad_norm": 8.967512130737305, "learning_rate": 2.9982566436126873e-05, "loss": 1.8805, "step": 331 }, { "epoch": 0.17473684210526316, "grad_norm": 20.32599449157715, "learning_rate": 2.9982446435360016e-05, "loss": 0.6175, "step": 332 }, { "epoch": 0.17526315789473684, "grad_norm": 12.868733406066895, "learning_rate": 2.9982326023248548e-05, "loss": 1.1088, "step": 333 }, { "epoch": 0.17578947368421052, "grad_norm": 9.508431434631348, "learning_rate": 2.9982205199795773e-05, "loss": 1.2409, "step": 334 }, { "epoch": 0.1763157894736842, "grad_norm": 1.3848768472671509, "learning_rate": 2.998208396500501e-05, "loss": 1.1361, "step": 335 }, { "epoch": 0.17684210526315788, "grad_norm": 1.4120675325393677, "learning_rate": 2.998196231887959e-05, "loss": 0.8192, "step": 336 }, { "epoch": 0.1773684210526316, "grad_norm": 1.9072043895721436, "learning_rate": 2.998184026142285e-05, "loss": 1.1288, "step": 337 }, { "epoch": 0.17789473684210527, "grad_norm": 17.120038986206055, "learning_rate": 2.9981717792638143e-05, "loss": 0.6453, "step": 338 }, { "epoch": 0.17842105263157895, "grad_norm": 1.8352843523025513, "learning_rate": 2.998159491252883e-05, "loss": 1.7777, "step": 339 }, { "epoch": 0.17894736842105263, "grad_norm": 1.8334228992462158, "learning_rate": 2.9981471621098277e-05, "loss": 1.1382, "step": 340 }, { "epoch": 0.1794736842105263, "grad_norm": 2.4187421798706055, "learning_rate": 2.9981347918349885e-05, "loss": 1.1575, "step": 341 }, { "epoch": 0.18, "grad_norm": 7.076684951782227, "learning_rate": 2.998122380428704e-05, "loss": 0.7906, "step": 342 }, { "epoch": 0.18052631578947367, "grad_norm": 3.0878186225891113, "learning_rate": 2.9981099278913147e-05, "loss": 1.2896, "step": 343 }, { "epoch": 0.18105263157894738, "grad_norm": 3.1805942058563232, "learning_rate": 2.9980974342231633e-05, "loss": 0.4216, "step": 344 }, { "epoch": 0.18157894736842106, "grad_norm": 1.9451438188552856, "learning_rate": 2.9980848994245926e-05, "loss": 1.4878, "step": 345 }, { "epoch": 0.18210526315789474, "grad_norm": 1.4234017133712769, "learning_rate": 2.9980723234959464e-05, "loss": 0.8428, "step": 346 }, { "epoch": 0.18263157894736842, "grad_norm": 6.759546756744385, "learning_rate": 2.99805970643757e-05, "loss": 1.7714, "step": 347 }, { "epoch": 0.1831578947368421, "grad_norm": 7.0381550788879395, "learning_rate": 2.9980470482498105e-05, "loss": 0.4636, "step": 348 }, { "epoch": 0.18368421052631578, "grad_norm": 6.027357578277588, "learning_rate": 2.9980343489330143e-05, "loss": 1.5307, "step": 349 }, { "epoch": 0.18421052631578946, "grad_norm": 4.1133012771606445, "learning_rate": 2.998021608487531e-05, "loss": 1.3673, "step": 350 }, { "epoch": 0.18473684210526317, "grad_norm": 4.925607204437256, "learning_rate": 2.9980088269137097e-05, "loss": 2.2288, "step": 351 }, { "epoch": 0.18526315789473685, "grad_norm": 2.568140983581543, "learning_rate": 2.997996004211902e-05, "loss": 1.3601, "step": 352 }, { "epoch": 0.18578947368421053, "grad_norm": 2.9798226356506348, "learning_rate": 2.9979831403824593e-05, "loss": 1.1923, "step": 353 }, { "epoch": 0.1863157894736842, "grad_norm": 1.192610263824463, "learning_rate": 2.997970235425735e-05, "loss": 0.0341, "step": 354 }, { "epoch": 0.1868421052631579, "grad_norm": 1.2151992321014404, "learning_rate": 2.997957289342084e-05, "loss": 0.9861, "step": 355 }, { "epoch": 0.18736842105263157, "grad_norm": 1.9701948165893555, "learning_rate": 2.9979443021318607e-05, "loss": 0.0926, "step": 356 }, { "epoch": 0.18789473684210525, "grad_norm": 1.1996488571166992, "learning_rate": 2.9979312737954225e-05, "loss": 0.8892, "step": 357 }, { "epoch": 0.18842105263157893, "grad_norm": 3.7606606483459473, "learning_rate": 2.9979182043331268e-05, "loss": 1.3173, "step": 358 }, { "epoch": 0.18894736842105264, "grad_norm": 4.111024856567383, "learning_rate": 2.9979050937453324e-05, "loss": 1.2348, "step": 359 }, { "epoch": 0.18947368421052632, "grad_norm": 16.119956970214844, "learning_rate": 2.997891942032399e-05, "loss": 1.7936, "step": 360 }, { "epoch": 0.19, "grad_norm": 9.241058349609375, "learning_rate": 2.9978787491946886e-05, "loss": 0.2741, "step": 361 }, { "epoch": 0.19052631578947368, "grad_norm": 1.398040533065796, "learning_rate": 2.9978655152325623e-05, "loss": 1.1073, "step": 362 }, { "epoch": 0.19105263157894736, "grad_norm": 6.744707107543945, "learning_rate": 2.997852240146384e-05, "loss": 0.9413, "step": 363 }, { "epoch": 0.19157894736842104, "grad_norm": 1.4302072525024414, "learning_rate": 2.997838923936518e-05, "loss": 1.1093, "step": 364 }, { "epoch": 0.19210526315789472, "grad_norm": 1.3738486766815186, "learning_rate": 2.99782556660333e-05, "loss": 1.3803, "step": 365 }, { "epoch": 0.19263157894736843, "grad_norm": 1.2413164377212524, "learning_rate": 2.9978121681471868e-05, "loss": 0.9807, "step": 366 }, { "epoch": 0.1931578947368421, "grad_norm": 1.4329386949539185, "learning_rate": 2.997798728568456e-05, "loss": 1.3757, "step": 367 }, { "epoch": 0.1936842105263158, "grad_norm": 1.2162076234817505, "learning_rate": 2.9977852478675068e-05, "loss": 0.7532, "step": 368 }, { "epoch": 0.19421052631578947, "grad_norm": 2.649707078933716, "learning_rate": 2.9977717260447092e-05, "loss": 1.0267, "step": 369 }, { "epoch": 0.19473684210526315, "grad_norm": 6.033254146575928, "learning_rate": 2.9977581631004343e-05, "loss": 2.927, "step": 370 }, { "epoch": 0.19526315789473683, "grad_norm": 1.8854405879974365, "learning_rate": 2.9977445590350548e-05, "loss": 1.3284, "step": 371 }, { "epoch": 0.1957894736842105, "grad_norm": 1.3157703876495361, "learning_rate": 2.9977309138489443e-05, "loss": 1.6263, "step": 372 }, { "epoch": 0.19631578947368422, "grad_norm": 6.529455184936523, "learning_rate": 2.997717227542477e-05, "loss": 0.5972, "step": 373 }, { "epoch": 0.1968421052631579, "grad_norm": 5.371290683746338, "learning_rate": 2.997703500116029e-05, "loss": 0.2108, "step": 374 }, { "epoch": 0.19736842105263158, "grad_norm": 11.562923431396484, "learning_rate": 2.9976897315699767e-05, "loss": 0.5319, "step": 375 }, { "epoch": 0.19789473684210526, "grad_norm": 1.7732006311416626, "learning_rate": 2.9976759219046988e-05, "loss": 1.3096, "step": 376 }, { "epoch": 0.19842105263157894, "grad_norm": 2.1219842433929443, "learning_rate": 2.997662071120574e-05, "loss": 1.5544, "step": 377 }, { "epoch": 0.19894736842105262, "grad_norm": 2.662497043609619, "learning_rate": 2.9976481792179827e-05, "loss": 1.3716, "step": 378 }, { "epoch": 0.1994736842105263, "grad_norm": 1.3942060470581055, "learning_rate": 2.997634246197306e-05, "loss": 1.8652, "step": 379 }, { "epoch": 0.2, "grad_norm": 1.5858086347579956, "learning_rate": 2.9976202720589273e-05, "loss": 0.8704, "step": 380 }, { "epoch": 0.2005263157894737, "grad_norm": 4.291787624359131, "learning_rate": 2.997606256803229e-05, "loss": 0.5378, "step": 381 }, { "epoch": 0.20105263157894737, "grad_norm": 1.4939614534378052, "learning_rate": 2.9975922004305972e-05, "loss": 1.5748, "step": 382 }, { "epoch": 0.20157894736842105, "grad_norm": 3.255887269973755, "learning_rate": 2.9975781029414168e-05, "loss": 0.5039, "step": 383 }, { "epoch": 0.20210526315789473, "grad_norm": 1.5569735765457153, "learning_rate": 2.9975639643360756e-05, "loss": 1.2235, "step": 384 }, { "epoch": 0.2026315789473684, "grad_norm": 10.412734985351562, "learning_rate": 2.997549784614961e-05, "loss": 1.7202, "step": 385 }, { "epoch": 0.2031578947368421, "grad_norm": 1.6121785640716553, "learning_rate": 2.997535563778463e-05, "loss": 1.4258, "step": 386 }, { "epoch": 0.2036842105263158, "grad_norm": 7.74535608291626, "learning_rate": 2.9975213018269714e-05, "loss": 0.3529, "step": 387 }, { "epoch": 0.20421052631578948, "grad_norm": 1.4278053045272827, "learning_rate": 2.997506998760878e-05, "loss": 1.1017, "step": 388 }, { "epoch": 0.20473684210526316, "grad_norm": 4.179811477661133, "learning_rate": 2.9974926545805762e-05, "loss": 0.3317, "step": 389 }, { "epoch": 0.20526315789473684, "grad_norm": 5.233457088470459, "learning_rate": 2.997478269286459e-05, "loss": 2.0176, "step": 390 }, { "epoch": 0.20578947368421052, "grad_norm": 10.88487720489502, "learning_rate": 2.9974638428789216e-05, "loss": 1.885, "step": 391 }, { "epoch": 0.2063157894736842, "grad_norm": 1.2455674409866333, "learning_rate": 2.9974493753583597e-05, "loss": 1.3968, "step": 392 }, { "epoch": 0.20684210526315788, "grad_norm": 1.7053565979003906, "learning_rate": 2.9974348667251715e-05, "loss": 1.0096, "step": 393 }, { "epoch": 0.2073684210526316, "grad_norm": 4.857005596160889, "learning_rate": 2.9974203169797543e-05, "loss": 0.5129, "step": 394 }, { "epoch": 0.20789473684210527, "grad_norm": 1.7618751525878906, "learning_rate": 2.997405726122508e-05, "loss": 0.8299, "step": 395 }, { "epoch": 0.20842105263157895, "grad_norm": 6.38074254989624, "learning_rate": 2.9973910941538333e-05, "loss": 1.711, "step": 396 }, { "epoch": 0.20894736842105263, "grad_norm": 4.114274024963379, "learning_rate": 2.9973764210741312e-05, "loss": 0.2308, "step": 397 }, { "epoch": 0.2094736842105263, "grad_norm": 1.7079466581344604, "learning_rate": 2.9973617068838056e-05, "loss": 1.3165, "step": 398 }, { "epoch": 0.21, "grad_norm": 2.909496307373047, "learning_rate": 2.9973469515832604e-05, "loss": 0.4798, "step": 399 }, { "epoch": 0.21052631578947367, "grad_norm": 1.0828051567077637, "learning_rate": 2.9973321551728995e-05, "loss": 0.751, "step": 400 }, { "epoch": 0.21105263157894738, "grad_norm": 2.6114165782928467, "learning_rate": 2.9973173176531305e-05, "loss": 1.226, "step": 401 }, { "epoch": 0.21157894736842106, "grad_norm": 2.137087106704712, "learning_rate": 2.99730243902436e-05, "loss": 1.0014, "step": 402 }, { "epoch": 0.21210526315789474, "grad_norm": 34.939544677734375, "learning_rate": 2.9972875192869965e-05, "loss": 1.3549, "step": 403 }, { "epoch": 0.21263157894736842, "grad_norm": 9.31963062286377, "learning_rate": 2.9972725584414506e-05, "loss": 1.6069, "step": 404 }, { "epoch": 0.2131578947368421, "grad_norm": 1.6685644388198853, "learning_rate": 2.9972575564881316e-05, "loss": 0.6431, "step": 405 }, { "epoch": 0.21368421052631578, "grad_norm": 18.59819984436035, "learning_rate": 2.9972425134274522e-05, "loss": 3.0248, "step": 406 }, { "epoch": 0.21421052631578946, "grad_norm": 5.35085391998291, "learning_rate": 2.9972274292598255e-05, "loss": 1.6838, "step": 407 }, { "epoch": 0.21473684210526317, "grad_norm": 2.953678846359253, "learning_rate": 2.9972123039856654e-05, "loss": 0.9957, "step": 408 }, { "epoch": 0.21526315789473685, "grad_norm": 1.5719188451766968, "learning_rate": 2.997197137605387e-05, "loss": 1.0414, "step": 409 }, { "epoch": 0.21578947368421053, "grad_norm": 5.360487937927246, "learning_rate": 2.997181930119407e-05, "loss": 1.5796, "step": 410 }, { "epoch": 0.2163157894736842, "grad_norm": 1.611450433731079, "learning_rate": 2.997166681528143e-05, "loss": 1.025, "step": 411 }, { "epoch": 0.2168421052631579, "grad_norm": 2.5470244884490967, "learning_rate": 2.997151391832013e-05, "loss": 1.0886, "step": 412 }, { "epoch": 0.21736842105263157, "grad_norm": 1.2462735176086426, "learning_rate": 2.9971360610314374e-05, "loss": 0.5111, "step": 413 }, { "epoch": 0.21789473684210525, "grad_norm": 1.5405879020690918, "learning_rate": 2.9971206891268367e-05, "loss": 0.9845, "step": 414 }, { "epoch": 0.21842105263157896, "grad_norm": 1.282374620437622, "learning_rate": 2.9971052761186338e-05, "loss": 0.4869, "step": 415 }, { "epoch": 0.21894736842105264, "grad_norm": 1.9443198442459106, "learning_rate": 2.9970898220072505e-05, "loss": 0.0399, "step": 416 }, { "epoch": 0.21947368421052632, "grad_norm": 1.685855746269226, "learning_rate": 2.9970743267931123e-05, "loss": 1.3679, "step": 417 }, { "epoch": 0.22, "grad_norm": 2.6044466495513916, "learning_rate": 2.997058790476644e-05, "loss": 1.4172, "step": 418 }, { "epoch": 0.22052631578947368, "grad_norm": 3.274233818054199, "learning_rate": 2.9970432130582727e-05, "loss": 1.5899, "step": 419 }, { "epoch": 0.22105263157894736, "grad_norm": 7.699522972106934, "learning_rate": 2.9970275945384252e-05, "loss": 1.5596, "step": 420 }, { "epoch": 0.22157894736842104, "grad_norm": 1.1986091136932373, "learning_rate": 2.997011934917531e-05, "loss": 1.1682, "step": 421 }, { "epoch": 0.22210526315789475, "grad_norm": 56.02631378173828, "learning_rate": 2.9969962341960196e-05, "loss": 1.7715, "step": 422 }, { "epoch": 0.22263157894736843, "grad_norm": 2.560245990753174, "learning_rate": 2.996980492374323e-05, "loss": 0.9785, "step": 423 }, { "epoch": 0.2231578947368421, "grad_norm": 11.875818252563477, "learning_rate": 2.9969647094528718e-05, "loss": 1.4328, "step": 424 }, { "epoch": 0.2236842105263158, "grad_norm": 1.9637598991394043, "learning_rate": 2.9969488854321007e-05, "loss": 2.0752, "step": 425 }, { "epoch": 0.22421052631578947, "grad_norm": 1.1943355798721313, "learning_rate": 2.9969330203124433e-05, "loss": 0.9533, "step": 426 }, { "epoch": 0.22473684210526315, "grad_norm": 3.17193341255188, "learning_rate": 2.996917114094336e-05, "loss": 0.3756, "step": 427 }, { "epoch": 0.22526315789473683, "grad_norm": 6.2271928787231445, "learning_rate": 2.9969011667782152e-05, "loss": 1.5742, "step": 428 }, { "epoch": 0.22578947368421054, "grad_norm": 3.3042328357696533, "learning_rate": 2.9968851783645182e-05, "loss": 1.2712, "step": 429 }, { "epoch": 0.22631578947368422, "grad_norm": 1.6236546039581299, "learning_rate": 2.9968691488536842e-05, "loss": 1.3309, "step": 430 }, { "epoch": 0.2268421052631579, "grad_norm": 3.908189535140991, "learning_rate": 2.9968530782461537e-05, "loss": 2.0394, "step": 431 }, { "epoch": 0.22736842105263158, "grad_norm": 4.952622413635254, "learning_rate": 2.9968369665423677e-05, "loss": 1.2584, "step": 432 }, { "epoch": 0.22789473684210526, "grad_norm": 3.6018247604370117, "learning_rate": 2.9968208137427685e-05, "loss": 0.6666, "step": 433 }, { "epoch": 0.22842105263157894, "grad_norm": 1.319495677947998, "learning_rate": 2.9968046198477994e-05, "loss": 1.0497, "step": 434 }, { "epoch": 0.22894736842105262, "grad_norm": 9.26498031616211, "learning_rate": 2.9967883848579052e-05, "loss": 1.8237, "step": 435 }, { "epoch": 0.2294736842105263, "grad_norm": 2.161882162094116, "learning_rate": 2.996772108773532e-05, "loss": 1.5562, "step": 436 }, { "epoch": 0.23, "grad_norm": 1.2435736656188965, "learning_rate": 2.9967557915951258e-05, "loss": 1.292, "step": 437 }, { "epoch": 0.2305263157894737, "grad_norm": 1.310845971107483, "learning_rate": 2.996739433323135e-05, "loss": 0.8349, "step": 438 }, { "epoch": 0.23105263157894737, "grad_norm": 10.944952011108398, "learning_rate": 2.9967230339580095e-05, "loss": 0.5981, "step": 439 }, { "epoch": 0.23157894736842105, "grad_norm": 3.8131868839263916, "learning_rate": 2.996706593500198e-05, "loss": 1.3155, "step": 440 }, { "epoch": 0.23210526315789473, "grad_norm": 3.2014780044555664, "learning_rate": 2.9966901119501535e-05, "loss": 1.2769, "step": 441 }, { "epoch": 0.2326315789473684, "grad_norm": 0.4154053330421448, "learning_rate": 2.9966735893083274e-05, "loss": 0.0089, "step": 442 }, { "epoch": 0.2331578947368421, "grad_norm": 1.2587388753890991, "learning_rate": 2.996657025575174e-05, "loss": 1.34, "step": 443 }, { "epoch": 0.2336842105263158, "grad_norm": 1.6259454488754272, "learning_rate": 2.996640420751147e-05, "loss": 1.2116, "step": 444 }, { "epoch": 0.23421052631578948, "grad_norm": 24.173168182373047, "learning_rate": 2.9966237748367032e-05, "loss": 1.2547, "step": 445 }, { "epoch": 0.23473684210526316, "grad_norm": 1.4230668544769287, "learning_rate": 2.9966070878322994e-05, "loss": 0.9427, "step": 446 }, { "epoch": 0.23526315789473684, "grad_norm": 1.4833992719650269, "learning_rate": 2.996590359738394e-05, "loss": 0.9436, "step": 447 }, { "epoch": 0.23578947368421052, "grad_norm": 2.0306854248046875, "learning_rate": 2.996573590555446e-05, "loss": 1.333, "step": 448 }, { "epoch": 0.2363157894736842, "grad_norm": 3.2706172466278076, "learning_rate": 2.996556780283916e-05, "loss": 0.3585, "step": 449 }, { "epoch": 0.23684210526315788, "grad_norm": 2.0781776905059814, "learning_rate": 2.996539928924265e-05, "loss": 1.1149, "step": 450 }, { "epoch": 0.2373684210526316, "grad_norm": 3.0296876430511475, "learning_rate": 2.996523036476956e-05, "loss": 0.4218, "step": 451 }, { "epoch": 0.23789473684210527, "grad_norm": 4.4293293952941895, "learning_rate": 2.9965061029424524e-05, "loss": 1.6268, "step": 452 }, { "epoch": 0.23842105263157895, "grad_norm": 1.4754269123077393, "learning_rate": 2.9964891283212202e-05, "loss": 1.3374, "step": 453 }, { "epoch": 0.23894736842105263, "grad_norm": 8.703702926635742, "learning_rate": 2.9964721126137245e-05, "loss": 1.5505, "step": 454 }, { "epoch": 0.2394736842105263, "grad_norm": 1.779236078262329, "learning_rate": 2.9964550558204332e-05, "loss": 1.0386, "step": 455 }, { "epoch": 0.24, "grad_norm": 2.5311484336853027, "learning_rate": 2.9964379579418136e-05, "loss": 1.4634, "step": 456 }, { "epoch": 0.24052631578947367, "grad_norm": 8.054793357849121, "learning_rate": 2.9964208189783355e-05, "loss": 0.4973, "step": 457 }, { "epoch": 0.24105263157894738, "grad_norm": 4.909681797027588, "learning_rate": 2.9964036389304696e-05, "loss": 1.1552, "step": 458 }, { "epoch": 0.24157894736842106, "grad_norm": 6.568388938903809, "learning_rate": 2.996386417798688e-05, "loss": 1.339, "step": 459 }, { "epoch": 0.24210526315789474, "grad_norm": 1.7436282634735107, "learning_rate": 2.9963691555834626e-05, "loss": 1.6004, "step": 460 }, { "epoch": 0.24263157894736842, "grad_norm": 2.2429983615875244, "learning_rate": 2.996351852285268e-05, "loss": 1.1068, "step": 461 }, { "epoch": 0.2431578947368421, "grad_norm": 2.5454885959625244, "learning_rate": 2.996334507904579e-05, "loss": 0.2007, "step": 462 }, { "epoch": 0.24368421052631578, "grad_norm": 1.5255674123764038, "learning_rate": 2.9963171224418722e-05, "loss": 1.0913, "step": 463 }, { "epoch": 0.24421052631578946, "grad_norm": 1.427351713180542, "learning_rate": 2.9962996958976242e-05, "loss": 0.7462, "step": 464 }, { "epoch": 0.24473684210526317, "grad_norm": 1.3530781269073486, "learning_rate": 2.9962822282723136e-05, "loss": 1.3773, "step": 465 }, { "epoch": 0.24526315789473685, "grad_norm": 1.161293625831604, "learning_rate": 2.996264719566421e-05, "loss": 0.7285, "step": 466 }, { "epoch": 0.24578947368421053, "grad_norm": 3.624866008758545, "learning_rate": 2.996247169780426e-05, "loss": 0.9183, "step": 467 }, { "epoch": 0.2463157894736842, "grad_norm": 1.182325005531311, "learning_rate": 2.9962295789148104e-05, "loss": 1.1596, "step": 468 }, { "epoch": 0.2468421052631579, "grad_norm": 2.112578868865967, "learning_rate": 2.9962119469700577e-05, "loss": 1.302, "step": 469 }, { "epoch": 0.24736842105263157, "grad_norm": 2.4179584980010986, "learning_rate": 2.9961942739466516e-05, "loss": 1.1984, "step": 470 }, { "epoch": 0.24789473684210525, "grad_norm": 5.681208610534668, "learning_rate": 2.9961765598450782e-05, "loss": 1.3961, "step": 471 }, { "epoch": 0.24842105263157896, "grad_norm": 1.9745169878005981, "learning_rate": 2.9961588046658222e-05, "loss": 0.8747, "step": 472 }, { "epoch": 0.24894736842105264, "grad_norm": 1.0899487733840942, "learning_rate": 2.9961410084093727e-05, "loss": 0.8849, "step": 473 }, { "epoch": 0.24947368421052632, "grad_norm": 27.703815460205078, "learning_rate": 2.9961231710762173e-05, "loss": 1.362, "step": 474 }, { "epoch": 0.25, "grad_norm": 77.52987670898438, "learning_rate": 2.996105292666846e-05, "loss": 1.9388, "step": 475 }, { "epoch": 0.25, "eval_loss": 1.0506709814071655, "eval_runtime": 12.9408, "eval_samples_per_second": 7.727, "eval_steps_per_second": 7.727, "step": 475 }, { "epoch": 0.2505263157894737, "grad_norm": 1.5593270063400269, "learning_rate": 2.99608737318175e-05, "loss": 1.1288, "step": 476 }, { "epoch": 0.25105263157894736, "grad_norm": 3.246509552001953, "learning_rate": 2.9960694126214204e-05, "loss": 1.1222, "step": 477 }, { "epoch": 0.25157894736842107, "grad_norm": 1.3423675298690796, "learning_rate": 2.996051410986351e-05, "loss": 0.8958, "step": 478 }, { "epoch": 0.2521052631578947, "grad_norm": 2.2557055950164795, "learning_rate": 2.9960333682770367e-05, "loss": 1.0483, "step": 479 }, { "epoch": 0.25263157894736843, "grad_norm": 1.5628443956375122, "learning_rate": 2.9960152844939716e-05, "loss": 1.1525, "step": 480 }, { "epoch": 0.2531578947368421, "grad_norm": 4.363856315612793, "learning_rate": 2.9959971596376527e-05, "loss": 2.8042, "step": 481 }, { "epoch": 0.2536842105263158, "grad_norm": 5.757142543792725, "learning_rate": 2.9959789937085774e-05, "loss": 0.9964, "step": 482 }, { "epoch": 0.2542105263157895, "grad_norm": 2.2105233669281006, "learning_rate": 2.995960786707245e-05, "loss": 0.844, "step": 483 }, { "epoch": 0.25473684210526315, "grad_norm": 1.6293426752090454, "learning_rate": 2.9959425386341544e-05, "loss": 1.1284, "step": 484 }, { "epoch": 0.25526315789473686, "grad_norm": 5.280759334564209, "learning_rate": 2.9959242494898076e-05, "loss": 1.1822, "step": 485 }, { "epoch": 0.2557894736842105, "grad_norm": 3.082880735397339, "learning_rate": 2.9959059192747064e-05, "loss": 0.4506, "step": 486 }, { "epoch": 0.2563157894736842, "grad_norm": 1.4250710010528564, "learning_rate": 2.995887547989354e-05, "loss": 0.8048, "step": 487 }, { "epoch": 0.25684210526315787, "grad_norm": 2.168121814727783, "learning_rate": 2.9958691356342546e-05, "loss": 1.1605, "step": 488 }, { "epoch": 0.2573684210526316, "grad_norm": 1.3707858324050903, "learning_rate": 2.995850682209914e-05, "loss": 0.9299, "step": 489 }, { "epoch": 0.2578947368421053, "grad_norm": 4.050701141357422, "learning_rate": 2.9958321877168392e-05, "loss": 0.5374, "step": 490 }, { "epoch": 0.25842105263157894, "grad_norm": 7.512131214141846, "learning_rate": 2.9958136521555373e-05, "loss": 1.5434, "step": 491 }, { "epoch": 0.25894736842105265, "grad_norm": 2.668602466583252, "learning_rate": 2.9957950755265167e-05, "loss": 1.0796, "step": 492 }, { "epoch": 0.2594736842105263, "grad_norm": 1.4869720935821533, "learning_rate": 2.995776457830289e-05, "loss": 1.2192, "step": 493 }, { "epoch": 0.26, "grad_norm": 2.3927292823791504, "learning_rate": 2.995757799067364e-05, "loss": 1.1358, "step": 494 }, { "epoch": 0.26052631578947366, "grad_norm": 1.5988667011260986, "learning_rate": 2.995739099238255e-05, "loss": 0.9472, "step": 495 }, { "epoch": 0.26105263157894737, "grad_norm": 1.5140737295150757, "learning_rate": 2.9957203583434742e-05, "loss": 0.6382, "step": 496 }, { "epoch": 0.2615789473684211, "grad_norm": 1.3366851806640625, "learning_rate": 2.9957015763835368e-05, "loss": 0.909, "step": 497 }, { "epoch": 0.26210526315789473, "grad_norm": 1.2181779146194458, "learning_rate": 2.9956827533589592e-05, "loss": 0.8209, "step": 498 }, { "epoch": 0.26263157894736844, "grad_norm": 5.791037559509277, "learning_rate": 2.995663889270257e-05, "loss": 0.7449, "step": 499 }, { "epoch": 0.2631578947368421, "grad_norm": 8.648612976074219, "learning_rate": 2.9956449841179486e-05, "loss": 1.3843, "step": 500 }, { "epoch": 0.2636842105263158, "grad_norm": 4.5848307609558105, "learning_rate": 2.995626037902553e-05, "loss": 0.9603, "step": 501 }, { "epoch": 0.26421052631578945, "grad_norm": 1.4494788646697998, "learning_rate": 2.99560705062459e-05, "loss": 0.7254, "step": 502 }, { "epoch": 0.26473684210526316, "grad_norm": 16.663463592529297, "learning_rate": 2.9955880222845818e-05, "loss": 0.3266, "step": 503 }, { "epoch": 0.26526315789473687, "grad_norm": 1.475623607635498, "learning_rate": 2.99556895288305e-05, "loss": 1.3868, "step": 504 }, { "epoch": 0.2657894736842105, "grad_norm": 1.3773105144500732, "learning_rate": 2.995549842420519e-05, "loss": 0.1299, "step": 505 }, { "epoch": 0.26631578947368423, "grad_norm": 1.1917214393615723, "learning_rate": 2.9955306908975126e-05, "loss": 1.0726, "step": 506 }, { "epoch": 0.2668421052631579, "grad_norm": 5.162087917327881, "learning_rate": 2.9955114983145567e-05, "loss": 1.251, "step": 507 }, { "epoch": 0.2673684210526316, "grad_norm": 1.6344598531723022, "learning_rate": 2.9954922646721785e-05, "loss": 1.1682, "step": 508 }, { "epoch": 0.26789473684210524, "grad_norm": 4.443221569061279, "learning_rate": 2.9954729899709065e-05, "loss": 1.285, "step": 509 }, { "epoch": 0.26842105263157895, "grad_norm": 1.5260694026947021, "learning_rate": 2.9954536742112688e-05, "loss": 1.305, "step": 510 }, { "epoch": 0.26894736842105266, "grad_norm": 1.5212806463241577, "learning_rate": 2.9954343173937968e-05, "loss": 0.8619, "step": 511 }, { "epoch": 0.2694736842105263, "grad_norm": 46.06574249267578, "learning_rate": 2.9954149195190214e-05, "loss": 1.2454, "step": 512 }, { "epoch": 0.27, "grad_norm": 1.1406009197235107, "learning_rate": 2.995395480587475e-05, "loss": 0.8901, "step": 513 }, { "epoch": 0.27052631578947367, "grad_norm": 1.038522720336914, "learning_rate": 2.995376000599692e-05, "loss": 0.5812, "step": 514 }, { "epoch": 0.2710526315789474, "grad_norm": 4.814447402954102, "learning_rate": 2.9953564795562064e-05, "loss": 0.5695, "step": 515 }, { "epoch": 0.27157894736842103, "grad_norm": 3.0107245445251465, "learning_rate": 2.995336917457555e-05, "loss": 1.6017, "step": 516 }, { "epoch": 0.27210526315789474, "grad_norm": 0.9930879473686218, "learning_rate": 2.995317314304274e-05, "loss": 0.9568, "step": 517 }, { "epoch": 0.27263157894736845, "grad_norm": 24.54469108581543, "learning_rate": 2.995297670096902e-05, "loss": 0.8168, "step": 518 }, { "epoch": 0.2731578947368421, "grad_norm": 2.448007106781006, "learning_rate": 2.995277984835979e-05, "loss": 1.5648, "step": 519 }, { "epoch": 0.2736842105263158, "grad_norm": 3.251126527786255, "learning_rate": 2.995258258522044e-05, "loss": 1.4491, "step": 520 }, { "epoch": 0.27421052631578946, "grad_norm": 1.0893594026565552, "learning_rate": 2.9952384911556394e-05, "loss": 1.3153, "step": 521 }, { "epoch": 0.27473684210526317, "grad_norm": 4.283177375793457, "learning_rate": 2.9952186827373085e-05, "loss": 0.2124, "step": 522 }, { "epoch": 0.2752631578947368, "grad_norm": 3.8956921100616455, "learning_rate": 2.995198833267594e-05, "loss": 1.0426, "step": 523 }, { "epoch": 0.27578947368421053, "grad_norm": 4.345573425292969, "learning_rate": 2.995178942747042e-05, "loss": 0.2201, "step": 524 }, { "epoch": 0.27631578947368424, "grad_norm": 1.2920668125152588, "learning_rate": 2.9951590111761975e-05, "loss": 1.4519, "step": 525 }, { "epoch": 0.2768421052631579, "grad_norm": 1.0732390880584717, "learning_rate": 2.9951390385556084e-05, "loss": 0.8767, "step": 526 }, { "epoch": 0.2773684210526316, "grad_norm": 1.2250876426696777, "learning_rate": 2.995119024885823e-05, "loss": 0.6662, "step": 527 }, { "epoch": 0.27789473684210525, "grad_norm": 4.883981704711914, "learning_rate": 2.9950989701673906e-05, "loss": 1.5669, "step": 528 }, { "epoch": 0.27842105263157896, "grad_norm": 1.9038349390029907, "learning_rate": 2.995078874400862e-05, "loss": 1.1141, "step": 529 }, { "epoch": 0.2789473684210526, "grad_norm": 8.930418968200684, "learning_rate": 2.9950587375867887e-05, "loss": 1.2227, "step": 530 }, { "epoch": 0.2794736842105263, "grad_norm": 2.2693824768066406, "learning_rate": 2.995038559725724e-05, "loss": 1.0804, "step": 531 }, { "epoch": 0.28, "grad_norm": 2.965463161468506, "learning_rate": 2.995018340818221e-05, "loss": 1.2129, "step": 532 }, { "epoch": 0.2805263157894737, "grad_norm": 2.060148000717163, "learning_rate": 2.9949980808648357e-05, "loss": 0.6663, "step": 533 }, { "epoch": 0.2810526315789474, "grad_norm": 1.0076817274093628, "learning_rate": 2.9949777798661236e-05, "loss": 0.7417, "step": 534 }, { "epoch": 0.28157894736842104, "grad_norm": 2.449284791946411, "learning_rate": 2.9949574378226426e-05, "loss": 1.5616, "step": 535 }, { "epoch": 0.28210526315789475, "grad_norm": 1.3279973268508911, "learning_rate": 2.994937054734952e-05, "loss": 0.9661, "step": 536 }, { "epoch": 0.2826315789473684, "grad_norm": 3.356504201889038, "learning_rate": 2.9949166306036095e-05, "loss": 1.584, "step": 537 }, { "epoch": 0.2831578947368421, "grad_norm": 1.2326231002807617, "learning_rate": 2.994896165429177e-05, "loss": 0.9024, "step": 538 }, { "epoch": 0.2836842105263158, "grad_norm": 7.02909517288208, "learning_rate": 2.994875659212216e-05, "loss": 1.0278, "step": 539 }, { "epoch": 0.28421052631578947, "grad_norm": 1.6224465370178223, "learning_rate": 2.9948551119532902e-05, "loss": 1.278, "step": 540 }, { "epoch": 0.2847368421052632, "grad_norm": 1.3300915956497192, "learning_rate": 2.994834523652963e-05, "loss": 1.1461, "step": 541 }, { "epoch": 0.28526315789473683, "grad_norm": 1.0725505352020264, "learning_rate": 2.9948138943117996e-05, "loss": 0.9856, "step": 542 }, { "epoch": 0.28578947368421054, "grad_norm": 1.2995253801345825, "learning_rate": 2.9947932239303673e-05, "loss": 0.922, "step": 543 }, { "epoch": 0.2863157894736842, "grad_norm": 3.25260066986084, "learning_rate": 2.9947725125092326e-05, "loss": 1.5961, "step": 544 }, { "epoch": 0.2868421052631579, "grad_norm": 1.5616159439086914, "learning_rate": 2.9947517600489643e-05, "loss": 1.4245, "step": 545 }, { "epoch": 0.2873684210526316, "grad_norm": 14.093785285949707, "learning_rate": 2.9947309665501327e-05, "loss": 0.9921, "step": 546 }, { "epoch": 0.28789473684210526, "grad_norm": 1.165595293045044, "learning_rate": 2.9947101320133083e-05, "loss": 1.0, "step": 547 }, { "epoch": 0.28842105263157897, "grad_norm": 2.5077242851257324, "learning_rate": 2.994689256439063e-05, "loss": 1.5877, "step": 548 }, { "epoch": 0.2889473684210526, "grad_norm": 1.8317209482192993, "learning_rate": 2.9946683398279706e-05, "loss": 1.0294, "step": 549 }, { "epoch": 0.2894736842105263, "grad_norm": 1.0942083597183228, "learning_rate": 2.9946473821806044e-05, "loss": 0.4813, "step": 550 }, { "epoch": 0.29, "grad_norm": 1.6907622814178467, "learning_rate": 2.9946263834975403e-05, "loss": 0.9892, "step": 551 }, { "epoch": 0.2905263157894737, "grad_norm": 9.270956993103027, "learning_rate": 2.994605343779355e-05, "loss": 0.6519, "step": 552 }, { "epoch": 0.2910526315789474, "grad_norm": 3.074319362640381, "learning_rate": 2.9945842630266258e-05, "loss": 0.6775, "step": 553 }, { "epoch": 0.29157894736842105, "grad_norm": 1.4713774919509888, "learning_rate": 2.994563141239932e-05, "loss": 1.1378, "step": 554 }, { "epoch": 0.29210526315789476, "grad_norm": 3.8262014389038086, "learning_rate": 2.9945419784198524e-05, "loss": 1.2291, "step": 555 }, { "epoch": 0.2926315789473684, "grad_norm": 3.6956279277801514, "learning_rate": 2.9945207745669695e-05, "loss": 0.2079, "step": 556 }, { "epoch": 0.2931578947368421, "grad_norm": 1.555780053138733, "learning_rate": 2.9944995296818642e-05, "loss": 0.9855, "step": 557 }, { "epoch": 0.29368421052631577, "grad_norm": 7.439387321472168, "learning_rate": 2.9944782437651203e-05, "loss": 0.1662, "step": 558 }, { "epoch": 0.2942105263157895, "grad_norm": 2.846749782562256, "learning_rate": 2.994456916817322e-05, "loss": 0.4499, "step": 559 }, { "epoch": 0.29473684210526313, "grad_norm": 2.7918450832366943, "learning_rate": 2.9944355488390555e-05, "loss": 0.0985, "step": 560 }, { "epoch": 0.29526315789473684, "grad_norm": 3.7580766677856445, "learning_rate": 2.9944141398309067e-05, "loss": 0.7624, "step": 561 }, { "epoch": 0.29578947368421055, "grad_norm": 1.398541808128357, "learning_rate": 2.994392689793464e-05, "loss": 0.7911, "step": 562 }, { "epoch": 0.2963157894736842, "grad_norm": 7.155591011047363, "learning_rate": 2.9943711987273156e-05, "loss": 0.2768, "step": 563 }, { "epoch": 0.2968421052631579, "grad_norm": 15.522272109985352, "learning_rate": 2.994349666633052e-05, "loss": 1.0977, "step": 564 }, { "epoch": 0.29736842105263156, "grad_norm": 2.732008457183838, "learning_rate": 2.9943280935112644e-05, "loss": 0.749, "step": 565 }, { "epoch": 0.29789473684210527, "grad_norm": 1.2546172142028809, "learning_rate": 2.994306479362545e-05, "loss": 0.823, "step": 566 }, { "epoch": 0.2984210526315789, "grad_norm": 1.2818124294281006, "learning_rate": 2.994284824187487e-05, "loss": 1.1513, "step": 567 }, { "epoch": 0.29894736842105263, "grad_norm": 1.5917184352874756, "learning_rate": 2.9942631279866853e-05, "loss": 0.8293, "step": 568 }, { "epoch": 0.29947368421052634, "grad_norm": 1.2113322019577026, "learning_rate": 2.9942413907607355e-05, "loss": 0.3476, "step": 569 }, { "epoch": 0.3, "grad_norm": 2.0724539756774902, "learning_rate": 2.994219612510234e-05, "loss": 0.6935, "step": 570 }, { "epoch": 0.3005263157894737, "grad_norm": 2.444352149963379, "learning_rate": 2.9941977932357794e-05, "loss": 0.1079, "step": 571 }, { "epoch": 0.30105263157894735, "grad_norm": 4.327816486358643, "learning_rate": 2.99417593293797e-05, "loss": 0.5792, "step": 572 }, { "epoch": 0.30157894736842106, "grad_norm": 1.2691162824630737, "learning_rate": 2.9941540316174066e-05, "loss": 0.9346, "step": 573 }, { "epoch": 0.3021052631578947, "grad_norm": 2.0975606441497803, "learning_rate": 2.99413208927469e-05, "loss": 1.1176, "step": 574 }, { "epoch": 0.3026315789473684, "grad_norm": 1.5340697765350342, "learning_rate": 2.994110105910423e-05, "loss": 0.862, "step": 575 }, { "epoch": 0.3031578947368421, "grad_norm": 16.70974349975586, "learning_rate": 2.9940880815252097e-05, "loss": 0.6121, "step": 576 }, { "epoch": 0.3036842105263158, "grad_norm": 3.6483004093170166, "learning_rate": 2.9940660161196536e-05, "loss": 1.0742, "step": 577 }, { "epoch": 0.3042105263157895, "grad_norm": 1.5293868780136108, "learning_rate": 2.994043909694361e-05, "loss": 0.7131, "step": 578 }, { "epoch": 0.30473684210526314, "grad_norm": 3.1697440147399902, "learning_rate": 2.994021762249939e-05, "loss": 0.7688, "step": 579 }, { "epoch": 0.30526315789473685, "grad_norm": 11.020563125610352, "learning_rate": 2.993999573786995e-05, "loss": 2.4059, "step": 580 }, { "epoch": 0.3057894736842105, "grad_norm": 1.883289098739624, "learning_rate": 2.9939773443061393e-05, "loss": 1.0566, "step": 581 }, { "epoch": 0.3063157894736842, "grad_norm": 1.52479887008667, "learning_rate": 2.9939550738079814e-05, "loss": 1.0846, "step": 582 }, { "epoch": 0.3068421052631579, "grad_norm": 4.475525856018066, "learning_rate": 2.9939327622931333e-05, "loss": 0.6157, "step": 583 }, { "epoch": 0.30736842105263157, "grad_norm": 1.8167206048965454, "learning_rate": 2.993910409762207e-05, "loss": 1.3638, "step": 584 }, { "epoch": 0.3078947368421053, "grad_norm": 1.2058393955230713, "learning_rate": 2.9938880162158164e-05, "loss": 1.0466, "step": 585 }, { "epoch": 0.30842105263157893, "grad_norm": 8.277087211608887, "learning_rate": 2.9938655816545764e-05, "loss": 1.4132, "step": 586 }, { "epoch": 0.30894736842105264, "grad_norm": 2.545767307281494, "learning_rate": 2.993843106079103e-05, "loss": 0.7397, "step": 587 }, { "epoch": 0.3094736842105263, "grad_norm": 1.162050485610962, "learning_rate": 2.993820589490013e-05, "loss": 1.1346, "step": 588 }, { "epoch": 0.31, "grad_norm": 1.3114622831344604, "learning_rate": 2.9937980318879247e-05, "loss": 0.9332, "step": 589 }, { "epoch": 0.3105263157894737, "grad_norm": 2.397101640701294, "learning_rate": 2.9937754332734576e-05, "loss": 0.7573, "step": 590 }, { "epoch": 0.31105263157894736, "grad_norm": 1.4673283100128174, "learning_rate": 2.993752793647232e-05, "loss": 0.8484, "step": 591 }, { "epoch": 0.31157894736842107, "grad_norm": 9.11661148071289, "learning_rate": 2.9937301130098696e-05, "loss": 0.4733, "step": 592 }, { "epoch": 0.3121052631578947, "grad_norm": 1.699437141418457, "learning_rate": 2.9937073913619926e-05, "loss": 0.5332, "step": 593 }, { "epoch": 0.3126315789473684, "grad_norm": 27.69738006591797, "learning_rate": 2.9936846287042254e-05, "loss": 2.1628, "step": 594 }, { "epoch": 0.3131578947368421, "grad_norm": 10.93797779083252, "learning_rate": 2.993661825037193e-05, "loss": 1.469, "step": 595 }, { "epoch": 0.3136842105263158, "grad_norm": 4.551391124725342, "learning_rate": 2.993638980361521e-05, "loss": 0.6163, "step": 596 }, { "epoch": 0.3142105263157895, "grad_norm": 2.5031051635742188, "learning_rate": 2.9936160946778372e-05, "loss": 1.3955, "step": 597 }, { "epoch": 0.31473684210526315, "grad_norm": 1.7328745126724243, "learning_rate": 2.993593167986769e-05, "loss": 0.7825, "step": 598 }, { "epoch": 0.31526315789473686, "grad_norm": 4.045513153076172, "learning_rate": 2.993570200288947e-05, "loss": 0.4318, "step": 599 }, { "epoch": 0.3157894736842105, "grad_norm": 2.73246169090271, "learning_rate": 2.993547191585001e-05, "loss": 1.4336, "step": 600 }, { "epoch": 0.3163157894736842, "grad_norm": 1.558140754699707, "learning_rate": 2.9935241418755626e-05, "loss": 1.0587, "step": 601 }, { "epoch": 0.31684210526315787, "grad_norm": 11.362480163574219, "learning_rate": 2.9935010511612655e-05, "loss": 0.4542, "step": 602 }, { "epoch": 0.3173684210526316, "grad_norm": 1.3051596879959106, "learning_rate": 2.9934779194427427e-05, "loss": 0.7598, "step": 603 }, { "epoch": 0.3178947368421053, "grad_norm": 3.5925891399383545, "learning_rate": 2.99345474672063e-05, "loss": 1.1406, "step": 604 }, { "epoch": 0.31842105263157894, "grad_norm": 12.116620063781738, "learning_rate": 2.993431532995563e-05, "loss": 1.519, "step": 605 }, { "epoch": 0.31894736842105265, "grad_norm": 1.2009161710739136, "learning_rate": 2.9934082782681797e-05, "loss": 0.9572, "step": 606 }, { "epoch": 0.3194736842105263, "grad_norm": 2.059299945831299, "learning_rate": 2.993384982539118e-05, "loss": 0.6667, "step": 607 }, { "epoch": 0.32, "grad_norm": 2.226381778717041, "learning_rate": 2.9933616458090174e-05, "loss": 1.2235, "step": 608 }, { "epoch": 0.32052631578947366, "grad_norm": 1.5386918783187866, "learning_rate": 2.9933382680785196e-05, "loss": 1.2193, "step": 609 }, { "epoch": 0.32105263157894737, "grad_norm": 4.095519065856934, "learning_rate": 2.9933148493482653e-05, "loss": 0.2043, "step": 610 }, { "epoch": 0.3215789473684211, "grad_norm": 16.421064376831055, "learning_rate": 2.9932913896188978e-05, "loss": 1.4538, "step": 611 }, { "epoch": 0.32210526315789473, "grad_norm": 1.1642028093338013, "learning_rate": 2.9932678888910614e-05, "loss": 0.7069, "step": 612 }, { "epoch": 0.32263157894736844, "grad_norm": 1.0497969388961792, "learning_rate": 2.9932443471654013e-05, "loss": 1.027, "step": 613 }, { "epoch": 0.3231578947368421, "grad_norm": 1.70055091381073, "learning_rate": 2.9932207644425635e-05, "loss": 1.0163, "step": 614 }, { "epoch": 0.3236842105263158, "grad_norm": 0.4913199841976166, "learning_rate": 2.993197140723196e-05, "loss": 0.0092, "step": 615 }, { "epoch": 0.32421052631578945, "grad_norm": 9.340396881103516, "learning_rate": 2.993173476007947e-05, "loss": 0.7314, "step": 616 }, { "epoch": 0.32473684210526316, "grad_norm": 3.6009857654571533, "learning_rate": 2.993149770297466e-05, "loss": 0.9824, "step": 617 }, { "epoch": 0.32526315789473687, "grad_norm": 1.5114206075668335, "learning_rate": 2.9931260235924046e-05, "loss": 0.6525, "step": 618 }, { "epoch": 0.3257894736842105, "grad_norm": 4.900846004486084, "learning_rate": 2.9931022358934144e-05, "loss": 1.0022, "step": 619 }, { "epoch": 0.3263157894736842, "grad_norm": 3.2764687538146973, "learning_rate": 2.993078407201148e-05, "loss": 1.571, "step": 620 }, { "epoch": 0.3268421052631579, "grad_norm": 1.1837842464447021, "learning_rate": 2.9930545375162602e-05, "loss": 1.2526, "step": 621 }, { "epoch": 0.3273684210526316, "grad_norm": 1.7309764623641968, "learning_rate": 2.993030626839406e-05, "loss": 0.9898, "step": 622 }, { "epoch": 0.32789473684210524, "grad_norm": 1.766406536102295, "learning_rate": 2.9930066751712427e-05, "loss": 0.6513, "step": 623 }, { "epoch": 0.32842105263157895, "grad_norm": 1.1759055852890015, "learning_rate": 2.9929826825124268e-05, "loss": 1.0862, "step": 624 }, { "epoch": 0.32894736842105265, "grad_norm": 46.77162551879883, "learning_rate": 2.9929586488636174e-05, "loss": 3.9448, "step": 625 }, { "epoch": 0.3294736842105263, "grad_norm": 1.6841140985488892, "learning_rate": 2.992934574225475e-05, "loss": 1.6885, "step": 626 }, { "epoch": 0.33, "grad_norm": 1.449564814567566, "learning_rate": 2.9929104585986594e-05, "loss": 1.354, "step": 627 }, { "epoch": 0.33052631578947367, "grad_norm": 2.3545868396759033, "learning_rate": 2.992886301983833e-05, "loss": 0.7053, "step": 628 }, { "epoch": 0.3310526315789474, "grad_norm": 1.053680181503296, "learning_rate": 2.9928621043816602e-05, "loss": 1.3087, "step": 629 }, { "epoch": 0.33157894736842103, "grad_norm": 7.419710636138916, "learning_rate": 2.9928378657928037e-05, "loss": 0.3903, "step": 630 }, { "epoch": 0.33210526315789474, "grad_norm": 1.032652497291565, "learning_rate": 2.9928135862179304e-05, "loss": 1.0834, "step": 631 }, { "epoch": 0.33263157894736844, "grad_norm": 11.945768356323242, "learning_rate": 2.9927892656577057e-05, "loss": 0.5555, "step": 632 }, { "epoch": 0.3331578947368421, "grad_norm": 3.9728310108184814, "learning_rate": 2.9927649041127978e-05, "loss": 0.2488, "step": 633 }, { "epoch": 0.3336842105263158, "grad_norm": 1.3373970985412598, "learning_rate": 2.992740501583876e-05, "loss": 1.0203, "step": 634 }, { "epoch": 0.33421052631578946, "grad_norm": 4.078335762023926, "learning_rate": 2.9927160580716096e-05, "loss": 1.0495, "step": 635 }, { "epoch": 0.33473684210526317, "grad_norm": 1.9304755926132202, "learning_rate": 2.99269157357667e-05, "loss": 1.4033, "step": 636 }, { "epoch": 0.3352631578947368, "grad_norm": 2.1133267879486084, "learning_rate": 2.99266704809973e-05, "loss": 1.1063, "step": 637 }, { "epoch": 0.3357894736842105, "grad_norm": 6.6004180908203125, "learning_rate": 2.9926424816414615e-05, "loss": 1.1123, "step": 638 }, { "epoch": 0.33631578947368423, "grad_norm": 1.2974992990493774, "learning_rate": 2.9926178742025403e-05, "loss": 2.3209, "step": 639 }, { "epoch": 0.3368421052631579, "grad_norm": 1.2676901817321777, "learning_rate": 2.992593225783641e-05, "loss": 0.9767, "step": 640 }, { "epoch": 0.3373684210526316, "grad_norm": 1.001715064048767, "learning_rate": 2.9925685363854413e-05, "loss": 0.8202, "step": 641 }, { "epoch": 0.33789473684210525, "grad_norm": 3.934710741043091, "learning_rate": 2.9925438060086187e-05, "loss": 1.4305, "step": 642 }, { "epoch": 0.33842105263157896, "grad_norm": 1.8003560304641724, "learning_rate": 2.992519034653852e-05, "loss": 0.8166, "step": 643 }, { "epoch": 0.3389473684210526, "grad_norm": 1.138811469078064, "learning_rate": 2.992494222321821e-05, "loss": 0.8121, "step": 644 }, { "epoch": 0.3394736842105263, "grad_norm": 1.1089503765106201, "learning_rate": 2.992469369013208e-05, "loss": 1.1773, "step": 645 }, { "epoch": 0.34, "grad_norm": 9.685154914855957, "learning_rate": 2.992444474728694e-05, "loss": 1.0016, "step": 646 }, { "epoch": 0.3405263157894737, "grad_norm": 1.1541014909744263, "learning_rate": 2.9924195394689635e-05, "loss": 1.0952, "step": 647 }, { "epoch": 0.3410526315789474, "grad_norm": 1.8897780179977417, "learning_rate": 2.9923945632347002e-05, "loss": 1.2069, "step": 648 }, { "epoch": 0.34157894736842104, "grad_norm": 3.347874879837036, "learning_rate": 2.9923695460265912e-05, "loss": 1.0908, "step": 649 }, { "epoch": 0.34210526315789475, "grad_norm": 12.232775688171387, "learning_rate": 2.992344487845322e-05, "loss": 1.9784, "step": 650 }, { "epoch": 0.3426315789473684, "grad_norm": 2.2362685203552246, "learning_rate": 2.992319388691581e-05, "loss": 0.3282, "step": 651 }, { "epoch": 0.3431578947368421, "grad_norm": 1.7842745780944824, "learning_rate": 2.9922942485660577e-05, "loss": 1.34, "step": 652 }, { "epoch": 0.3436842105263158, "grad_norm": 4.246903419494629, "learning_rate": 2.9922690674694418e-05, "loss": 0.8923, "step": 653 }, { "epoch": 0.34421052631578947, "grad_norm": 5.96973991394043, "learning_rate": 2.9922438454024246e-05, "loss": 0.6434, "step": 654 }, { "epoch": 0.3447368421052632, "grad_norm": 3.631617784500122, "learning_rate": 2.992218582365699e-05, "loss": 0.9737, "step": 655 }, { "epoch": 0.3452631578947368, "grad_norm": 11.352360725402832, "learning_rate": 2.9921932783599585e-05, "loss": 1.8412, "step": 656 }, { "epoch": 0.34578947368421054, "grad_norm": 1.3203455209732056, "learning_rate": 2.9921679333858976e-05, "loss": 1.0928, "step": 657 }, { "epoch": 0.3463157894736842, "grad_norm": 1.1191567182540894, "learning_rate": 2.9921425474442127e-05, "loss": 1.4272, "step": 658 }, { "epoch": 0.3468421052631579, "grad_norm": 2.8861405849456787, "learning_rate": 2.9921171205356e-05, "loss": 0.9221, "step": 659 }, { "epoch": 0.3473684210526316, "grad_norm": 2.145612955093384, "learning_rate": 2.992091652660758e-05, "loss": 0.3259, "step": 660 }, { "epoch": 0.34789473684210526, "grad_norm": 3.5325045585632324, "learning_rate": 2.9920661438203862e-05, "loss": 2.93, "step": 661 }, { "epoch": 0.34842105263157896, "grad_norm": 1.2416492700576782, "learning_rate": 2.9920405940151842e-05, "loss": 1.2741, "step": 662 }, { "epoch": 0.3489473684210526, "grad_norm": 2.492912530899048, "learning_rate": 2.9920150032458538e-05, "loss": 0.9804, "step": 663 }, { "epoch": 0.3494736842105263, "grad_norm": 10.124689102172852, "learning_rate": 2.9919893715130983e-05, "loss": 1.3878, "step": 664 }, { "epoch": 0.35, "grad_norm": 1.77699613571167, "learning_rate": 2.9919636988176208e-05, "loss": 1.2149, "step": 665 }, { "epoch": 0.3505263157894737, "grad_norm": 1.3935060501098633, "learning_rate": 2.9919379851601256e-05, "loss": 1.2551, "step": 666 }, { "epoch": 0.3510526315789474, "grad_norm": 5.249575138092041, "learning_rate": 2.9919122305413196e-05, "loss": 0.7921, "step": 667 }, { "epoch": 0.35157894736842105, "grad_norm": 3.862658739089966, "learning_rate": 2.9918864349619094e-05, "loss": 0.269, "step": 668 }, { "epoch": 0.35210526315789475, "grad_norm": 4.229588508605957, "learning_rate": 2.991860598422604e-05, "loss": 0.9079, "step": 669 }, { "epoch": 0.3526315789473684, "grad_norm": 8.937419891357422, "learning_rate": 2.9918347209241116e-05, "loss": 0.694, "step": 670 }, { "epoch": 0.3531578947368421, "grad_norm": 9.598597526550293, "learning_rate": 2.9918088024671428e-05, "loss": 0.4956, "step": 671 }, { "epoch": 0.35368421052631577, "grad_norm": 11.96364974975586, "learning_rate": 2.99178284305241e-05, "loss": 1.865, "step": 672 }, { "epoch": 0.3542105263157895, "grad_norm": 8.622352600097656, "learning_rate": 2.9917568426806253e-05, "loss": 1.3497, "step": 673 }, { "epoch": 0.3547368421052632, "grad_norm": 1.6212915182113647, "learning_rate": 2.991730801352503e-05, "loss": 1.0978, "step": 674 }, { "epoch": 0.35526315789473684, "grad_norm": 1.9762486219406128, "learning_rate": 2.9917047190687578e-05, "loss": 1.0297, "step": 675 }, { "epoch": 0.35578947368421054, "grad_norm": 1.137174129486084, "learning_rate": 2.991678595830106e-05, "loss": 1.3445, "step": 676 }, { "epoch": 0.3563157894736842, "grad_norm": 1.2727190256118774, "learning_rate": 2.991652431637264e-05, "loss": 0.6727, "step": 677 }, { "epoch": 0.3568421052631579, "grad_norm": 3.9113903045654297, "learning_rate": 2.991626226490951e-05, "loss": 1.0311, "step": 678 }, { "epoch": 0.35736842105263156, "grad_norm": 3.078036069869995, "learning_rate": 2.9915999803918862e-05, "loss": 1.232, "step": 679 }, { "epoch": 0.35789473684210527, "grad_norm": 2.5189783573150635, "learning_rate": 2.99157369334079e-05, "loss": 1.1612, "step": 680 }, { "epoch": 0.358421052631579, "grad_norm": 3.2792701721191406, "learning_rate": 2.991547365338385e-05, "loss": 0.1674, "step": 681 }, { "epoch": 0.3589473684210526, "grad_norm": 2.9844839572906494, "learning_rate": 2.9915209963853928e-05, "loss": 0.2604, "step": 682 }, { "epoch": 0.35947368421052633, "grad_norm": 1.3294965028762817, "learning_rate": 2.991494586482538e-05, "loss": 1.1524, "step": 683 }, { "epoch": 0.36, "grad_norm": 7.3986382484436035, "learning_rate": 2.9914681356305458e-05, "loss": 1.324, "step": 684 }, { "epoch": 0.3605263157894737, "grad_norm": 2.5278208255767822, "learning_rate": 2.991441643830142e-05, "loss": 0.7384, "step": 685 }, { "epoch": 0.36105263157894735, "grad_norm": 1.2104181051254272, "learning_rate": 2.991415111082054e-05, "loss": 1.3023, "step": 686 }, { "epoch": 0.36157894736842106, "grad_norm": 1.2583531141281128, "learning_rate": 2.9913885373870108e-05, "loss": 1.1101, "step": 687 }, { "epoch": 0.36210526315789476, "grad_norm": 1.007744312286377, "learning_rate": 2.9913619227457413e-05, "loss": 1.1889, "step": 688 }, { "epoch": 0.3626315789473684, "grad_norm": 6.841166019439697, "learning_rate": 2.991335267158977e-05, "loss": 1.5798, "step": 689 }, { "epoch": 0.3631578947368421, "grad_norm": 4.756072044372559, "learning_rate": 2.9913085706274485e-05, "loss": 0.4925, "step": 690 }, { "epoch": 0.3636842105263158, "grad_norm": 2.5629639625549316, "learning_rate": 2.99128183315189e-05, "loss": 0.872, "step": 691 }, { "epoch": 0.3642105263157895, "grad_norm": 5.531483173370361, "learning_rate": 2.9912550547330348e-05, "loss": 0.8287, "step": 692 }, { "epoch": 0.36473684210526314, "grad_norm": 1.3215042352676392, "learning_rate": 2.9912282353716184e-05, "loss": 1.0156, "step": 693 }, { "epoch": 0.36526315789473685, "grad_norm": 2.0720999240875244, "learning_rate": 2.9912013750683773e-05, "loss": 1.2208, "step": 694 }, { "epoch": 0.36578947368421055, "grad_norm": 1.3862260580062866, "learning_rate": 2.9911744738240487e-05, "loss": 1.0124, "step": 695 }, { "epoch": 0.3663157894736842, "grad_norm": 9.334665298461914, "learning_rate": 2.991147531639371e-05, "loss": 0.9209, "step": 696 }, { "epoch": 0.3668421052631579, "grad_norm": 6.73415994644165, "learning_rate": 2.9911205485150846e-05, "loss": 0.4709, "step": 697 }, { "epoch": 0.36736842105263157, "grad_norm": 3.3749637603759766, "learning_rate": 2.9910935244519294e-05, "loss": 1.026, "step": 698 }, { "epoch": 0.3678947368421053, "grad_norm": 7.481809139251709, "learning_rate": 2.991066459450648e-05, "loss": 0.6787, "step": 699 }, { "epoch": 0.3684210526315789, "grad_norm": 1.9085243940353394, "learning_rate": 2.991039353511983e-05, "loss": 1.2883, "step": 700 }, { "epoch": 0.36894736842105263, "grad_norm": 1.1209938526153564, "learning_rate": 2.991012206636679e-05, "loss": 1.3699, "step": 701 }, { "epoch": 0.36947368421052634, "grad_norm": 5.468508243560791, "learning_rate": 2.9909850188254814e-05, "loss": 0.5472, "step": 702 }, { "epoch": 0.37, "grad_norm": 1.069633960723877, "learning_rate": 2.9909577900791367e-05, "loss": 0.8953, "step": 703 }, { "epoch": 0.3705263157894737, "grad_norm": 1.5387980937957764, "learning_rate": 2.9909305203983916e-05, "loss": 1.1598, "step": 704 }, { "epoch": 0.37105263157894736, "grad_norm": 2.226820230484009, "learning_rate": 2.9909032097839958e-05, "loss": 1.1898, "step": 705 }, { "epoch": 0.37157894736842106, "grad_norm": 1.4225033521652222, "learning_rate": 2.9908758582366985e-05, "loss": 1.3026, "step": 706 }, { "epoch": 0.3721052631578947, "grad_norm": 5.471367835998535, "learning_rate": 2.9908484657572507e-05, "loss": 0.5267, "step": 707 }, { "epoch": 0.3726315789473684, "grad_norm": 2.0702991485595703, "learning_rate": 2.9908210323464047e-05, "loss": 1.0605, "step": 708 }, { "epoch": 0.37315789473684213, "grad_norm": 8.954567909240723, "learning_rate": 2.9907935580049136e-05, "loss": 0.5651, "step": 709 }, { "epoch": 0.3736842105263158, "grad_norm": 1.6635040044784546, "learning_rate": 2.9907660427335324e-05, "loss": 0.6742, "step": 710 }, { "epoch": 0.3742105263157895, "grad_norm": 2.7905099391937256, "learning_rate": 2.990738486533015e-05, "loss": 0.1368, "step": 711 }, { "epoch": 0.37473684210526315, "grad_norm": 2.07755970954895, "learning_rate": 2.990710889404119e-05, "loss": 0.3577, "step": 712 }, { "epoch": 0.37526315789473685, "grad_norm": 12.278775215148926, "learning_rate": 2.9906832513476022e-05, "loss": 1.7846, "step": 713 }, { "epoch": 0.3757894736842105, "grad_norm": 2.079127073287964, "learning_rate": 2.990655572364223e-05, "loss": 1.0179, "step": 714 }, { "epoch": 0.3763157894736842, "grad_norm": 3.017181396484375, "learning_rate": 2.990627852454741e-05, "loss": 1.0674, "step": 715 }, { "epoch": 0.37684210526315787, "grad_norm": 4.084104537963867, "learning_rate": 2.9906000916199182e-05, "loss": 1.392, "step": 716 }, { "epoch": 0.3773684210526316, "grad_norm": 2.539130210876465, "learning_rate": 2.9905722898605162e-05, "loss": 1.4383, "step": 717 }, { "epoch": 0.3778947368421053, "grad_norm": 2.1830291748046875, "learning_rate": 2.9905444471772978e-05, "loss": 2.1873, "step": 718 }, { "epoch": 0.37842105263157894, "grad_norm": 2.0604703426361084, "learning_rate": 2.9905165635710286e-05, "loss": 1.3176, "step": 719 }, { "epoch": 0.37894736842105264, "grad_norm": 1.2084317207336426, "learning_rate": 2.990488639042473e-05, "loss": 0.9763, "step": 720 }, { "epoch": 0.3794736842105263, "grad_norm": 4.228016376495361, "learning_rate": 2.9904606735923988e-05, "loss": 1.2977, "step": 721 }, { "epoch": 0.38, "grad_norm": 7.167438983917236, "learning_rate": 2.990432667221573e-05, "loss": 2.1188, "step": 722 }, { "epoch": 0.38052631578947366, "grad_norm": 2.2469797134399414, "learning_rate": 2.9904046199307645e-05, "loss": 1.2899, "step": 723 }, { "epoch": 0.38105263157894737, "grad_norm": 1.2914834022521973, "learning_rate": 2.9903765317207436e-05, "loss": 1.1014, "step": 724 }, { "epoch": 0.3815789473684211, "grad_norm": 1.4903666973114014, "learning_rate": 2.9903484025922815e-05, "loss": 1.505, "step": 725 }, { "epoch": 0.3821052631578947, "grad_norm": 7.811427593231201, "learning_rate": 2.9903202325461504e-05, "loss": 1.1364, "step": 726 }, { "epoch": 0.38263157894736843, "grad_norm": 1.9753170013427734, "learning_rate": 2.9902920215831238e-05, "loss": 1.3579, "step": 727 }, { "epoch": 0.3831578947368421, "grad_norm": 4.232260227203369, "learning_rate": 2.990263769703976e-05, "loss": 0.3801, "step": 728 }, { "epoch": 0.3836842105263158, "grad_norm": 4.112211227416992, "learning_rate": 2.9902354769094828e-05, "loss": 1.2913, "step": 729 }, { "epoch": 0.38421052631578945, "grad_norm": 1.6348377466201782, "learning_rate": 2.990207143200421e-05, "loss": 0.8208, "step": 730 }, { "epoch": 0.38473684210526315, "grad_norm": 2.1267893314361572, "learning_rate": 2.9901787685775682e-05, "loss": 1.4784, "step": 731 }, { "epoch": 0.38526315789473686, "grad_norm": 1.5363768339157104, "learning_rate": 2.990150353041704e-05, "loss": 1.1479, "step": 732 }, { "epoch": 0.3857894736842105, "grad_norm": 1.329637885093689, "learning_rate": 2.9901218965936085e-05, "loss": 1.1127, "step": 733 }, { "epoch": 0.3863157894736842, "grad_norm": 5.673161029815674, "learning_rate": 2.9900933992340627e-05, "loss": 1.6329, "step": 734 }, { "epoch": 0.3868421052631579, "grad_norm": 2.2011849880218506, "learning_rate": 2.9900648609638487e-05, "loss": 0.7907, "step": 735 }, { "epoch": 0.3873684210526316, "grad_norm": 8.786739349365234, "learning_rate": 2.9900362817837506e-05, "loss": 1.9727, "step": 736 }, { "epoch": 0.38789473684210524, "grad_norm": 1.4037230014801025, "learning_rate": 2.9900076616945527e-05, "loss": 0.0393, "step": 737 }, { "epoch": 0.38842105263157894, "grad_norm": 1.1987260580062866, "learning_rate": 2.989979000697041e-05, "loss": 0.5779, "step": 738 }, { "epoch": 0.38894736842105265, "grad_norm": 1.5265027284622192, "learning_rate": 2.989950298792002e-05, "loss": 1.1865, "step": 739 }, { "epoch": 0.3894736842105263, "grad_norm": 2.8836402893066406, "learning_rate": 2.9899215559802243e-05, "loss": 1.974, "step": 740 }, { "epoch": 0.39, "grad_norm": 1.7887353897094727, "learning_rate": 2.9898927722624966e-05, "loss": 0.1404, "step": 741 }, { "epoch": 0.39052631578947367, "grad_norm": 6.494203567504883, "learning_rate": 2.9898639476396095e-05, "loss": 0.5194, "step": 742 }, { "epoch": 0.3910526315789474, "grad_norm": 2.408672332763672, "learning_rate": 2.9898350821123536e-05, "loss": 0.6292, "step": 743 }, { "epoch": 0.391578947368421, "grad_norm": 1.4377915859222412, "learning_rate": 2.989806175681523e-05, "loss": 1.4094, "step": 744 }, { "epoch": 0.39210526315789473, "grad_norm": 1.4970815181732178, "learning_rate": 2.9897772283479092e-05, "loss": 0.8209, "step": 745 }, { "epoch": 0.39263157894736844, "grad_norm": 2.360936403274536, "learning_rate": 2.9897482401123088e-05, "loss": 1.5443, "step": 746 }, { "epoch": 0.3931578947368421, "grad_norm": 5.188052177429199, "learning_rate": 2.9897192109755162e-05, "loss": 1.5809, "step": 747 }, { "epoch": 0.3936842105263158, "grad_norm": 1.4602450132369995, "learning_rate": 2.9896901409383296e-05, "loss": 0.0463, "step": 748 }, { "epoch": 0.39421052631578946, "grad_norm": 4.554551601409912, "learning_rate": 2.9896610300015463e-05, "loss": 0.9516, "step": 749 }, { "epoch": 0.39473684210526316, "grad_norm": 1.1301685571670532, "learning_rate": 2.9896318781659662e-05, "loss": 1.0014, "step": 750 }, { "epoch": 0.3952631578947368, "grad_norm": 1.8906985521316528, "learning_rate": 2.9896026854323896e-05, "loss": 1.1485, "step": 751 }, { "epoch": 0.3957894736842105, "grad_norm": 9.192191123962402, "learning_rate": 2.9895734518016174e-05, "loss": 0.8871, "step": 752 }, { "epoch": 0.39631578947368423, "grad_norm": 1.4287827014923096, "learning_rate": 2.9895441772744526e-05, "loss": 1.3338, "step": 753 }, { "epoch": 0.3968421052631579, "grad_norm": 1.5386351346969604, "learning_rate": 2.989514861851699e-05, "loss": 0.8937, "step": 754 }, { "epoch": 0.3973684210526316, "grad_norm": 4.3271379470825195, "learning_rate": 2.989485505534161e-05, "loss": 0.9811, "step": 755 }, { "epoch": 0.39789473684210525, "grad_norm": 13.09150505065918, "learning_rate": 2.9894561083226452e-05, "loss": 1.4184, "step": 756 }, { "epoch": 0.39842105263157895, "grad_norm": 2.528856039047241, "learning_rate": 2.9894266702179586e-05, "loss": 1.3771, "step": 757 }, { "epoch": 0.3989473684210526, "grad_norm": 1.3942149877548218, "learning_rate": 2.989397191220909e-05, "loss": 1.0354, "step": 758 }, { "epoch": 0.3994736842105263, "grad_norm": 1.3833954334259033, "learning_rate": 2.989367671332306e-05, "loss": 1.2427, "step": 759 }, { "epoch": 0.4, "grad_norm": 2.9078752994537354, "learning_rate": 2.98933811055296e-05, "loss": 0.2044, "step": 760 }, { "epoch": 0.4005263157894737, "grad_norm": 4.573373794555664, "learning_rate": 2.9893085088836828e-05, "loss": 1.4173, "step": 761 }, { "epoch": 0.4010526315789474, "grad_norm": 1.6260179281234741, "learning_rate": 2.989278866325287e-05, "loss": 0.8061, "step": 762 }, { "epoch": 0.40157894736842104, "grad_norm": 53.73946762084961, "learning_rate": 2.9892491828785866e-05, "loss": 5.7539, "step": 763 }, { "epoch": 0.40210526315789474, "grad_norm": 1.7073310613632202, "learning_rate": 2.9892194585443964e-05, "loss": 1.3272, "step": 764 }, { "epoch": 0.4026315789473684, "grad_norm": 1.7463268041610718, "learning_rate": 2.9891896933235324e-05, "loss": 0.6709, "step": 765 }, { "epoch": 0.4031578947368421, "grad_norm": 1.671372652053833, "learning_rate": 2.9891598872168116e-05, "loss": 1.0062, "step": 766 }, { "epoch": 0.4036842105263158, "grad_norm": 3.4793472290039062, "learning_rate": 2.989130040225053e-05, "loss": 1.4948, "step": 767 }, { "epoch": 0.40421052631578946, "grad_norm": 2.485945701599121, "learning_rate": 2.9891001523490754e-05, "loss": 1.3262, "step": 768 }, { "epoch": 0.4047368421052632, "grad_norm": 1.4725351333618164, "learning_rate": 2.9890702235897e-05, "loss": 1.1616, "step": 769 }, { "epoch": 0.4052631578947368, "grad_norm": 2.2476260662078857, "learning_rate": 2.9890402539477476e-05, "loss": 1.4187, "step": 770 }, { "epoch": 0.40578947368421053, "grad_norm": 0.10535408556461334, "learning_rate": 2.9890102434240415e-05, "loss": 0.002, "step": 771 }, { "epoch": 0.4063157894736842, "grad_norm": 1.7888716459274292, "learning_rate": 2.9889801920194062e-05, "loss": 1.9264, "step": 772 }, { "epoch": 0.4068421052631579, "grad_norm": 3.374720573425293, "learning_rate": 2.988950099734666e-05, "loss": 0.3194, "step": 773 }, { "epoch": 0.4073684210526316, "grad_norm": 1.5872454643249512, "learning_rate": 2.9889199665706475e-05, "loss": 0.8289, "step": 774 }, { "epoch": 0.40789473684210525, "grad_norm": 4.640042304992676, "learning_rate": 2.988889792528178e-05, "loss": 0.8441, "step": 775 }, { "epoch": 0.40842105263157896, "grad_norm": 5.191117763519287, "learning_rate": 2.9888595776080856e-05, "loss": 2.0781, "step": 776 }, { "epoch": 0.4089473684210526, "grad_norm": 1.595027208328247, "learning_rate": 2.9888293218111998e-05, "loss": 1.5206, "step": 777 }, { "epoch": 0.4094736842105263, "grad_norm": 2.689854145050049, "learning_rate": 2.988799025138352e-05, "loss": 0.7231, "step": 778 }, { "epoch": 0.41, "grad_norm": 16.25735092163086, "learning_rate": 2.988768687590373e-05, "loss": 1.6409, "step": 779 }, { "epoch": 0.4105263157894737, "grad_norm": 1.3042892217636108, "learning_rate": 2.9887383091680964e-05, "loss": 1.0157, "step": 780 }, { "epoch": 0.4110526315789474, "grad_norm": 5.346875190734863, "learning_rate": 2.9887078898723564e-05, "loss": 0.657, "step": 781 }, { "epoch": 0.41157894736842104, "grad_norm": 1.5279061794281006, "learning_rate": 2.9886774297039878e-05, "loss": 1.4934, "step": 782 }, { "epoch": 0.41210526315789475, "grad_norm": 12.0784330368042, "learning_rate": 2.9886469286638265e-05, "loss": 3.1849, "step": 783 }, { "epoch": 0.4126315789473684, "grad_norm": 1.3375122547149658, "learning_rate": 2.9886163867527107e-05, "loss": 1.1898, "step": 784 }, { "epoch": 0.4131578947368421, "grad_norm": 4.016144275665283, "learning_rate": 2.9885858039714786e-05, "loss": 0.863, "step": 785 }, { "epoch": 0.41368421052631577, "grad_norm": 2.205333948135376, "learning_rate": 2.98855518032097e-05, "loss": 0.8202, "step": 786 }, { "epoch": 0.4142105263157895, "grad_norm": 2.130064010620117, "learning_rate": 2.988524515802025e-05, "loss": 1.7152, "step": 787 }, { "epoch": 0.4147368421052632, "grad_norm": 2.0593249797821045, "learning_rate": 2.9884938104154864e-05, "loss": 1.2797, "step": 788 }, { "epoch": 0.41526315789473683, "grad_norm": 4.233097553253174, "learning_rate": 2.9884630641621963e-05, "loss": 1.3987, "step": 789 }, { "epoch": 0.41578947368421054, "grad_norm": 6.048382759094238, "learning_rate": 2.9884322770429996e-05, "loss": 1.7027, "step": 790 }, { "epoch": 0.4163157894736842, "grad_norm": 4.876536846160889, "learning_rate": 2.9884014490587418e-05, "loss": 1.1787, "step": 791 }, { "epoch": 0.4168421052631579, "grad_norm": 3.066049337387085, "learning_rate": 2.9883705802102684e-05, "loss": 2.2892, "step": 792 }, { "epoch": 0.41736842105263156, "grad_norm": 3.7404627799987793, "learning_rate": 2.9883396704984273e-05, "loss": 0.4776, "step": 793 }, { "epoch": 0.41789473684210526, "grad_norm": 1.4917809963226318, "learning_rate": 2.9883087199240672e-05, "loss": 1.1282, "step": 794 }, { "epoch": 0.41842105263157897, "grad_norm": 1.7566046714782715, "learning_rate": 2.988277728488038e-05, "loss": 1.3632, "step": 795 }, { "epoch": 0.4189473684210526, "grad_norm": 3.263943910598755, "learning_rate": 2.98824669619119e-05, "loss": 1.245, "step": 796 }, { "epoch": 0.41947368421052633, "grad_norm": 1.3323115110397339, "learning_rate": 2.9882156230343755e-05, "loss": 0.8262, "step": 797 }, { "epoch": 0.42, "grad_norm": 4.155140399932861, "learning_rate": 2.988184509018448e-05, "loss": 1.6147, "step": 798 }, { "epoch": 0.4205263157894737, "grad_norm": 1.3995517492294312, "learning_rate": 2.9881533541442615e-05, "loss": 1.3523, "step": 799 }, { "epoch": 0.42105263157894735, "grad_norm": 1.254440188407898, "learning_rate": 2.9881221584126716e-05, "loss": 0.7949, "step": 800 }, { "epoch": 0.42157894736842105, "grad_norm": 1.74281644821167, "learning_rate": 2.9880909218245335e-05, "loss": 1.1625, "step": 801 }, { "epoch": 0.42210526315789476, "grad_norm": 0.7531611323356628, "learning_rate": 2.9880596443807065e-05, "loss": 0.0144, "step": 802 }, { "epoch": 0.4226315789473684, "grad_norm": 2.432140827178955, "learning_rate": 2.9880283260820485e-05, "loss": 1.2728, "step": 803 }, { "epoch": 0.4231578947368421, "grad_norm": 12.019746780395508, "learning_rate": 2.9879969669294193e-05, "loss": 0.3237, "step": 804 }, { "epoch": 0.4236842105263158, "grad_norm": 9.18172836303711, "learning_rate": 2.98796556692368e-05, "loss": 1.1377, "step": 805 }, { "epoch": 0.4242105263157895, "grad_norm": 6.744290351867676, "learning_rate": 2.9879341260656926e-05, "loss": 0.8986, "step": 806 }, { "epoch": 0.42473684210526313, "grad_norm": 6.863618850708008, "learning_rate": 2.9879026443563207e-05, "loss": 0.7325, "step": 807 }, { "epoch": 0.42526315789473684, "grad_norm": 2.109163761138916, "learning_rate": 2.9878711217964284e-05, "loss": 0.7444, "step": 808 }, { "epoch": 0.42578947368421055, "grad_norm": 1.5105681419372559, "learning_rate": 2.9878395583868807e-05, "loss": 0.0472, "step": 809 }, { "epoch": 0.4263157894736842, "grad_norm": 2.5132837295532227, "learning_rate": 2.9878079541285445e-05, "loss": 1.1468, "step": 810 }, { "epoch": 0.4268421052631579, "grad_norm": 1.0575823783874512, "learning_rate": 2.987776309022288e-05, "loss": 0.7931, "step": 811 }, { "epoch": 0.42736842105263156, "grad_norm": 2.099186658859253, "learning_rate": 2.9877446230689795e-05, "loss": 0.1261, "step": 812 }, { "epoch": 0.42789473684210527, "grad_norm": 2.151189088821411, "learning_rate": 2.9877128962694892e-05, "loss": 1.3023, "step": 813 }, { "epoch": 0.4284210526315789, "grad_norm": 4.0430707931518555, "learning_rate": 2.987681128624688e-05, "loss": 0.4215, "step": 814 }, { "epoch": 0.42894736842105263, "grad_norm": 2.170595169067383, "learning_rate": 2.9876493201354475e-05, "loss": 1.6202, "step": 815 }, { "epoch": 0.42947368421052634, "grad_norm": 7.825321674346924, "learning_rate": 2.987617470802642e-05, "loss": 0.9704, "step": 816 }, { "epoch": 0.43, "grad_norm": 1.3321274518966675, "learning_rate": 2.9875855806271455e-05, "loss": 0.985, "step": 817 }, { "epoch": 0.4305263157894737, "grad_norm": 3.4284095764160156, "learning_rate": 2.9875536496098335e-05, "loss": 1.6423, "step": 818 }, { "epoch": 0.43105263157894735, "grad_norm": 1.4579600095748901, "learning_rate": 2.9875216777515827e-05, "loss": 1.0348, "step": 819 }, { "epoch": 0.43157894736842106, "grad_norm": 1.253885269165039, "learning_rate": 2.987489665053271e-05, "loss": 0.6495, "step": 820 }, { "epoch": 0.4321052631578947, "grad_norm": 4.7803144454956055, "learning_rate": 2.9874576115157773e-05, "loss": 0.2175, "step": 821 }, { "epoch": 0.4326315789473684, "grad_norm": 2.1819753646850586, "learning_rate": 2.987425517139981e-05, "loss": 0.9445, "step": 822 }, { "epoch": 0.43315789473684213, "grad_norm": 1.4593207836151123, "learning_rate": 2.9873933819267647e-05, "loss": 1.3716, "step": 823 }, { "epoch": 0.4336842105263158, "grad_norm": 2.473689556121826, "learning_rate": 2.9873612058770094e-05, "loss": 0.5247, "step": 824 }, { "epoch": 0.4342105263157895, "grad_norm": 2.997178792953491, "learning_rate": 2.9873289889915986e-05, "loss": 0.9902, "step": 825 }, { "epoch": 0.43473684210526314, "grad_norm": 6.612910270690918, "learning_rate": 2.9872967312714176e-05, "loss": 1.2973, "step": 826 }, { "epoch": 0.43526315789473685, "grad_norm": 1.0687013864517212, "learning_rate": 2.9872644327173513e-05, "loss": 0.7058, "step": 827 }, { "epoch": 0.4357894736842105, "grad_norm": 1.554075837135315, "learning_rate": 2.9872320933302867e-05, "loss": 1.4401, "step": 828 }, { "epoch": 0.4363157894736842, "grad_norm": 4.839378833770752, "learning_rate": 2.9871997131111122e-05, "loss": 1.1889, "step": 829 }, { "epoch": 0.4368421052631579, "grad_norm": 3.0694851875305176, "learning_rate": 2.9871672920607158e-05, "loss": 1.0211, "step": 830 }, { "epoch": 0.4373684210526316, "grad_norm": 1.378533124923706, "learning_rate": 2.9871348301799883e-05, "loss": 0.6821, "step": 831 }, { "epoch": 0.4378947368421053, "grad_norm": 1.238147497177124, "learning_rate": 2.9871023274698204e-05, "loss": 0.0303, "step": 832 }, { "epoch": 0.43842105263157893, "grad_norm": 1.4806623458862305, "learning_rate": 2.9870697839311053e-05, "loss": 1.2068, "step": 833 }, { "epoch": 0.43894736842105264, "grad_norm": 1.3188241720199585, "learning_rate": 2.9870371995647353e-05, "loss": 0.6705, "step": 834 }, { "epoch": 0.4394736842105263, "grad_norm": 4.55914831161499, "learning_rate": 2.9870045743716063e-05, "loss": 0.0898, "step": 835 }, { "epoch": 0.44, "grad_norm": 0.970885694026947, "learning_rate": 2.9869719083526137e-05, "loss": 1.0416, "step": 836 }, { "epoch": 0.4405263157894737, "grad_norm": 1.664801836013794, "learning_rate": 2.9869392015086538e-05, "loss": 1.3485, "step": 837 }, { "epoch": 0.44105263157894736, "grad_norm": 2.7517263889312744, "learning_rate": 2.9869064538406247e-05, "loss": 1.0703, "step": 838 }, { "epoch": 0.44157894736842107, "grad_norm": 0.9083302021026611, "learning_rate": 2.986873665349426e-05, "loss": 0.0277, "step": 839 }, { "epoch": 0.4421052631578947, "grad_norm": 2.142995834350586, "learning_rate": 2.986840836035957e-05, "loss": 1.0772, "step": 840 }, { "epoch": 0.44263157894736843, "grad_norm": 0.20032253861427307, "learning_rate": 2.98680796590112e-05, "loss": 0.0051, "step": 841 }, { "epoch": 0.4431578947368421, "grad_norm": 0.8623015880584717, "learning_rate": 2.9867750549458173e-05, "loss": 0.6315, "step": 842 }, { "epoch": 0.4436842105263158, "grad_norm": 1.2665982246398926, "learning_rate": 2.9867421031709517e-05, "loss": 1.4777, "step": 843 }, { "epoch": 0.4442105263157895, "grad_norm": 0.974398672580719, "learning_rate": 2.986709110577429e-05, "loss": 0.9672, "step": 844 }, { "epoch": 0.44473684210526315, "grad_norm": 0.9700206518173218, "learning_rate": 2.9866760771661544e-05, "loss": 1.0432, "step": 845 }, { "epoch": 0.44526315789473686, "grad_norm": 2.319387912750244, "learning_rate": 2.9866430029380342e-05, "loss": 1.3111, "step": 846 }, { "epoch": 0.4457894736842105, "grad_norm": 0.9847918748855591, "learning_rate": 2.9866098878939777e-05, "loss": 1.1562, "step": 847 }, { "epoch": 0.4463157894736842, "grad_norm": 5.33390474319458, "learning_rate": 2.9865767320348932e-05, "loss": 1.2682, "step": 848 }, { "epoch": 0.4468421052631579, "grad_norm": 9.808623313903809, "learning_rate": 2.9865435353616915e-05, "loss": 1.3406, "step": 849 }, { "epoch": 0.4473684210526316, "grad_norm": 6.855868816375732, "learning_rate": 2.9865102978752837e-05, "loss": 2.3309, "step": 850 }, { "epoch": 0.4478947368421053, "grad_norm": 2.986323595046997, "learning_rate": 2.9864770195765828e-05, "loss": 1.596, "step": 851 }, { "epoch": 0.44842105263157894, "grad_norm": 2.0322799682617188, "learning_rate": 2.9864437004665016e-05, "loss": 1.0168, "step": 852 }, { "epoch": 0.44894736842105265, "grad_norm": 14.111528396606445, "learning_rate": 2.9864103405459556e-05, "loss": 1.2123, "step": 853 }, { "epoch": 0.4494736842105263, "grad_norm": 1.4620112180709839, "learning_rate": 2.9863769398158607e-05, "loss": 1.1575, "step": 854 }, { "epoch": 0.45, "grad_norm": 2.048269271850586, "learning_rate": 2.9863434982771338e-05, "loss": 1.4347, "step": 855 }, { "epoch": 0.45052631578947366, "grad_norm": 1.1362407207489014, "learning_rate": 2.9863100159306923e-05, "loss": 0.9232, "step": 856 }, { "epoch": 0.45105263157894737, "grad_norm": 14.55726432800293, "learning_rate": 2.9862764927774567e-05, "loss": 1.6201, "step": 857 }, { "epoch": 0.4515789473684211, "grad_norm": 4.592569828033447, "learning_rate": 2.9862429288183468e-05, "loss": 1.5495, "step": 858 }, { "epoch": 0.45210526315789473, "grad_norm": 0.8918259143829346, "learning_rate": 2.986209324054284e-05, "loss": 0.9341, "step": 859 }, { "epoch": 0.45263157894736844, "grad_norm": 3.0468339920043945, "learning_rate": 2.986175678486191e-05, "loss": 0.9043, "step": 860 }, { "epoch": 0.4531578947368421, "grad_norm": 1.4248181581497192, "learning_rate": 2.9861419921149916e-05, "loss": 0.983, "step": 861 }, { "epoch": 0.4536842105263158, "grad_norm": 2.1727728843688965, "learning_rate": 2.9861082649416107e-05, "loss": 0.3648, "step": 862 }, { "epoch": 0.45421052631578945, "grad_norm": 1.469250202178955, "learning_rate": 2.9860744969669742e-05, "loss": 0.9197, "step": 863 }, { "epoch": 0.45473684210526316, "grad_norm": 3.119738817214966, "learning_rate": 2.986040688192009e-05, "loss": 1.2948, "step": 864 }, { "epoch": 0.45526315789473687, "grad_norm": 7.314899921417236, "learning_rate": 2.9860068386176437e-05, "loss": 1.1322, "step": 865 }, { "epoch": 0.4557894736842105, "grad_norm": 1.2942012548446655, "learning_rate": 2.9859729482448073e-05, "loss": 1.0436, "step": 866 }, { "epoch": 0.45631578947368423, "grad_norm": 16.526906967163086, "learning_rate": 2.985939017074431e-05, "loss": 0.7217, "step": 867 }, { "epoch": 0.4568421052631579, "grad_norm": 0.9518576860427856, "learning_rate": 2.9859050451074453e-05, "loss": 0.0462, "step": 868 }, { "epoch": 0.4573684210526316, "grad_norm": 1.8784120082855225, "learning_rate": 2.985871032344784e-05, "loss": 1.1546, "step": 869 }, { "epoch": 0.45789473684210524, "grad_norm": 2.373981475830078, "learning_rate": 2.9858369787873795e-05, "loss": 1.1577, "step": 870 }, { "epoch": 0.45842105263157895, "grad_norm": 4.2424421310424805, "learning_rate": 2.985802884436168e-05, "loss": 0.8525, "step": 871 }, { "epoch": 0.4589473684210526, "grad_norm": 1.318802833557129, "learning_rate": 2.9857687492920854e-05, "loss": 0.7319, "step": 872 }, { "epoch": 0.4594736842105263, "grad_norm": 1.247564435005188, "learning_rate": 2.9857345733560683e-05, "loss": 1.0628, "step": 873 }, { "epoch": 0.46, "grad_norm": 1.433790683746338, "learning_rate": 2.985700356629056e-05, "loss": 1.1417, "step": 874 }, { "epoch": 0.4605263157894737, "grad_norm": 3.1146788597106934, "learning_rate": 2.9856660991119867e-05, "loss": 1.4214, "step": 875 }, { "epoch": 0.4610526315789474, "grad_norm": 9.475061416625977, "learning_rate": 2.9856318008058018e-05, "loss": 1.015, "step": 876 }, { "epoch": 0.46157894736842103, "grad_norm": 1.2461477518081665, "learning_rate": 2.9855974617114425e-05, "loss": 1.0464, "step": 877 }, { "epoch": 0.46210526315789474, "grad_norm": 1.4924275875091553, "learning_rate": 2.9855630818298518e-05, "loss": 1.7069, "step": 878 }, { "epoch": 0.4626315789473684, "grad_norm": 1.3920341730117798, "learning_rate": 2.9855286611619733e-05, "loss": 1.5036, "step": 879 }, { "epoch": 0.4631578947368421, "grad_norm": 2.099785089492798, "learning_rate": 2.985494199708753e-05, "loss": 0.057, "step": 880 }, { "epoch": 0.4636842105263158, "grad_norm": 2.4705018997192383, "learning_rate": 2.985459697471136e-05, "loss": 1.091, "step": 881 }, { "epoch": 0.46421052631578946, "grad_norm": 7.607650279998779, "learning_rate": 2.98542515445007e-05, "loss": 0.3252, "step": 882 }, { "epoch": 0.46473684210526317, "grad_norm": 1.6341148614883423, "learning_rate": 2.985390570646503e-05, "loss": 1.5967, "step": 883 }, { "epoch": 0.4652631578947368, "grad_norm": 1.6126140356063843, "learning_rate": 2.9853559460613846e-05, "loss": 1.2722, "step": 884 }, { "epoch": 0.46578947368421053, "grad_norm": 13.708699226379395, "learning_rate": 2.985321280695666e-05, "loss": 1.7056, "step": 885 }, { "epoch": 0.4663157894736842, "grad_norm": 1.0587328672409058, "learning_rate": 2.9852865745502988e-05, "loss": 0.9254, "step": 886 }, { "epoch": 0.4668421052631579, "grad_norm": 1.383562445640564, "learning_rate": 2.9852518276262352e-05, "loss": 0.821, "step": 887 }, { "epoch": 0.4673684210526316, "grad_norm": 5.15762996673584, "learning_rate": 2.9852170399244297e-05, "loss": 1.1727, "step": 888 }, { "epoch": 0.46789473684210525, "grad_norm": 25.248504638671875, "learning_rate": 2.9851822114458374e-05, "loss": 2.5076, "step": 889 }, { "epoch": 0.46842105263157896, "grad_norm": 4.159931182861328, "learning_rate": 2.985147342191414e-05, "loss": 0.4708, "step": 890 }, { "epoch": 0.4689473684210526, "grad_norm": 1.1170576810836792, "learning_rate": 2.9851124321621177e-05, "loss": 0.8062, "step": 891 }, { "epoch": 0.4694736842105263, "grad_norm": 1.807730793952942, "learning_rate": 2.9850774813589065e-05, "loss": 0.5932, "step": 892 }, { "epoch": 0.47, "grad_norm": 1.2220526933670044, "learning_rate": 2.98504248978274e-05, "loss": 1.0653, "step": 893 }, { "epoch": 0.4705263157894737, "grad_norm": 7.340139389038086, "learning_rate": 2.9850074574345787e-05, "loss": 2.1767, "step": 894 }, { "epoch": 0.4710526315789474, "grad_norm": 2.8421573638916016, "learning_rate": 2.9849723843153847e-05, "loss": 0.619, "step": 895 }, { "epoch": 0.47157894736842104, "grad_norm": 1.6204451322555542, "learning_rate": 2.9849372704261203e-05, "loss": 0.469, "step": 896 }, { "epoch": 0.47210526315789475, "grad_norm": 1.9400737285614014, "learning_rate": 2.9849021157677506e-05, "loss": 1.0654, "step": 897 }, { "epoch": 0.4726315789473684, "grad_norm": 1.4052708148956299, "learning_rate": 2.9848669203412404e-05, "loss": 0.8773, "step": 898 }, { "epoch": 0.4731578947368421, "grad_norm": 6.795404434204102, "learning_rate": 2.984831684147556e-05, "loss": 2.0904, "step": 899 }, { "epoch": 0.47368421052631576, "grad_norm": 2.273846387863159, "learning_rate": 2.9847964071876642e-05, "loss": 1.3245, "step": 900 }, { "epoch": 0.47421052631578947, "grad_norm": 1.403640866279602, "learning_rate": 2.9847610894625343e-05, "loss": 0.8352, "step": 901 }, { "epoch": 0.4747368421052632, "grad_norm": 1.2695461511611938, "learning_rate": 2.9847257309731357e-05, "loss": 1.0238, "step": 902 }, { "epoch": 0.47526315789473683, "grad_norm": 2.6055543422698975, "learning_rate": 2.9846903317204388e-05, "loss": 0.2152, "step": 903 }, { "epoch": 0.47578947368421054, "grad_norm": 2.166196584701538, "learning_rate": 2.984654891705416e-05, "loss": 1.2715, "step": 904 }, { "epoch": 0.4763157894736842, "grad_norm": 7.2398247718811035, "learning_rate": 2.9846194109290404e-05, "loss": 2.0351, "step": 905 }, { "epoch": 0.4768421052631579, "grad_norm": 4.5485453605651855, "learning_rate": 2.9845838893922854e-05, "loss": 0.6056, "step": 906 }, { "epoch": 0.47736842105263155, "grad_norm": 2.5836188793182373, "learning_rate": 2.9845483270961267e-05, "loss": 2.0319, "step": 907 }, { "epoch": 0.47789473684210526, "grad_norm": 1.6816260814666748, "learning_rate": 2.984512724041541e-05, "loss": 0.8913, "step": 908 }, { "epoch": 0.47842105263157897, "grad_norm": 1.6629785299301147, "learning_rate": 2.9844770802295056e-05, "loss": 1.5281, "step": 909 }, { "epoch": 0.4789473684210526, "grad_norm": 1.2143621444702148, "learning_rate": 2.9844413956609985e-05, "loss": 1.0262, "step": 910 }, { "epoch": 0.47947368421052633, "grad_norm": 4.233440399169922, "learning_rate": 2.984405670337e-05, "loss": 1.4231, "step": 911 }, { "epoch": 0.48, "grad_norm": 1.3030482530593872, "learning_rate": 2.9843699042584908e-05, "loss": 0.6924, "step": 912 }, { "epoch": 0.4805263157894737, "grad_norm": 5.886407375335693, "learning_rate": 2.9843340974264532e-05, "loss": 2.0718, "step": 913 }, { "epoch": 0.48105263157894734, "grad_norm": 1.7257221937179565, "learning_rate": 2.98429824984187e-05, "loss": 2.0068, "step": 914 }, { "epoch": 0.48157894736842105, "grad_norm": 2.2462844848632812, "learning_rate": 2.9842623615057248e-05, "loss": 1.7576, "step": 915 }, { "epoch": 0.48210526315789476, "grad_norm": 6.659467697143555, "learning_rate": 2.984226432419004e-05, "loss": 0.8356, "step": 916 }, { "epoch": 0.4826315789473684, "grad_norm": 1.0781395435333252, "learning_rate": 2.9841904625826933e-05, "loss": 1.0799, "step": 917 }, { "epoch": 0.4831578947368421, "grad_norm": 1.9946705102920532, "learning_rate": 2.9841544519977805e-05, "loss": 1.7794, "step": 918 }, { "epoch": 0.48368421052631577, "grad_norm": 1.2607777118682861, "learning_rate": 2.9841184006652544e-05, "loss": 1.1695, "step": 919 }, { "epoch": 0.4842105263157895, "grad_norm": 1.6544026136398315, "learning_rate": 2.9840823085861047e-05, "loss": 1.0608, "step": 920 }, { "epoch": 0.48473684210526313, "grad_norm": 3.6581201553344727, "learning_rate": 2.9840461757613217e-05, "loss": 0.1158, "step": 921 }, { "epoch": 0.48526315789473684, "grad_norm": 1.2672442197799683, "learning_rate": 2.9840100021918986e-05, "loss": 1.1773, "step": 922 }, { "epoch": 0.48578947368421055, "grad_norm": 11.554649353027344, "learning_rate": 2.9839737878788276e-05, "loss": 0.1758, "step": 923 }, { "epoch": 0.4863157894736842, "grad_norm": 2.514875888824463, "learning_rate": 2.9839375328231033e-05, "loss": 0.6761, "step": 924 }, { "epoch": 0.4868421052631579, "grad_norm": 1.2187618017196655, "learning_rate": 2.9839012370257213e-05, "loss": 0.8572, "step": 925 }, { "epoch": 0.48736842105263156, "grad_norm": 11.437232971191406, "learning_rate": 2.983864900487678e-05, "loss": 1.4024, "step": 926 }, { "epoch": 0.48789473684210527, "grad_norm": 1.1252726316452026, "learning_rate": 2.9838285232099703e-05, "loss": 0.8422, "step": 927 }, { "epoch": 0.4884210526315789, "grad_norm": 7.090402603149414, "learning_rate": 2.9837921051935983e-05, "loss": 1.7886, "step": 928 }, { "epoch": 0.48894736842105263, "grad_norm": 0.9771952629089355, "learning_rate": 2.983755646439561e-05, "loss": 0.6664, "step": 929 }, { "epoch": 0.48947368421052634, "grad_norm": 1.0985069274902344, "learning_rate": 2.983719146948859e-05, "loss": 0.8674, "step": 930 }, { "epoch": 0.49, "grad_norm": 4.855616092681885, "learning_rate": 2.9836826067224953e-05, "loss": 1.08, "step": 931 }, { "epoch": 0.4905263157894737, "grad_norm": 4.352982044219971, "learning_rate": 2.9836460257614726e-05, "loss": 1.0946, "step": 932 }, { "epoch": 0.49105263157894735, "grad_norm": 9.891953468322754, "learning_rate": 2.9836094040667953e-05, "loss": 0.5332, "step": 933 }, { "epoch": 0.49157894736842106, "grad_norm": 4.198849678039551, "learning_rate": 2.9835727416394692e-05, "loss": 0.5912, "step": 934 }, { "epoch": 0.4921052631578947, "grad_norm": 3.755746841430664, "learning_rate": 2.9835360384805004e-05, "loss": 1.9771, "step": 935 }, { "epoch": 0.4926315789473684, "grad_norm": 1.8602410554885864, "learning_rate": 2.9834992945908966e-05, "loss": 0.9501, "step": 936 }, { "epoch": 0.49315789473684213, "grad_norm": 7.063650608062744, "learning_rate": 2.9834625099716668e-05, "loss": 1.1328, "step": 937 }, { "epoch": 0.4936842105263158, "grad_norm": 48.79571533203125, "learning_rate": 2.983425684623821e-05, "loss": 3.1328, "step": 938 }, { "epoch": 0.4942105263157895, "grad_norm": 8.612360954284668, "learning_rate": 2.98338881854837e-05, "loss": 0.6469, "step": 939 }, { "epoch": 0.49473684210526314, "grad_norm": 5.5089898109436035, "learning_rate": 2.9833519117463263e-05, "loss": 2.3209, "step": 940 }, { "epoch": 0.49526315789473685, "grad_norm": 1.2845944166183472, "learning_rate": 2.9833149642187026e-05, "loss": 1.2498, "step": 941 }, { "epoch": 0.4957894736842105, "grad_norm": 2.241567373275757, "learning_rate": 2.9832779759665144e-05, "loss": 0.1963, "step": 942 }, { "epoch": 0.4963157894736842, "grad_norm": 1.6336573362350464, "learning_rate": 2.983240946990776e-05, "loss": 0.6764, "step": 943 }, { "epoch": 0.4968421052631579, "grad_norm": 2.2285687923431396, "learning_rate": 2.9832038772925044e-05, "loss": 1.1649, "step": 944 }, { "epoch": 0.49736842105263157, "grad_norm": 1.5614920854568481, "learning_rate": 2.983166766872718e-05, "loss": 1.1911, "step": 945 }, { "epoch": 0.4978947368421053, "grad_norm": 1.2409616708755493, "learning_rate": 2.9831296157324348e-05, "loss": 0.9644, "step": 946 }, { "epoch": 0.49842105263157893, "grad_norm": 1.8802260160446167, "learning_rate": 2.983092423872675e-05, "loss": 0.6794, "step": 947 }, { "epoch": 0.49894736842105264, "grad_norm": 1.1808290481567383, "learning_rate": 2.98305519129446e-05, "loss": 1.2996, "step": 948 }, { "epoch": 0.4994736842105263, "grad_norm": 1.2290716171264648, "learning_rate": 2.983017917998812e-05, "loss": 1.0257, "step": 949 }, { "epoch": 0.5, "grad_norm": 1.3215667009353638, "learning_rate": 2.9829806039867537e-05, "loss": 1.1965, "step": 950 }, { "epoch": 0.5, "eval_loss": 1.0288052558898926, "eval_runtime": 12.7382, "eval_samples_per_second": 7.85, "eval_steps_per_second": 7.85, "step": 950 }, { "epoch": 0.5005263157894737, "grad_norm": 1.5079114437103271, "learning_rate": 2.9829432492593105e-05, "loss": 1.0393, "step": 951 }, { "epoch": 0.5010526315789474, "grad_norm": 6.629293441772461, "learning_rate": 2.9829058538175076e-05, "loss": 1.3261, "step": 952 }, { "epoch": 0.501578947368421, "grad_norm": 6.566483974456787, "learning_rate": 2.9828684176623714e-05, "loss": 1.561, "step": 953 }, { "epoch": 0.5021052631578947, "grad_norm": 1.9273375272750854, "learning_rate": 2.98283094079493e-05, "loss": 1.0629, "step": 954 }, { "epoch": 0.5026315789473684, "grad_norm": 2.6924264430999756, "learning_rate": 2.9827934232162128e-05, "loss": 0.967, "step": 955 }, { "epoch": 0.5031578947368421, "grad_norm": 1.5618788003921509, "learning_rate": 2.982755864927249e-05, "loss": 1.2447, "step": 956 }, { "epoch": 0.5036842105263157, "grad_norm": 2.599567174911499, "learning_rate": 2.98271826592907e-05, "loss": 0.9259, "step": 957 }, { "epoch": 0.5042105263157894, "grad_norm": 5.357890605926514, "learning_rate": 2.9826806262227082e-05, "loss": 0.7411, "step": 958 }, { "epoch": 0.5047368421052632, "grad_norm": 2.8332183361053467, "learning_rate": 2.9826429458091968e-05, "loss": 0.1058, "step": 959 }, { "epoch": 0.5052631578947369, "grad_norm": 1.5336825847625732, "learning_rate": 2.982605224689571e-05, "loss": 0.2564, "step": 960 }, { "epoch": 0.5057894736842106, "grad_norm": 3.167874336242676, "learning_rate": 2.9825674628648657e-05, "loss": 0.9333, "step": 961 }, { "epoch": 0.5063157894736842, "grad_norm": 1.4255472421646118, "learning_rate": 2.982529660336118e-05, "loss": 0.0605, "step": 962 }, { "epoch": 0.5068421052631579, "grad_norm": 5.443603038787842, "learning_rate": 2.9824918171043656e-05, "loss": 0.9617, "step": 963 }, { "epoch": 0.5073684210526316, "grad_norm": 1.5958141088485718, "learning_rate": 2.9824539331706476e-05, "loss": 1.8712, "step": 964 }, { "epoch": 0.5078947368421053, "grad_norm": 1.544083595275879, "learning_rate": 2.982416008536004e-05, "loss": 1.9258, "step": 965 }, { "epoch": 0.508421052631579, "grad_norm": 9.469457626342773, "learning_rate": 2.982378043201476e-05, "loss": 1.3554, "step": 966 }, { "epoch": 0.5089473684210526, "grad_norm": 1.1175442934036255, "learning_rate": 2.9823400371681062e-05, "loss": 1.5603, "step": 967 }, { "epoch": 0.5094736842105263, "grad_norm": 5.4420342445373535, "learning_rate": 2.9823019904369377e-05, "loss": 0.5942, "step": 968 }, { "epoch": 0.51, "grad_norm": 1.5822449922561646, "learning_rate": 2.9822639030090156e-05, "loss": 1.0452, "step": 969 }, { "epoch": 0.5105263157894737, "grad_norm": 4.8439483642578125, "learning_rate": 2.9822257748853846e-05, "loss": 0.2948, "step": 970 }, { "epoch": 0.5110526315789473, "grad_norm": 5.272324085235596, "learning_rate": 2.982187606067093e-05, "loss": 1.0793, "step": 971 }, { "epoch": 0.511578947368421, "grad_norm": 35.564964294433594, "learning_rate": 2.9821493965551877e-05, "loss": 4.9608, "step": 972 }, { "epoch": 0.5121052631578947, "grad_norm": 8.41565227508545, "learning_rate": 2.9821111463507177e-05, "loss": 0.8292, "step": 973 }, { "epoch": 0.5126315789473684, "grad_norm": 1.3578308820724487, "learning_rate": 2.9820728554547338e-05, "loss": 0.6588, "step": 974 }, { "epoch": 0.5131578947368421, "grad_norm": 21.759471893310547, "learning_rate": 2.9820345238682862e-05, "loss": 5.2813, "step": 975 }, { "epoch": 0.5136842105263157, "grad_norm": 1.089369535446167, "learning_rate": 2.9819961515924288e-05, "loss": 0.7576, "step": 976 }, { "epoch": 0.5142105263157895, "grad_norm": 1.6790835857391357, "learning_rate": 2.981957738628214e-05, "loss": 1.0218, "step": 977 }, { "epoch": 0.5147368421052632, "grad_norm": 0.8437327742576599, "learning_rate": 2.9819192849766965e-05, "loss": 0.459, "step": 978 }, { "epoch": 0.5152631578947369, "grad_norm": 1.327656865119934, "learning_rate": 2.981880790638933e-05, "loss": 1.0783, "step": 979 }, { "epoch": 0.5157894736842106, "grad_norm": 1.32616126537323, "learning_rate": 2.981842255615979e-05, "loss": 1.0975, "step": 980 }, { "epoch": 0.5163157894736842, "grad_norm": 1.0787245035171509, "learning_rate": 2.981803679908893e-05, "loss": 1.1274, "step": 981 }, { "epoch": 0.5168421052631579, "grad_norm": 5.349323272705078, "learning_rate": 2.9817650635187348e-05, "loss": 0.6531, "step": 982 }, { "epoch": 0.5173684210526316, "grad_norm": 6.324141979217529, "learning_rate": 2.981726406446564e-05, "loss": 1.4738, "step": 983 }, { "epoch": 0.5178947368421053, "grad_norm": 1.0158735513687134, "learning_rate": 2.9816877086934416e-05, "loss": 0.7424, "step": 984 }, { "epoch": 0.5184210526315789, "grad_norm": 1.3030285835266113, "learning_rate": 2.9816489702604307e-05, "loss": 1.2574, "step": 985 }, { "epoch": 0.5189473684210526, "grad_norm": 1.0021758079528809, "learning_rate": 2.9816101911485944e-05, "loss": 0.9003, "step": 986 }, { "epoch": 0.5194736842105263, "grad_norm": 2.6594042778015137, "learning_rate": 2.981571371358998e-05, "loss": 0.4392, "step": 987 }, { "epoch": 0.52, "grad_norm": 4.258882999420166, "learning_rate": 2.981532510892707e-05, "loss": 0.3734, "step": 988 }, { "epoch": 0.5205263157894737, "grad_norm": 4.136307239532471, "learning_rate": 2.9814936097507878e-05, "loss": 1.9032, "step": 989 }, { "epoch": 0.5210526315789473, "grad_norm": 4.121081352233887, "learning_rate": 2.981454667934309e-05, "loss": 1.5468, "step": 990 }, { "epoch": 0.521578947368421, "grad_norm": 6.502449035644531, "learning_rate": 2.9814156854443394e-05, "loss": 1.3444, "step": 991 }, { "epoch": 0.5221052631578947, "grad_norm": 9.599536895751953, "learning_rate": 2.9813766622819494e-05, "loss": 0.8619, "step": 992 }, { "epoch": 0.5226315789473684, "grad_norm": 5.463332653045654, "learning_rate": 2.9813375984482108e-05, "loss": 0.8028, "step": 993 }, { "epoch": 0.5231578947368422, "grad_norm": 2.675305128097534, "learning_rate": 2.9812984939441955e-05, "loss": 0.1877, "step": 994 }, { "epoch": 0.5236842105263158, "grad_norm": 1.6895674467086792, "learning_rate": 2.9812593487709778e-05, "loss": 1.4556, "step": 995 }, { "epoch": 0.5242105263157895, "grad_norm": 1.0445210933685303, "learning_rate": 2.9812201629296313e-05, "loss": 0.9134, "step": 996 }, { "epoch": 0.5247368421052632, "grad_norm": 4.284270286560059, "learning_rate": 2.9811809364212332e-05, "loss": 0.7209, "step": 997 }, { "epoch": 0.5252631578947369, "grad_norm": 3.461217164993286, "learning_rate": 2.98114166924686e-05, "loss": 0.4954, "step": 998 }, { "epoch": 0.5257894736842105, "grad_norm": 4.461343288421631, "learning_rate": 2.981102361407589e-05, "loss": 1.5761, "step": 999 }, { "epoch": 0.5263157894736842, "grad_norm": 1.5852874517440796, "learning_rate": 2.9810630129045003e-05, "loss": 1.1905, "step": 1000 }, { "epoch": 0.5268421052631579, "grad_norm": 2.101400375366211, "learning_rate": 2.9810236237386736e-05, "loss": 1.3343, "step": 1001 }, { "epoch": 0.5273684210526316, "grad_norm": 10.774019241333008, "learning_rate": 2.980984193911191e-05, "loss": 1.6976, "step": 1002 }, { "epoch": 0.5278947368421053, "grad_norm": 1.1593283414840698, "learning_rate": 2.9809447234231347e-05, "loss": 0.9587, "step": 1003 }, { "epoch": 0.5284210526315789, "grad_norm": 5.7743754386901855, "learning_rate": 2.9809052122755885e-05, "loss": 0.9899, "step": 1004 }, { "epoch": 0.5289473684210526, "grad_norm": 1.3892626762390137, "learning_rate": 2.9808656604696368e-05, "loss": 1.1057, "step": 1005 }, { "epoch": 0.5294736842105263, "grad_norm": 3.0294156074523926, "learning_rate": 2.980826068006366e-05, "loss": 0.6647, "step": 1006 }, { "epoch": 0.53, "grad_norm": 4.9338483810424805, "learning_rate": 2.9807864348868627e-05, "loss": 1.9628, "step": 1007 }, { "epoch": 0.5305263157894737, "grad_norm": 4.924473762512207, "learning_rate": 2.980746761112215e-05, "loss": 0.7927, "step": 1008 }, { "epoch": 0.5310526315789473, "grad_norm": 10.412948608398438, "learning_rate": 2.980707046683513e-05, "loss": 1.1254, "step": 1009 }, { "epoch": 0.531578947368421, "grad_norm": 9.721678733825684, "learning_rate": 2.980667291601846e-05, "loss": 1.9085, "step": 1010 }, { "epoch": 0.5321052631578947, "grad_norm": 3.1283535957336426, "learning_rate": 2.980627495868306e-05, "loss": 1.4154, "step": 1011 }, { "epoch": 0.5326315789473685, "grad_norm": 3.373187303543091, "learning_rate": 2.980587659483985e-05, "loss": 1.3, "step": 1012 }, { "epoch": 0.533157894736842, "grad_norm": 8.386382102966309, "learning_rate": 2.9805477824499782e-05, "loss": 0.2979, "step": 1013 }, { "epoch": 0.5336842105263158, "grad_norm": 2.241272211074829, "learning_rate": 2.980507864767379e-05, "loss": 0.2733, "step": 1014 }, { "epoch": 0.5342105263157895, "grad_norm": 1.518618106842041, "learning_rate": 2.9804679064372836e-05, "loss": 1.8392, "step": 1015 }, { "epoch": 0.5347368421052632, "grad_norm": 1.0135964155197144, "learning_rate": 2.9804279074607893e-05, "loss": 1.0983, "step": 1016 }, { "epoch": 0.5352631578947369, "grad_norm": 1.5758641958236694, "learning_rate": 2.9803878678389942e-05, "loss": 1.0834, "step": 1017 }, { "epoch": 0.5357894736842105, "grad_norm": 3.014568328857422, "learning_rate": 2.9803477875729977e-05, "loss": 0.4795, "step": 1018 }, { "epoch": 0.5363157894736842, "grad_norm": 1.2805556058883667, "learning_rate": 2.9803076666639e-05, "loss": 1.2692, "step": 1019 }, { "epoch": 0.5368421052631579, "grad_norm": 5.261938571929932, "learning_rate": 2.9802675051128027e-05, "loss": 0.8414, "step": 1020 }, { "epoch": 0.5373684210526316, "grad_norm": 3.743300676345825, "learning_rate": 2.9802273029208085e-05, "loss": 1.3266, "step": 1021 }, { "epoch": 0.5378947368421053, "grad_norm": 4.708719253540039, "learning_rate": 2.9801870600890216e-05, "loss": 0.132, "step": 1022 }, { "epoch": 0.5384210526315789, "grad_norm": 6.205244064331055, "learning_rate": 2.9801467766185457e-05, "loss": 0.2055, "step": 1023 }, { "epoch": 0.5389473684210526, "grad_norm": 11.566554069519043, "learning_rate": 2.980106452510488e-05, "loss": 0.1921, "step": 1024 }, { "epoch": 0.5394736842105263, "grad_norm": 2.550504446029663, "learning_rate": 2.9800660877659546e-05, "loss": 0.6581, "step": 1025 }, { "epoch": 0.54, "grad_norm": 6.586308002471924, "learning_rate": 2.9800256823860548e-05, "loss": 0.2069, "step": 1026 }, { "epoch": 0.5405263157894736, "grad_norm": 2.6851885318756104, "learning_rate": 2.9799852363718968e-05, "loss": 1.5168, "step": 1027 }, { "epoch": 0.5410526315789473, "grad_norm": 1.1070444583892822, "learning_rate": 2.979944749724592e-05, "loss": 0.8637, "step": 1028 }, { "epoch": 0.541578947368421, "grad_norm": 0.5393061637878418, "learning_rate": 2.979904222445252e-05, "loss": 0.0123, "step": 1029 }, { "epoch": 0.5421052631578948, "grad_norm": 4.805749416351318, "learning_rate": 2.979863654534988e-05, "loss": 0.6532, "step": 1030 }, { "epoch": 0.5426315789473685, "grad_norm": 1.2212395668029785, "learning_rate": 2.9798230459949154e-05, "loss": 1.0349, "step": 1031 }, { "epoch": 0.5431578947368421, "grad_norm": 1.0666829347610474, "learning_rate": 2.9797823968261483e-05, "loss": 0.816, "step": 1032 }, { "epoch": 0.5436842105263158, "grad_norm": 1.6240037679672241, "learning_rate": 2.9797417070298033e-05, "loss": 1.1863, "step": 1033 }, { "epoch": 0.5442105263157895, "grad_norm": 80.0791244506836, "learning_rate": 2.979700976606997e-05, "loss": 3.8047, "step": 1034 }, { "epoch": 0.5447368421052632, "grad_norm": 7.121600151062012, "learning_rate": 2.979660205558848e-05, "loss": 1.8238, "step": 1035 }, { "epoch": 0.5452631578947369, "grad_norm": 5.182654857635498, "learning_rate": 2.9796193938864758e-05, "loss": 1.7294, "step": 1036 }, { "epoch": 0.5457894736842105, "grad_norm": 1.5659658908843994, "learning_rate": 2.9795785415910002e-05, "loss": 1.3367, "step": 1037 }, { "epoch": 0.5463157894736842, "grad_norm": 2.371896266937256, "learning_rate": 2.979537648673544e-05, "loss": 0.6634, "step": 1038 }, { "epoch": 0.5468421052631579, "grad_norm": 6.189696311950684, "learning_rate": 2.979496715135228e-05, "loss": 1.1687, "step": 1039 }, { "epoch": 0.5473684210526316, "grad_norm": 3.078950881958008, "learning_rate": 2.9794557409771775e-05, "loss": 0.2819, "step": 1040 }, { "epoch": 0.5478947368421052, "grad_norm": 1.354801893234253, "learning_rate": 2.9794147262005182e-05, "loss": 1.0736, "step": 1041 }, { "epoch": 0.5484210526315789, "grad_norm": 7.661222457885742, "learning_rate": 2.9793736708063735e-05, "loss": 0.757, "step": 1042 }, { "epoch": 0.5489473684210526, "grad_norm": 1.1365411281585693, "learning_rate": 2.9793325747958736e-05, "loss": 0.9885, "step": 1043 }, { "epoch": 0.5494736842105263, "grad_norm": 2.5599515438079834, "learning_rate": 2.9792914381701448e-05, "loss": 1.5999, "step": 1044 }, { "epoch": 0.55, "grad_norm": 4.250511169433594, "learning_rate": 2.979250260930317e-05, "loss": 0.8261, "step": 1045 }, { "epoch": 0.5505263157894736, "grad_norm": 5.77577543258667, "learning_rate": 2.9792090430775212e-05, "loss": 1.4922, "step": 1046 }, { "epoch": 0.5510526315789473, "grad_norm": 10.645989418029785, "learning_rate": 2.979167784612888e-05, "loss": 1.5039, "step": 1047 }, { "epoch": 0.5515789473684211, "grad_norm": 22.560731887817383, "learning_rate": 2.9791264855375516e-05, "loss": 1.1108, "step": 1048 }, { "epoch": 0.5521052631578948, "grad_norm": 0.9451678395271301, "learning_rate": 2.9790851458526445e-05, "loss": 1.0083, "step": 1049 }, { "epoch": 0.5526315789473685, "grad_norm": 1.7276222705841064, "learning_rate": 2.9790437655593025e-05, "loss": 1.2542, "step": 1050 }, { "epoch": 0.5531578947368421, "grad_norm": 12.997016906738281, "learning_rate": 2.9790023446586616e-05, "loss": 1.5738, "step": 1051 }, { "epoch": 0.5536842105263158, "grad_norm": 5.94761848449707, "learning_rate": 2.9789608831518585e-05, "loss": 0.5001, "step": 1052 }, { "epoch": 0.5542105263157895, "grad_norm": 3.714102029800415, "learning_rate": 2.9789193810400324e-05, "loss": 0.4654, "step": 1053 }, { "epoch": 0.5547368421052632, "grad_norm": 1.2676738500595093, "learning_rate": 2.978877838324322e-05, "loss": 0.9719, "step": 1054 }, { "epoch": 0.5552631578947368, "grad_norm": 4.261148452758789, "learning_rate": 2.9788362550058683e-05, "loss": 0.481, "step": 1055 }, { "epoch": 0.5557894736842105, "grad_norm": 2.568772077560425, "learning_rate": 2.9787946310858126e-05, "loss": 0.2567, "step": 1056 }, { "epoch": 0.5563157894736842, "grad_norm": 17.453506469726562, "learning_rate": 2.9787529665652983e-05, "loss": 1.1513, "step": 1057 }, { "epoch": 0.5568421052631579, "grad_norm": 4.880800724029541, "learning_rate": 2.9787112614454682e-05, "loss": 0.7086, "step": 1058 }, { "epoch": 0.5573684210526316, "grad_norm": 2.3661293983459473, "learning_rate": 2.9786695157274686e-05, "loss": 0.3181, "step": 1059 }, { "epoch": 0.5578947368421052, "grad_norm": 1.5389021635055542, "learning_rate": 2.9786277294124446e-05, "loss": 1.1855, "step": 1060 }, { "epoch": 0.5584210526315789, "grad_norm": 1.2657150030136108, "learning_rate": 2.978585902501544e-05, "loss": 1.0923, "step": 1061 }, { "epoch": 0.5589473684210526, "grad_norm": 1.9125850200653076, "learning_rate": 2.9785440349959154e-05, "loss": 1.5275, "step": 1062 }, { "epoch": 0.5594736842105263, "grad_norm": 1.2199132442474365, "learning_rate": 2.978502126896708e-05, "loss": 0.9496, "step": 1063 }, { "epoch": 0.56, "grad_norm": 0.9119816422462463, "learning_rate": 2.9784601782050716e-05, "loss": 1.1481, "step": 1064 }, { "epoch": 0.5605263157894737, "grad_norm": 1.264197587966919, "learning_rate": 2.9784181889221588e-05, "loss": 1.0066, "step": 1065 }, { "epoch": 0.5610526315789474, "grad_norm": 1.3065340518951416, "learning_rate": 2.978376159049123e-05, "loss": 1.5019, "step": 1066 }, { "epoch": 0.5615789473684211, "grad_norm": 5.354816913604736, "learning_rate": 2.978334088587117e-05, "loss": 1.2623, "step": 1067 }, { "epoch": 0.5621052631578948, "grad_norm": 1.1491811275482178, "learning_rate": 2.9782919775372958e-05, "loss": 0.9462, "step": 1068 }, { "epoch": 0.5626315789473684, "grad_norm": 1.9805035591125488, "learning_rate": 2.9782498259008163e-05, "loss": 0.8934, "step": 1069 }, { "epoch": 0.5631578947368421, "grad_norm": 6.078151702880859, "learning_rate": 2.9782076336788355e-05, "loss": 2.088, "step": 1070 }, { "epoch": 0.5636842105263158, "grad_norm": 1.2258517742156982, "learning_rate": 2.9781654008725118e-05, "loss": 1.2418, "step": 1071 }, { "epoch": 0.5642105263157895, "grad_norm": 4.132420539855957, "learning_rate": 2.9781231274830045e-05, "loss": 1.0597, "step": 1072 }, { "epoch": 0.5647368421052632, "grad_norm": 2.6240031719207764, "learning_rate": 2.9780808135114745e-05, "loss": 1.8114, "step": 1073 }, { "epoch": 0.5652631578947368, "grad_norm": 5.4064154624938965, "learning_rate": 2.9780384589590832e-05, "loss": 1.7053, "step": 1074 }, { "epoch": 0.5657894736842105, "grad_norm": 1.50705885887146, "learning_rate": 2.9779960638269944e-05, "loss": 1.2326, "step": 1075 }, { "epoch": 0.5663157894736842, "grad_norm": 4.527035713195801, "learning_rate": 2.9779536281163707e-05, "loss": 1.0151, "step": 1076 }, { "epoch": 0.5668421052631579, "grad_norm": 2.740678071975708, "learning_rate": 2.9779111518283778e-05, "loss": 1.3425, "step": 1077 }, { "epoch": 0.5673684210526316, "grad_norm": 1.8033603429794312, "learning_rate": 2.977868634964182e-05, "loss": 1.4094, "step": 1078 }, { "epoch": 0.5678947368421052, "grad_norm": 2.819211483001709, "learning_rate": 2.9778260775249507e-05, "loss": 0.5859, "step": 1079 }, { "epoch": 0.5684210526315789, "grad_norm": 2.2031707763671875, "learning_rate": 2.9777834795118516e-05, "loss": 1.6467, "step": 1080 }, { "epoch": 0.5689473684210526, "grad_norm": 1.0411590337753296, "learning_rate": 2.9777408409260556e-05, "loss": 1.1856, "step": 1081 }, { "epoch": 0.5694736842105264, "grad_norm": 1.240462303161621, "learning_rate": 2.977698161768732e-05, "loss": 0.9601, "step": 1082 }, { "epoch": 0.57, "grad_norm": 1.4794399738311768, "learning_rate": 2.977655442041053e-05, "loss": 0.6428, "step": 1083 }, { "epoch": 0.5705263157894737, "grad_norm": 3.423283576965332, "learning_rate": 2.9776126817441918e-05, "loss": 0.5233, "step": 1084 }, { "epoch": 0.5710526315789474, "grad_norm": 1.0337262153625488, "learning_rate": 2.9775698808793216e-05, "loss": 1.2001, "step": 1085 }, { "epoch": 0.5715789473684211, "grad_norm": 2.4375438690185547, "learning_rate": 2.9775270394476187e-05, "loss": 0.8193, "step": 1086 }, { "epoch": 0.5721052631578948, "grad_norm": 1.048431158065796, "learning_rate": 2.9774841574502584e-05, "loss": 1.0908, "step": 1087 }, { "epoch": 0.5726315789473684, "grad_norm": 1.6780253648757935, "learning_rate": 2.9774412348884184e-05, "loss": 1.7247, "step": 1088 }, { "epoch": 0.5731578947368421, "grad_norm": 3.6449801921844482, "learning_rate": 2.9773982717632768e-05, "loss": 1.8992, "step": 1089 }, { "epoch": 0.5736842105263158, "grad_norm": 2.392915725708008, "learning_rate": 2.9773552680760133e-05, "loss": 0.5871, "step": 1090 }, { "epoch": 0.5742105263157895, "grad_norm": 25.228097915649414, "learning_rate": 2.9773122238278088e-05, "loss": 1.6074, "step": 1091 }, { "epoch": 0.5747368421052632, "grad_norm": 11.282571792602539, "learning_rate": 2.9772691390198452e-05, "loss": 1.8768, "step": 1092 }, { "epoch": 0.5752631578947368, "grad_norm": 6.598527908325195, "learning_rate": 2.9772260136533048e-05, "loss": 1.2115, "step": 1093 }, { "epoch": 0.5757894736842105, "grad_norm": 0.990164577960968, "learning_rate": 2.977182847729372e-05, "loss": 1.0003, "step": 1094 }, { "epoch": 0.5763157894736842, "grad_norm": 2.4671967029571533, "learning_rate": 2.9771396412492316e-05, "loss": 0.0873, "step": 1095 }, { "epoch": 0.5768421052631579, "grad_norm": 2.4675517082214355, "learning_rate": 2.977096394214071e-05, "loss": 1.2669, "step": 1096 }, { "epoch": 0.5773684210526315, "grad_norm": 1.001388430595398, "learning_rate": 2.9770531066250754e-05, "loss": 1.3442, "step": 1097 }, { "epoch": 0.5778947368421052, "grad_norm": 2.501490831375122, "learning_rate": 2.9770097784834352e-05, "loss": 0.8013, "step": 1098 }, { "epoch": 0.578421052631579, "grad_norm": 1.80197274684906, "learning_rate": 2.9769664097903394e-05, "loss": 1.3928, "step": 1099 }, { "epoch": 0.5789473684210527, "grad_norm": 1.4630690813064575, "learning_rate": 2.9769230005469785e-05, "loss": 1.5564, "step": 1100 }, { "epoch": 0.5794736842105264, "grad_norm": 2.549288034439087, "learning_rate": 2.9768795507545444e-05, "loss": 0.8852, "step": 1101 }, { "epoch": 0.58, "grad_norm": 3.0548112392425537, "learning_rate": 2.97683606041423e-05, "loss": 1.2653, "step": 1102 }, { "epoch": 0.5805263157894737, "grad_norm": 2.3279740810394287, "learning_rate": 2.9767925295272292e-05, "loss": 1.3083, "step": 1103 }, { "epoch": 0.5810526315789474, "grad_norm": 1.0182585716247559, "learning_rate": 2.9767489580947375e-05, "loss": 0.0161, "step": 1104 }, { "epoch": 0.5815789473684211, "grad_norm": 2.9629054069519043, "learning_rate": 2.976705346117951e-05, "loss": 1.6623, "step": 1105 }, { "epoch": 0.5821052631578948, "grad_norm": 5.767422199249268, "learning_rate": 2.9766616935980668e-05, "loss": 0.9786, "step": 1106 }, { "epoch": 0.5826315789473684, "grad_norm": 4.677647590637207, "learning_rate": 2.9766180005362835e-05, "loss": 0.3723, "step": 1107 }, { "epoch": 0.5831578947368421, "grad_norm": 1.2974311113357544, "learning_rate": 2.9765742669338012e-05, "loss": 0.793, "step": 1108 }, { "epoch": 0.5836842105263158, "grad_norm": 1.323523759841919, "learning_rate": 2.97653049279182e-05, "loss": 1.2581, "step": 1109 }, { "epoch": 0.5842105263157895, "grad_norm": 2.9345805644989014, "learning_rate": 2.9764866781115417e-05, "loss": 0.7428, "step": 1110 }, { "epoch": 0.5847368421052631, "grad_norm": 8.754313468933105, "learning_rate": 2.9764428228941702e-05, "loss": 1.5404, "step": 1111 }, { "epoch": 0.5852631578947368, "grad_norm": 5.4265875816345215, "learning_rate": 2.976398927140908e-05, "loss": 2.3934, "step": 1112 }, { "epoch": 0.5857894736842105, "grad_norm": 3.4678874015808105, "learning_rate": 2.9763549908529614e-05, "loss": 1.079, "step": 1113 }, { "epoch": 0.5863157894736842, "grad_norm": 9.305590629577637, "learning_rate": 2.9763110140315365e-05, "loss": 1.6112, "step": 1114 }, { "epoch": 0.5868421052631579, "grad_norm": 2.989252805709839, "learning_rate": 2.9762669966778406e-05, "loss": 0.6553, "step": 1115 }, { "epoch": 0.5873684210526315, "grad_norm": 1.2564352750778198, "learning_rate": 2.9762229387930817e-05, "loss": 1.0478, "step": 1116 }, { "epoch": 0.5878947368421052, "grad_norm": 2.9239699840545654, "learning_rate": 2.9761788403784703e-05, "loss": 1.4082, "step": 1117 }, { "epoch": 0.588421052631579, "grad_norm": 2.38325834274292, "learning_rate": 2.9761347014352168e-05, "loss": 1.1565, "step": 1118 }, { "epoch": 0.5889473684210527, "grad_norm": 4.686885356903076, "learning_rate": 2.9760905219645325e-05, "loss": 1.3522, "step": 1119 }, { "epoch": 0.5894736842105263, "grad_norm": 1.1427439451217651, "learning_rate": 2.976046301967631e-05, "loss": 0.7014, "step": 1120 }, { "epoch": 0.59, "grad_norm": 5.191329002380371, "learning_rate": 2.976002041445726e-05, "loss": 1.1254, "step": 1121 }, { "epoch": 0.5905263157894737, "grad_norm": 1.5960139036178589, "learning_rate": 2.9759577404000332e-05, "loss": 1.5535, "step": 1122 }, { "epoch": 0.5910526315789474, "grad_norm": 1.104758858680725, "learning_rate": 2.975913398831768e-05, "loss": 1.2053, "step": 1123 }, { "epoch": 0.5915789473684211, "grad_norm": 8.011280059814453, "learning_rate": 2.9758690167421486e-05, "loss": 1.5689, "step": 1124 }, { "epoch": 0.5921052631578947, "grad_norm": 2.1890242099761963, "learning_rate": 2.9758245941323932e-05, "loss": 1.3737, "step": 1125 }, { "epoch": 0.5926315789473684, "grad_norm": 5.8035502433776855, "learning_rate": 2.9757801310037218e-05, "loss": 1.4497, "step": 1126 }, { "epoch": 0.5931578947368421, "grad_norm": 3.329540967941284, "learning_rate": 2.9757356273573543e-05, "loss": 0.3897, "step": 1127 }, { "epoch": 0.5936842105263158, "grad_norm": 3.944429874420166, "learning_rate": 2.9756910831945136e-05, "loss": 1.0921, "step": 1128 }, { "epoch": 0.5942105263157895, "grad_norm": 3.396190643310547, "learning_rate": 2.9756464985164214e-05, "loss": 0.4456, "step": 1129 }, { "epoch": 0.5947368421052631, "grad_norm": 1.586104393005371, "learning_rate": 2.9756018733243033e-05, "loss": 1.401, "step": 1130 }, { "epoch": 0.5952631578947368, "grad_norm": 1.2480674982070923, "learning_rate": 2.9755572076193833e-05, "loss": 1.3656, "step": 1131 }, { "epoch": 0.5957894736842105, "grad_norm": 1.0409289598464966, "learning_rate": 2.9755125014028876e-05, "loss": 1.1957, "step": 1132 }, { "epoch": 0.5963157894736842, "grad_norm": 1.0716015100479126, "learning_rate": 2.9754677546760444e-05, "loss": 0.9709, "step": 1133 }, { "epoch": 0.5968421052631578, "grad_norm": 1.9948137998580933, "learning_rate": 2.9754229674400822e-05, "loss": 1.1416, "step": 1134 }, { "epoch": 0.5973684210526315, "grad_norm": 1.064031720161438, "learning_rate": 2.9753781396962303e-05, "loss": 0.9325, "step": 1135 }, { "epoch": 0.5978947368421053, "grad_norm": 2.7508461475372314, "learning_rate": 2.9753332714457193e-05, "loss": 0.1521, "step": 1136 }, { "epoch": 0.598421052631579, "grad_norm": 2.104701519012451, "learning_rate": 2.9752883626897813e-05, "loss": 1.3733, "step": 1137 }, { "epoch": 0.5989473684210527, "grad_norm": 1.180884838104248, "learning_rate": 2.9752434134296494e-05, "loss": 1.0614, "step": 1138 }, { "epoch": 0.5994736842105263, "grad_norm": 1.3136661052703857, "learning_rate": 2.9751984236665578e-05, "loss": 0.761, "step": 1139 }, { "epoch": 0.6, "grad_norm": 5.025672435760498, "learning_rate": 2.975153393401741e-05, "loss": 0.8013, "step": 1140 }, { "epoch": 0.6005263157894737, "grad_norm": 0.5844347476959229, "learning_rate": 2.975108322636436e-05, "loss": 0.0122, "step": 1141 }, { "epoch": 0.6010526315789474, "grad_norm": 1.406011700630188, "learning_rate": 2.9750632113718795e-05, "loss": 1.2908, "step": 1142 }, { "epoch": 0.6015789473684211, "grad_norm": 51.026851654052734, "learning_rate": 2.9750180596093108e-05, "loss": 2.5605, "step": 1143 }, { "epoch": 0.6021052631578947, "grad_norm": 1.4514580965042114, "learning_rate": 2.9749728673499695e-05, "loss": 1.0439, "step": 1144 }, { "epoch": 0.6026315789473684, "grad_norm": 1.7348302602767944, "learning_rate": 2.9749276345950957e-05, "loss": 1.8712, "step": 1145 }, { "epoch": 0.6031578947368421, "grad_norm": 2.1238996982574463, "learning_rate": 2.974882361345932e-05, "loss": 1.0086, "step": 1146 }, { "epoch": 0.6036842105263158, "grad_norm": 3.6767418384552, "learning_rate": 2.974837047603721e-05, "loss": 0.2294, "step": 1147 }, { "epoch": 0.6042105263157894, "grad_norm": 7.874570369720459, "learning_rate": 2.9747916933697066e-05, "loss": 2.2632, "step": 1148 }, { "epoch": 0.6047368421052631, "grad_norm": 8.193496704101562, "learning_rate": 2.9747462986451347e-05, "loss": 1.793, "step": 1149 }, { "epoch": 0.6052631578947368, "grad_norm": 1.6370347738265991, "learning_rate": 2.9747008634312508e-05, "loss": 1.4331, "step": 1150 }, { "epoch": 0.6057894736842105, "grad_norm": 2.3346762657165527, "learning_rate": 2.9746553877293025e-05, "loss": 1.4356, "step": 1151 }, { "epoch": 0.6063157894736843, "grad_norm": 3.7243125438690186, "learning_rate": 2.9746098715405394e-05, "loss": 1.8991, "step": 1152 }, { "epoch": 0.6068421052631578, "grad_norm": 4.2178168296813965, "learning_rate": 2.9745643148662095e-05, "loss": 1.3015, "step": 1153 }, { "epoch": 0.6073684210526316, "grad_norm": 2.0063540935516357, "learning_rate": 2.9745187177075645e-05, "loss": 1.0399, "step": 1154 }, { "epoch": 0.6078947368421053, "grad_norm": 3.9274532794952393, "learning_rate": 2.9744730800658563e-05, "loss": 0.9394, "step": 1155 }, { "epoch": 0.608421052631579, "grad_norm": 0.9116756319999695, "learning_rate": 2.9744274019423375e-05, "loss": 0.7879, "step": 1156 }, { "epoch": 0.6089473684210527, "grad_norm": 1.725326418876648, "learning_rate": 2.9743816833382627e-05, "loss": 1.2359, "step": 1157 }, { "epoch": 0.6094736842105263, "grad_norm": 2.4110519886016846, "learning_rate": 2.9743359242548865e-05, "loss": 1.0438, "step": 1158 }, { "epoch": 0.61, "grad_norm": 3.515199661254883, "learning_rate": 2.9742901246934657e-05, "loss": 1.0046, "step": 1159 }, { "epoch": 0.6105263157894737, "grad_norm": 3.636535167694092, "learning_rate": 2.9742442846552578e-05, "loss": 1.0394, "step": 1160 }, { "epoch": 0.6110526315789474, "grad_norm": 8.082115173339844, "learning_rate": 2.974198404141521e-05, "loss": 0.8219, "step": 1161 }, { "epoch": 0.611578947368421, "grad_norm": 1.6529840230941772, "learning_rate": 2.9741524831535147e-05, "loss": 0.0603, "step": 1162 }, { "epoch": 0.6121052631578947, "grad_norm": 1.2058558464050293, "learning_rate": 2.9741065216925002e-05, "loss": 0.8499, "step": 1163 }, { "epoch": 0.6126315789473684, "grad_norm": 1.5454597473144531, "learning_rate": 2.97406051975974e-05, "loss": 0.7998, "step": 1164 }, { "epoch": 0.6131578947368421, "grad_norm": 1.1322457790374756, "learning_rate": 2.9740144773564952e-05, "loss": 1.1103, "step": 1165 }, { "epoch": 0.6136842105263158, "grad_norm": 1.2677356004714966, "learning_rate": 2.9739683944840315e-05, "loss": 0.7271, "step": 1166 }, { "epoch": 0.6142105263157894, "grad_norm": 29.934062957763672, "learning_rate": 2.9739222711436137e-05, "loss": 2.5771, "step": 1167 }, { "epoch": 0.6147368421052631, "grad_norm": 1.108261227607727, "learning_rate": 2.973876107336508e-05, "loss": 1.017, "step": 1168 }, { "epoch": 0.6152631578947368, "grad_norm": 4.538265705108643, "learning_rate": 2.9738299030639818e-05, "loss": 1.3243, "step": 1169 }, { "epoch": 0.6157894736842106, "grad_norm": 1.448798418045044, "learning_rate": 2.9737836583273037e-05, "loss": 1.1748, "step": 1170 }, { "epoch": 0.6163157894736843, "grad_norm": 1.834809422492981, "learning_rate": 2.9737373731277432e-05, "loss": 1.1335, "step": 1171 }, { "epoch": 0.6168421052631579, "grad_norm": 1.8159127235412598, "learning_rate": 2.9736910474665714e-05, "loss": 1.3667, "step": 1172 }, { "epoch": 0.6173684210526316, "grad_norm": 1.2512363195419312, "learning_rate": 2.9736446813450603e-05, "loss": 1.1666, "step": 1173 }, { "epoch": 0.6178947368421053, "grad_norm": 13.280810356140137, "learning_rate": 2.9735982747644817e-05, "loss": 1.3808, "step": 1174 }, { "epoch": 0.618421052631579, "grad_norm": 10.25016975402832, "learning_rate": 2.9735518277261113e-05, "loss": 0.7421, "step": 1175 }, { "epoch": 0.6189473684210526, "grad_norm": 2.516862630844116, "learning_rate": 2.9735053402312234e-05, "loss": 1.9148, "step": 1176 }, { "epoch": 0.6194736842105263, "grad_norm": 1.1309010982513428, "learning_rate": 2.973458812281095e-05, "loss": 1.2431, "step": 1177 }, { "epoch": 0.62, "grad_norm": 2.400505542755127, "learning_rate": 2.9734122438770023e-05, "loss": 1.3809, "step": 1178 }, { "epoch": 0.6205263157894737, "grad_norm": 1.2216683626174927, "learning_rate": 2.9733656350202248e-05, "loss": 0.7622, "step": 1179 }, { "epoch": 0.6210526315789474, "grad_norm": 1.942233920097351, "learning_rate": 2.973318985712042e-05, "loss": 1.2894, "step": 1180 }, { "epoch": 0.621578947368421, "grad_norm": 3.7196786403656006, "learning_rate": 2.9732722959537345e-05, "loss": 0.9568, "step": 1181 }, { "epoch": 0.6221052631578947, "grad_norm": 1.3381714820861816, "learning_rate": 2.973225565746585e-05, "loss": 1.1391, "step": 1182 }, { "epoch": 0.6226315789473684, "grad_norm": 3.379469871520996, "learning_rate": 2.973178795091875e-05, "loss": 1.4507, "step": 1183 }, { "epoch": 0.6231578947368421, "grad_norm": 4.584420680999756, "learning_rate": 2.9731319839908895e-05, "loss": 0.5189, "step": 1184 }, { "epoch": 0.6236842105263158, "grad_norm": 1.164870023727417, "learning_rate": 2.9730851324449133e-05, "loss": 1.2698, "step": 1185 }, { "epoch": 0.6242105263157894, "grad_norm": 2.1792705059051514, "learning_rate": 2.9730382404552334e-05, "loss": 0.9649, "step": 1186 }, { "epoch": 0.6247368421052631, "grad_norm": 10.801166534423828, "learning_rate": 2.9729913080231364e-05, "loss": 1.006, "step": 1187 }, { "epoch": 0.6252631578947369, "grad_norm": 6.744489669799805, "learning_rate": 2.9729443351499118e-05, "loss": 1.5334, "step": 1188 }, { "epoch": 0.6257894736842106, "grad_norm": 0.9813660979270935, "learning_rate": 2.9728973218368477e-05, "loss": 0.8807, "step": 1189 }, { "epoch": 0.6263157894736842, "grad_norm": 1.4452024698257446, "learning_rate": 2.9728502680852368e-05, "loss": 1.2235, "step": 1190 }, { "epoch": 0.6268421052631579, "grad_norm": 1.5050314664840698, "learning_rate": 2.9728031738963695e-05, "loss": 0.9459, "step": 1191 }, { "epoch": 0.6273684210526316, "grad_norm": 1.1202926635742188, "learning_rate": 2.9727560392715395e-05, "loss": 0.7596, "step": 1192 }, { "epoch": 0.6278947368421053, "grad_norm": 0.803450345993042, "learning_rate": 2.9727088642120406e-05, "loss": 0.3567, "step": 1193 }, { "epoch": 0.628421052631579, "grad_norm": 1.9323444366455078, "learning_rate": 2.9726616487191675e-05, "loss": 1.1573, "step": 1194 }, { "epoch": 0.6289473684210526, "grad_norm": 1.0270202159881592, "learning_rate": 2.9726143927942176e-05, "loss": 1.09, "step": 1195 }, { "epoch": 0.6294736842105263, "grad_norm": 12.028121948242188, "learning_rate": 2.9725670964384876e-05, "loss": 0.8462, "step": 1196 }, { "epoch": 0.63, "grad_norm": 7.259978771209717, "learning_rate": 2.9725197596532768e-05, "loss": 1.0018, "step": 1197 }, { "epoch": 0.6305263157894737, "grad_norm": 1.1468654870986938, "learning_rate": 2.9724723824398838e-05, "loss": 1.1135, "step": 1198 }, { "epoch": 0.6310526315789474, "grad_norm": 1.0924807786941528, "learning_rate": 2.9724249647996095e-05, "loss": 1.2861, "step": 1199 }, { "epoch": 0.631578947368421, "grad_norm": 1.762115478515625, "learning_rate": 2.972377506733756e-05, "loss": 1.2612, "step": 1200 }, { "epoch": 0.6321052631578947, "grad_norm": 1.1973563432693481, "learning_rate": 2.9723300082436266e-05, "loss": 1.042, "step": 1201 }, { "epoch": 0.6326315789473684, "grad_norm": 1.2056132555007935, "learning_rate": 2.972282469330525e-05, "loss": 0.8789, "step": 1202 }, { "epoch": 0.6331578947368421, "grad_norm": 0.9773816466331482, "learning_rate": 2.9722348899957564e-05, "loss": 0.9174, "step": 1203 }, { "epoch": 0.6336842105263157, "grad_norm": 2.236300468444824, "learning_rate": 2.972187270240627e-05, "loss": 1.404, "step": 1204 }, { "epoch": 0.6342105263157894, "grad_norm": 1.2097399234771729, "learning_rate": 2.9721396100664447e-05, "loss": 1.2159, "step": 1205 }, { "epoch": 0.6347368421052632, "grad_norm": 1.800818681716919, "learning_rate": 2.9720919094745176e-05, "loss": 0.8479, "step": 1206 }, { "epoch": 0.6352631578947369, "grad_norm": 2.954768180847168, "learning_rate": 2.9720441684661552e-05, "loss": 1.3383, "step": 1207 }, { "epoch": 0.6357894736842106, "grad_norm": 2.48077130317688, "learning_rate": 2.9719963870426685e-05, "loss": 1.2594, "step": 1208 }, { "epoch": 0.6363157894736842, "grad_norm": 4.164140701293945, "learning_rate": 2.9719485652053696e-05, "loss": 1.3007, "step": 1209 }, { "epoch": 0.6368421052631579, "grad_norm": 4.807854175567627, "learning_rate": 2.971900702955571e-05, "loss": 0.2724, "step": 1210 }, { "epoch": 0.6373684210526316, "grad_norm": 2.658689498901367, "learning_rate": 2.971852800294587e-05, "loss": 1.4455, "step": 1211 }, { "epoch": 0.6378947368421053, "grad_norm": 5.018266677856445, "learning_rate": 2.9718048572237323e-05, "loss": 1.5204, "step": 1212 }, { "epoch": 0.638421052631579, "grad_norm": 7.115269184112549, "learning_rate": 2.9717568737443236e-05, "loss": 1.763, "step": 1213 }, { "epoch": 0.6389473684210526, "grad_norm": 1.1763184070587158, "learning_rate": 2.9717088498576787e-05, "loss": 0.7555, "step": 1214 }, { "epoch": 0.6394736842105263, "grad_norm": 0.8004067540168762, "learning_rate": 2.9716607855651154e-05, "loss": 0.5879, "step": 1215 }, { "epoch": 0.64, "grad_norm": 0.9092245697975159, "learning_rate": 2.971612680867953e-05, "loss": 0.9113, "step": 1216 }, { "epoch": 0.6405263157894737, "grad_norm": 1.03469717502594, "learning_rate": 2.9715645357675133e-05, "loss": 0.8598, "step": 1217 }, { "epoch": 0.6410526315789473, "grad_norm": 13.371722221374512, "learning_rate": 2.9715163502651176e-05, "loss": 0.6834, "step": 1218 }, { "epoch": 0.641578947368421, "grad_norm": 1.745549201965332, "learning_rate": 2.9714681243620885e-05, "loss": 1.4798, "step": 1219 }, { "epoch": 0.6421052631578947, "grad_norm": 2.9852797985076904, "learning_rate": 2.971419858059751e-05, "loss": 1.6255, "step": 1220 }, { "epoch": 0.6426315789473684, "grad_norm": 1.1600061655044556, "learning_rate": 2.971371551359429e-05, "loss": 0.9992, "step": 1221 }, { "epoch": 0.6431578947368422, "grad_norm": 1.1885265111923218, "learning_rate": 2.97132320426245e-05, "loss": 0.9063, "step": 1222 }, { "epoch": 0.6436842105263157, "grad_norm": 1.2887687683105469, "learning_rate": 2.9712748167701406e-05, "loss": 1.0904, "step": 1223 }, { "epoch": 0.6442105263157895, "grad_norm": 3.543919324874878, "learning_rate": 2.9712263888838295e-05, "loss": 0.6287, "step": 1224 }, { "epoch": 0.6447368421052632, "grad_norm": 1.4697569608688354, "learning_rate": 2.9711779206048457e-05, "loss": 1.6434, "step": 1225 }, { "epoch": 0.6452631578947369, "grad_norm": 1.5276075601577759, "learning_rate": 2.9711294119345212e-05, "loss": 0.9159, "step": 1226 }, { "epoch": 0.6457894736842106, "grad_norm": 1.8441684246063232, "learning_rate": 2.9710808628741866e-05, "loss": 1.2164, "step": 1227 }, { "epoch": 0.6463157894736842, "grad_norm": 5.690790176391602, "learning_rate": 2.9710322734251753e-05, "loss": 0.818, "step": 1228 }, { "epoch": 0.6468421052631579, "grad_norm": 2.064948081970215, "learning_rate": 2.9709836435888213e-05, "loss": 1.5381, "step": 1229 }, { "epoch": 0.6473684210526316, "grad_norm": 1.3582369089126587, "learning_rate": 2.9709349733664602e-05, "loss": 0.4742, "step": 1230 }, { "epoch": 0.6478947368421053, "grad_norm": 1.6712125539779663, "learning_rate": 2.9708862627594275e-05, "loss": 0.9667, "step": 1231 }, { "epoch": 0.6484210526315789, "grad_norm": 5.520822048187256, "learning_rate": 2.970837511769061e-05, "loss": 0.7173, "step": 1232 }, { "epoch": 0.6489473684210526, "grad_norm": 3.7994682788848877, "learning_rate": 2.9707887203966986e-05, "loss": 0.8053, "step": 1233 }, { "epoch": 0.6494736842105263, "grad_norm": 1.3790806531906128, "learning_rate": 2.970739888643681e-05, "loss": 1.2264, "step": 1234 }, { "epoch": 0.65, "grad_norm": 2.789325475692749, "learning_rate": 2.9706910165113477e-05, "loss": 0.6273, "step": 1235 }, { "epoch": 0.6505263157894737, "grad_norm": 2.6303861141204834, "learning_rate": 2.970642104001041e-05, "loss": 0.0705, "step": 1236 }, { "epoch": 0.6510526315789473, "grad_norm": 6.236796855926514, "learning_rate": 2.9705931511141037e-05, "loss": 0.147, "step": 1237 }, { "epoch": 0.651578947368421, "grad_norm": 7.691266059875488, "learning_rate": 2.9705441578518798e-05, "loss": 0.6646, "step": 1238 }, { "epoch": 0.6521052631578947, "grad_norm": 1.5175530910491943, "learning_rate": 2.9704951242157145e-05, "loss": 1.33, "step": 1239 }, { "epoch": 0.6526315789473685, "grad_norm": 1.3474842309951782, "learning_rate": 2.9704460502069544e-05, "loss": 0.8195, "step": 1240 }, { "epoch": 0.6531578947368422, "grad_norm": 4.029512882232666, "learning_rate": 2.9703969358269462e-05, "loss": 0.3568, "step": 1241 }, { "epoch": 0.6536842105263158, "grad_norm": 1.3466565608978271, "learning_rate": 2.9703477810770384e-05, "loss": 1.259, "step": 1242 }, { "epoch": 0.6542105263157895, "grad_norm": 2.5386829376220703, "learning_rate": 2.970298585958581e-05, "loss": 1.0745, "step": 1243 }, { "epoch": 0.6547368421052632, "grad_norm": 2.428055763244629, "learning_rate": 2.9702493504729244e-05, "loss": 0.8593, "step": 1244 }, { "epoch": 0.6552631578947369, "grad_norm": 1.3094781637191772, "learning_rate": 2.97020007462142e-05, "loss": 0.8212, "step": 1245 }, { "epoch": 0.6557894736842105, "grad_norm": 1.4779995679855347, "learning_rate": 2.970150758405421e-05, "loss": 1.0186, "step": 1246 }, { "epoch": 0.6563157894736842, "grad_norm": 1.1934393644332886, "learning_rate": 2.9701014018262817e-05, "loss": 1.1701, "step": 1247 }, { "epoch": 0.6568421052631579, "grad_norm": 0.7196000218391418, "learning_rate": 2.9700520048853566e-05, "loss": 0.0174, "step": 1248 }, { "epoch": 0.6573684210526316, "grad_norm": 0.9140834808349609, "learning_rate": 2.9700025675840028e-05, "loss": 0.9861, "step": 1249 }, { "epoch": 0.6578947368421053, "grad_norm": 4.055169582366943, "learning_rate": 2.969953089923577e-05, "loss": 0.4162, "step": 1250 }, { "epoch": 0.6584210526315789, "grad_norm": 0.5134720802307129, "learning_rate": 2.9699035719054367e-05, "loss": 0.0143, "step": 1251 }, { "epoch": 0.6589473684210526, "grad_norm": 3.9289088249206543, "learning_rate": 2.9698540135309434e-05, "loss": 0.5238, "step": 1252 }, { "epoch": 0.6594736842105263, "grad_norm": 6.439306735992432, "learning_rate": 2.969804414801456e-05, "loss": 0.549, "step": 1253 }, { "epoch": 0.66, "grad_norm": 2.025859832763672, "learning_rate": 2.969754775718337e-05, "loss": 0.9216, "step": 1254 }, { "epoch": 0.6605263157894737, "grad_norm": 1.8443411588668823, "learning_rate": 2.969705096282949e-05, "loss": 0.1711, "step": 1255 }, { "epoch": 0.6610526315789473, "grad_norm": 1.5247849225997925, "learning_rate": 2.9696553764966562e-05, "loss": 1.1315, "step": 1256 }, { "epoch": 0.661578947368421, "grad_norm": 3.6713950634002686, "learning_rate": 2.9696056163608237e-05, "loss": 0.467, "step": 1257 }, { "epoch": 0.6621052631578948, "grad_norm": 0.790451169013977, "learning_rate": 2.969555815876818e-05, "loss": 0.0082, "step": 1258 }, { "epoch": 0.6626315789473685, "grad_norm": 1.4589107036590576, "learning_rate": 2.9695059750460052e-05, "loss": 0.6006, "step": 1259 }, { "epoch": 0.6631578947368421, "grad_norm": 1.3655271530151367, "learning_rate": 2.9694560938697548e-05, "loss": 1.2306, "step": 1260 }, { "epoch": 0.6636842105263158, "grad_norm": 1.3729549646377563, "learning_rate": 2.9694061723494358e-05, "loss": 0.7385, "step": 1261 }, { "epoch": 0.6642105263157895, "grad_norm": 4.8193864822387695, "learning_rate": 2.9693562104864182e-05, "loss": 0.7079, "step": 1262 }, { "epoch": 0.6647368421052632, "grad_norm": 1.563333511352539, "learning_rate": 2.9693062082820752e-05, "loss": 0.5117, "step": 1263 }, { "epoch": 0.6652631578947369, "grad_norm": 1.3271199464797974, "learning_rate": 2.969256165737779e-05, "loss": 1.0954, "step": 1264 }, { "epoch": 0.6657894736842105, "grad_norm": 4.284012794494629, "learning_rate": 2.9692060828549025e-05, "loss": 1.9864, "step": 1265 }, { "epoch": 0.6663157894736842, "grad_norm": 3.3707847595214844, "learning_rate": 2.969155959634822e-05, "loss": 1.0695, "step": 1266 }, { "epoch": 0.6668421052631579, "grad_norm": 2.1350784301757812, "learning_rate": 2.9691057960789133e-05, "loss": 1.2387, "step": 1267 }, { "epoch": 0.6673684210526316, "grad_norm": 1.350180745124817, "learning_rate": 2.9690555921885534e-05, "loss": 1.1326, "step": 1268 }, { "epoch": 0.6678947368421052, "grad_norm": 1.3490583896636963, "learning_rate": 2.9690053479651207e-05, "loss": 0.7916, "step": 1269 }, { "epoch": 0.6684210526315789, "grad_norm": 5.74185848236084, "learning_rate": 2.968955063409995e-05, "loss": 0.9312, "step": 1270 }, { "epoch": 0.6689473684210526, "grad_norm": 1.1950428485870361, "learning_rate": 2.968904738524557e-05, "loss": 0.8421, "step": 1271 }, { "epoch": 0.6694736842105263, "grad_norm": 1.2086502313613892, "learning_rate": 2.9688543733101876e-05, "loss": 1.205, "step": 1272 }, { "epoch": 0.67, "grad_norm": 2.652189254760742, "learning_rate": 2.96880396776827e-05, "loss": 1.0971, "step": 1273 }, { "epoch": 0.6705263157894736, "grad_norm": 1.5372319221496582, "learning_rate": 2.968753521900188e-05, "loss": 1.4361, "step": 1274 }, { "epoch": 0.6710526315789473, "grad_norm": 3.254009485244751, "learning_rate": 2.9687030357073265e-05, "loss": 1.5015, "step": 1275 }, { "epoch": 0.671578947368421, "grad_norm": 1.3365062475204468, "learning_rate": 2.968652509191072e-05, "loss": 1.2392, "step": 1276 }, { "epoch": 0.6721052631578948, "grad_norm": 8.001225471496582, "learning_rate": 2.9686019423528117e-05, "loss": 1.1131, "step": 1277 }, { "epoch": 0.6726315789473685, "grad_norm": 4.288140296936035, "learning_rate": 2.9685513351939335e-05, "loss": 1.2558, "step": 1278 }, { "epoch": 0.6731578947368421, "grad_norm": 0.9908925294876099, "learning_rate": 2.968500687715827e-05, "loss": 1.0251, "step": 1279 }, { "epoch": 0.6736842105263158, "grad_norm": 1.2662602663040161, "learning_rate": 2.968449999919883e-05, "loss": 1.0004, "step": 1280 }, { "epoch": 0.6742105263157895, "grad_norm": 1.4708739519119263, "learning_rate": 2.9683992718074926e-05, "loss": 1.3322, "step": 1281 }, { "epoch": 0.6747368421052632, "grad_norm": 1.1707433462142944, "learning_rate": 2.9683485033800494e-05, "loss": 1.3583, "step": 1282 }, { "epoch": 0.6752631578947368, "grad_norm": 0.060910388827323914, "learning_rate": 2.9682976946389463e-05, "loss": 0.0018, "step": 1283 }, { "epoch": 0.6757894736842105, "grad_norm": 1.6072977781295776, "learning_rate": 2.9682468455855783e-05, "loss": 0.0838, "step": 1284 }, { "epoch": 0.6763157894736842, "grad_norm": 1.179344892501831, "learning_rate": 2.9681959562213422e-05, "loss": 1.3671, "step": 1285 }, { "epoch": 0.6768421052631579, "grad_norm": 6.977228164672852, "learning_rate": 2.9681450265476345e-05, "loss": 0.5456, "step": 1286 }, { "epoch": 0.6773684210526316, "grad_norm": 1.475023865699768, "learning_rate": 2.968094056565854e-05, "loss": 1.0063, "step": 1287 }, { "epoch": 0.6778947368421052, "grad_norm": 2.039761781692505, "learning_rate": 2.9680430462774e-05, "loss": 0.3243, "step": 1288 }, { "epoch": 0.6784210526315789, "grad_norm": 1.093194603919983, "learning_rate": 2.967991995683673e-05, "loss": 1.2607, "step": 1289 }, { "epoch": 0.6789473684210526, "grad_norm": 24.999378204345703, "learning_rate": 2.9679409047860743e-05, "loss": 1.5342, "step": 1290 }, { "epoch": 0.6794736842105263, "grad_norm": 1.6621330976486206, "learning_rate": 2.9678897735860066e-05, "loss": 1.8376, "step": 1291 }, { "epoch": 0.68, "grad_norm": 3.57814621925354, "learning_rate": 2.9678386020848742e-05, "loss": 1.4534, "step": 1292 }, { "epoch": 0.6805263157894736, "grad_norm": 1.6566349267959595, "learning_rate": 2.9677873902840813e-05, "loss": 0.049, "step": 1293 }, { "epoch": 0.6810526315789474, "grad_norm": 1.090785264968872, "learning_rate": 2.9677361381850345e-05, "loss": 1.0962, "step": 1294 }, { "epoch": 0.6815789473684211, "grad_norm": 1.6356092691421509, "learning_rate": 2.9676848457891407e-05, "loss": 0.6644, "step": 1295 }, { "epoch": 0.6821052631578948, "grad_norm": 1.247987151145935, "learning_rate": 2.9676335130978082e-05, "loss": 0.8771, "step": 1296 }, { "epoch": 0.6826315789473684, "grad_norm": 1.4328747987747192, "learning_rate": 2.9675821401124465e-05, "loss": 1.0978, "step": 1297 }, { "epoch": 0.6831578947368421, "grad_norm": 6.744469165802002, "learning_rate": 2.967530726834466e-05, "loss": 0.8294, "step": 1298 }, { "epoch": 0.6836842105263158, "grad_norm": 1.233546495437622, "learning_rate": 2.9674792732652773e-05, "loss": 0.9119, "step": 1299 }, { "epoch": 0.6842105263157895, "grad_norm": 1.5304890871047974, "learning_rate": 2.967427779406295e-05, "loss": 1.2472, "step": 1300 }, { "epoch": 0.6847368421052632, "grad_norm": 1.6759600639343262, "learning_rate": 2.9673762452589307e-05, "loss": 1.0797, "step": 1301 }, { "epoch": 0.6852631578947368, "grad_norm": 2.450378894805908, "learning_rate": 2.967324670824601e-05, "loss": 0.7401, "step": 1302 }, { "epoch": 0.6857894736842105, "grad_norm": 0.631327748298645, "learning_rate": 2.9672730561047214e-05, "loss": 0.0149, "step": 1303 }, { "epoch": 0.6863157894736842, "grad_norm": 2.2037277221679688, "learning_rate": 2.9672214011007087e-05, "loss": 0.9195, "step": 1304 }, { "epoch": 0.6868421052631579, "grad_norm": 20.760221481323242, "learning_rate": 2.967169705813981e-05, "loss": 2.7392, "step": 1305 }, { "epoch": 0.6873684210526316, "grad_norm": 1.0082573890686035, "learning_rate": 2.9671179702459576e-05, "loss": 1.0737, "step": 1306 }, { "epoch": 0.6878947368421052, "grad_norm": 2.9095327854156494, "learning_rate": 2.9670661943980595e-05, "loss": 1.4494, "step": 1307 }, { "epoch": 0.6884210526315789, "grad_norm": 0.9664977788925171, "learning_rate": 2.9670143782717075e-05, "loss": 1.1525, "step": 1308 }, { "epoch": 0.6889473684210526, "grad_norm": 0.6551389098167419, "learning_rate": 2.966962521868325e-05, "loss": 0.0284, "step": 1309 }, { "epoch": 0.6894736842105263, "grad_norm": 6.767817497253418, "learning_rate": 2.966910625189335e-05, "loss": 0.253, "step": 1310 }, { "epoch": 0.69, "grad_norm": 4.445407867431641, "learning_rate": 2.9668586882361625e-05, "loss": 1.5918, "step": 1311 }, { "epoch": 0.6905263157894737, "grad_norm": 3.130711078643799, "learning_rate": 2.9668067110102338e-05, "loss": 1.0437, "step": 1312 }, { "epoch": 0.6910526315789474, "grad_norm": 1.7217767238616943, "learning_rate": 2.9667546935129757e-05, "loss": 0.5301, "step": 1313 }, { "epoch": 0.6915789473684211, "grad_norm": 6.565402984619141, "learning_rate": 2.966702635745816e-05, "loss": 1.4053, "step": 1314 }, { "epoch": 0.6921052631578948, "grad_norm": 1.4495912790298462, "learning_rate": 2.9666505377101845e-05, "loss": 0.8034, "step": 1315 }, { "epoch": 0.6926315789473684, "grad_norm": 1.2847636938095093, "learning_rate": 2.9665983994075113e-05, "loss": 1.0825, "step": 1316 }, { "epoch": 0.6931578947368421, "grad_norm": 1.0292222499847412, "learning_rate": 2.966546220839228e-05, "loss": 1.1681, "step": 1317 }, { "epoch": 0.6936842105263158, "grad_norm": 1.537844181060791, "learning_rate": 2.966494002006767e-05, "loss": 1.6044, "step": 1318 }, { "epoch": 0.6942105263157895, "grad_norm": 4.573411464691162, "learning_rate": 2.966441742911562e-05, "loss": 0.7895, "step": 1319 }, { "epoch": 0.6947368421052632, "grad_norm": 1.3798898458480835, "learning_rate": 2.9663894435550477e-05, "loss": 0.5207, "step": 1320 }, { "epoch": 0.6952631578947368, "grad_norm": 1.8828356266021729, "learning_rate": 2.96633710393866e-05, "loss": 0.4363, "step": 1321 }, { "epoch": 0.6957894736842105, "grad_norm": 4.514081954956055, "learning_rate": 2.966284724063836e-05, "loss": 0.9505, "step": 1322 }, { "epoch": 0.6963157894736842, "grad_norm": 6.411378860473633, "learning_rate": 2.966232303932014e-05, "loss": 0.2739, "step": 1323 }, { "epoch": 0.6968421052631579, "grad_norm": 3.237457036972046, "learning_rate": 2.9661798435446325e-05, "loss": 0.2196, "step": 1324 }, { "epoch": 0.6973684210526315, "grad_norm": 2.3053643703460693, "learning_rate": 2.966127342903133e-05, "loss": 1.4729, "step": 1325 }, { "epoch": 0.6978947368421052, "grad_norm": 1.2993172407150269, "learning_rate": 2.9660748020089555e-05, "loss": 0.8165, "step": 1326 }, { "epoch": 0.6984210526315789, "grad_norm": 1.4997568130493164, "learning_rate": 2.9660222208635438e-05, "loss": 1.0576, "step": 1327 }, { "epoch": 0.6989473684210527, "grad_norm": 6.542171001434326, "learning_rate": 2.9659695994683404e-05, "loss": 0.7472, "step": 1328 }, { "epoch": 0.6994736842105264, "grad_norm": 2.04111647605896, "learning_rate": 2.965916937824791e-05, "loss": 1.0342, "step": 1329 }, { "epoch": 0.7, "grad_norm": 3.6200339794158936, "learning_rate": 2.965864235934341e-05, "loss": 0.9009, "step": 1330 }, { "epoch": 0.7005263157894737, "grad_norm": 3.2301101684570312, "learning_rate": 2.965811493798437e-05, "loss": 1.1608, "step": 1331 }, { "epoch": 0.7010526315789474, "grad_norm": 13.88673210144043, "learning_rate": 2.9657587114185272e-05, "loss": 2.5288, "step": 1332 }, { "epoch": 0.7015789473684211, "grad_norm": 3.9736039638519287, "learning_rate": 2.9657058887960613e-05, "loss": 1.0553, "step": 1333 }, { "epoch": 0.7021052631578948, "grad_norm": 24.85307502746582, "learning_rate": 2.965653025932489e-05, "loss": 0.916, "step": 1334 }, { "epoch": 0.7026315789473684, "grad_norm": 4.859387397766113, "learning_rate": 2.965600122829262e-05, "loss": 0.9876, "step": 1335 }, { "epoch": 0.7031578947368421, "grad_norm": 2.863323211669922, "learning_rate": 2.9655471794878325e-05, "loss": 0.8016, "step": 1336 }, { "epoch": 0.7036842105263158, "grad_norm": 2.4096951484680176, "learning_rate": 2.9654941959096543e-05, "loss": 0.6945, "step": 1337 }, { "epoch": 0.7042105263157895, "grad_norm": 1.333808183670044, "learning_rate": 2.9654411720961816e-05, "loss": 1.1654, "step": 1338 }, { "epoch": 0.7047368421052631, "grad_norm": 6.172205448150635, "learning_rate": 2.9653881080488705e-05, "loss": 0.5594, "step": 1339 }, { "epoch": 0.7052631578947368, "grad_norm": 5.650049686431885, "learning_rate": 2.9653350037691777e-05, "loss": 0.8009, "step": 1340 }, { "epoch": 0.7057894736842105, "grad_norm": 1.504169225692749, "learning_rate": 2.9652818592585616e-05, "loss": 1.1546, "step": 1341 }, { "epoch": 0.7063157894736842, "grad_norm": 2.2842626571655273, "learning_rate": 2.965228674518481e-05, "loss": 0.9654, "step": 1342 }, { "epoch": 0.7068421052631579, "grad_norm": 1.429457187652588, "learning_rate": 2.965175449550396e-05, "loss": 1.3076, "step": 1343 }, { "epoch": 0.7073684210526315, "grad_norm": 1.1388136148452759, "learning_rate": 2.9651221843557682e-05, "loss": 0.9347, "step": 1344 }, { "epoch": 0.7078947368421052, "grad_norm": 2.501084566116333, "learning_rate": 2.96506887893606e-05, "loss": 1.11, "step": 1345 }, { "epoch": 0.708421052631579, "grad_norm": 27.422794342041016, "learning_rate": 2.9650155332927343e-05, "loss": 7.5874, "step": 1346 }, { "epoch": 0.7089473684210527, "grad_norm": 5.256887912750244, "learning_rate": 2.9649621474272564e-05, "loss": 1.4157, "step": 1347 }, { "epoch": 0.7094736842105264, "grad_norm": 1.0665907859802246, "learning_rate": 2.9649087213410918e-05, "loss": 0.8372, "step": 1348 }, { "epoch": 0.71, "grad_norm": 6.498046398162842, "learning_rate": 2.964855255035707e-05, "loss": 1.2181, "step": 1349 }, { "epoch": 0.7105263157894737, "grad_norm": 2.614759683609009, "learning_rate": 2.9648017485125708e-05, "loss": 0.1237, "step": 1350 }, { "epoch": 0.7110526315789474, "grad_norm": 1.4356591701507568, "learning_rate": 2.9647482017731508e-05, "loss": 1.2685, "step": 1351 }, { "epoch": 0.7115789473684211, "grad_norm": 1.245197057723999, "learning_rate": 2.964694614818918e-05, "loss": 1.1176, "step": 1352 }, { "epoch": 0.7121052631578947, "grad_norm": 1.7546439170837402, "learning_rate": 2.9646409876513444e-05, "loss": 1.8027, "step": 1353 }, { "epoch": 0.7126315789473684, "grad_norm": 6.851596355438232, "learning_rate": 2.9645873202719013e-05, "loss": 0.9774, "step": 1354 }, { "epoch": 0.7131578947368421, "grad_norm": 2.6459946632385254, "learning_rate": 2.9645336126820616e-05, "loss": 1.5187, "step": 1355 }, { "epoch": 0.7136842105263158, "grad_norm": 11.114506721496582, "learning_rate": 2.9644798648833013e-05, "loss": 1.5982, "step": 1356 }, { "epoch": 0.7142105263157895, "grad_norm": 1.1266565322875977, "learning_rate": 2.9644260768770953e-05, "loss": 1.043, "step": 1357 }, { "epoch": 0.7147368421052631, "grad_norm": 1.3332453966140747, "learning_rate": 2.9643722486649203e-05, "loss": 0.9823, "step": 1358 }, { "epoch": 0.7152631578947368, "grad_norm": 7.991135120391846, "learning_rate": 2.9643183802482542e-05, "loss": 1.8691, "step": 1359 }, { "epoch": 0.7157894736842105, "grad_norm": 1.4576728343963623, "learning_rate": 2.9642644716285765e-05, "loss": 1.2554, "step": 1360 }, { "epoch": 0.7163157894736842, "grad_norm": 24.029621124267578, "learning_rate": 2.9642105228073662e-05, "loss": 1.4993, "step": 1361 }, { "epoch": 0.716842105263158, "grad_norm": 1.7569326162338257, "learning_rate": 2.9641565337861055e-05, "loss": 0.6511, "step": 1362 }, { "epoch": 0.7173684210526315, "grad_norm": 0.8566625118255615, "learning_rate": 2.9641025045662765e-05, "loss": 0.0214, "step": 1363 }, { "epoch": 0.7178947368421053, "grad_norm": 1.8023531436920166, "learning_rate": 2.9640484351493616e-05, "loss": 1.3085, "step": 1364 }, { "epoch": 0.718421052631579, "grad_norm": 1.1481828689575195, "learning_rate": 2.9639943255368468e-05, "loss": 1.3198, "step": 1365 }, { "epoch": 0.7189473684210527, "grad_norm": 2.6472365856170654, "learning_rate": 2.9639401757302162e-05, "loss": 0.2092, "step": 1366 }, { "epoch": 0.7194736842105263, "grad_norm": 1.8456323146820068, "learning_rate": 2.9638859857309574e-05, "loss": 1.0272, "step": 1367 }, { "epoch": 0.72, "grad_norm": 3.3874828815460205, "learning_rate": 2.9638317555405582e-05, "loss": 0.9814, "step": 1368 }, { "epoch": 0.7205263157894737, "grad_norm": 1.234138011932373, "learning_rate": 2.9637774851605073e-05, "loss": 0.8918, "step": 1369 }, { "epoch": 0.7210526315789474, "grad_norm": 2.1231067180633545, "learning_rate": 2.9637231745922942e-05, "loss": 0.8428, "step": 1370 }, { "epoch": 0.7215789473684211, "grad_norm": 2.287672281265259, "learning_rate": 2.963668823837411e-05, "loss": 1.5556, "step": 1371 }, { "epoch": 0.7221052631578947, "grad_norm": 1.185556173324585, "learning_rate": 2.9636144328973495e-05, "loss": 0.8633, "step": 1372 }, { "epoch": 0.7226315789473684, "grad_norm": 5.523141860961914, "learning_rate": 2.9635600017736024e-05, "loss": 0.7142, "step": 1373 }, { "epoch": 0.7231578947368421, "grad_norm": 12.063275337219238, "learning_rate": 2.9635055304676647e-05, "loss": 0.3757, "step": 1374 }, { "epoch": 0.7236842105263158, "grad_norm": 4.580484867095947, "learning_rate": 2.963451018981032e-05, "loss": 1.5171, "step": 1375 }, { "epoch": 0.7242105263157895, "grad_norm": 1.9087570905685425, "learning_rate": 2.9633964673152004e-05, "loss": 1.3982, "step": 1376 }, { "epoch": 0.7247368421052631, "grad_norm": 2.3344743251800537, "learning_rate": 2.9633418754716682e-05, "loss": 1.0389, "step": 1377 }, { "epoch": 0.7252631578947368, "grad_norm": 4.389303207397461, "learning_rate": 2.9632872434519342e-05, "loss": 0.3592, "step": 1378 }, { "epoch": 0.7257894736842105, "grad_norm": 4.647209644317627, "learning_rate": 2.963232571257498e-05, "loss": 2.2252, "step": 1379 }, { "epoch": 0.7263157894736842, "grad_norm": 2.98408579826355, "learning_rate": 2.9631778588898606e-05, "loss": 1.3005, "step": 1380 }, { "epoch": 0.7268421052631578, "grad_norm": 10.173641204833984, "learning_rate": 2.9631231063505245e-05, "loss": 0.4029, "step": 1381 }, { "epoch": 0.7273684210526316, "grad_norm": 2.366199254989624, "learning_rate": 2.963068313640992e-05, "loss": 1.408, "step": 1382 }, { "epoch": 0.7278947368421053, "grad_norm": 0.9890458583831787, "learning_rate": 2.963013480762769e-05, "loss": 0.941, "step": 1383 }, { "epoch": 0.728421052631579, "grad_norm": 2.4400086402893066, "learning_rate": 2.96295860771736e-05, "loss": 0.303, "step": 1384 }, { "epoch": 0.7289473684210527, "grad_norm": 7.326104640960693, "learning_rate": 2.9629036945062715e-05, "loss": 0.1054, "step": 1385 }, { "epoch": 0.7294736842105263, "grad_norm": 0.08032506704330444, "learning_rate": 2.9628487411310113e-05, "loss": 0.0038, "step": 1386 }, { "epoch": 0.73, "grad_norm": 1.0595088005065918, "learning_rate": 2.962793747593088e-05, "loss": 1.1041, "step": 1387 }, { "epoch": 0.7305263157894737, "grad_norm": 0.06474526226520538, "learning_rate": 2.9627387138940117e-05, "loss": 0.0022, "step": 1388 }, { "epoch": 0.7310526315789474, "grad_norm": 1.7989740371704102, "learning_rate": 2.9626836400352932e-05, "loss": 0.8054, "step": 1389 }, { "epoch": 0.7315789473684211, "grad_norm": 0.8924505710601807, "learning_rate": 2.962628526018445e-05, "loss": 1.0706, "step": 1390 }, { "epoch": 0.7321052631578947, "grad_norm": 1.269217848777771, "learning_rate": 2.9625733718449792e-05, "loss": 0.7377, "step": 1391 }, { "epoch": 0.7326315789473684, "grad_norm": 2.0392255783081055, "learning_rate": 2.962518177516411e-05, "loss": 0.5988, "step": 1392 }, { "epoch": 0.7331578947368421, "grad_norm": 9.548563003540039, "learning_rate": 2.9624629430342557e-05, "loss": 0.437, "step": 1393 }, { "epoch": 0.7336842105263158, "grad_norm": 2.005530595779419, "learning_rate": 2.9624076684000292e-05, "loss": 1.4294, "step": 1394 }, { "epoch": 0.7342105263157894, "grad_norm": 8.551248550415039, "learning_rate": 2.96235235361525e-05, "loss": 1.0252, "step": 1395 }, { "epoch": 0.7347368421052631, "grad_norm": 2.2077114582061768, "learning_rate": 2.962296998681436e-05, "loss": 0.749, "step": 1396 }, { "epoch": 0.7352631578947368, "grad_norm": 1.9976532459259033, "learning_rate": 2.962241603600107e-05, "loss": 0.6527, "step": 1397 }, { "epoch": 0.7357894736842105, "grad_norm": 1.972212314605713, "learning_rate": 2.962186168372784e-05, "loss": 1.1974, "step": 1398 }, { "epoch": 0.7363157894736843, "grad_norm": 0.9594762921333313, "learning_rate": 2.962130693000989e-05, "loss": 1.5027, "step": 1399 }, { "epoch": 0.7368421052631579, "grad_norm": 1.018154501914978, "learning_rate": 2.9620751774862456e-05, "loss": 1.1123, "step": 1400 }, { "epoch": 0.7373684210526316, "grad_norm": 1.4608852863311768, "learning_rate": 2.962019621830077e-05, "loss": 1.2949, "step": 1401 }, { "epoch": 0.7378947368421053, "grad_norm": 19.72736167907715, "learning_rate": 2.9619640260340092e-05, "loss": 1.1407, "step": 1402 }, { "epoch": 0.738421052631579, "grad_norm": 2.0531411170959473, "learning_rate": 2.9619083900995684e-05, "loss": 1.2994, "step": 1403 }, { "epoch": 0.7389473684210527, "grad_norm": 1.4088972806930542, "learning_rate": 2.961852714028282e-05, "loss": 1.0896, "step": 1404 }, { "epoch": 0.7394736842105263, "grad_norm": 0.31528720259666443, "learning_rate": 2.961796997821679e-05, "loss": 0.0114, "step": 1405 }, { "epoch": 0.74, "grad_norm": 6.371922969818115, "learning_rate": 2.9617412414812883e-05, "loss": 0.6344, "step": 1406 }, { "epoch": 0.7405263157894737, "grad_norm": 2.7792177200317383, "learning_rate": 2.9616854450086415e-05, "loss": 0.0651, "step": 1407 }, { "epoch": 0.7410526315789474, "grad_norm": 1.3516690731048584, "learning_rate": 2.9616296084052698e-05, "loss": 0.9898, "step": 1408 }, { "epoch": 0.741578947368421, "grad_norm": 5.8592448234558105, "learning_rate": 2.961573731672707e-05, "loss": 1.8225, "step": 1409 }, { "epoch": 0.7421052631578947, "grad_norm": 1.515937089920044, "learning_rate": 2.9615178148124867e-05, "loss": 0.8826, "step": 1410 }, { "epoch": 0.7426315789473684, "grad_norm": 1.1607184410095215, "learning_rate": 2.9614618578261436e-05, "loss": 0.9627, "step": 1411 }, { "epoch": 0.7431578947368421, "grad_norm": 1.7677443027496338, "learning_rate": 2.9614058607152153e-05, "loss": 1.2879, "step": 1412 }, { "epoch": 0.7436842105263158, "grad_norm": 1.718802809715271, "learning_rate": 2.9613498234812378e-05, "loss": 0.9766, "step": 1413 }, { "epoch": 0.7442105263157894, "grad_norm": 1.4020949602127075, "learning_rate": 2.9612937461257504e-05, "loss": 1.319, "step": 1414 }, { "epoch": 0.7447368421052631, "grad_norm": 1.2930610179901123, "learning_rate": 2.9612376286502934e-05, "loss": 1.0887, "step": 1415 }, { "epoch": 0.7452631578947368, "grad_norm": 1.6021417379379272, "learning_rate": 2.961181471056406e-05, "loss": 0.861, "step": 1416 }, { "epoch": 0.7457894736842106, "grad_norm": 1.3957011699676514, "learning_rate": 2.9611252733456306e-05, "loss": 0.8134, "step": 1417 }, { "epoch": 0.7463157894736843, "grad_norm": 2.1977591514587402, "learning_rate": 2.9610690355195108e-05, "loss": 1.5508, "step": 1418 }, { "epoch": 0.7468421052631579, "grad_norm": 2.2483537197113037, "learning_rate": 2.9610127575795894e-05, "loss": 0.7797, "step": 1419 }, { "epoch": 0.7473684210526316, "grad_norm": 1.5289298295974731, "learning_rate": 2.9609564395274125e-05, "loss": 1.4425, "step": 1420 }, { "epoch": 0.7478947368421053, "grad_norm": 1.3084803819656372, "learning_rate": 2.9609000813645257e-05, "loss": 1.117, "step": 1421 }, { "epoch": 0.748421052631579, "grad_norm": 2.422757863998413, "learning_rate": 2.960843683092477e-05, "loss": 1.7103, "step": 1422 }, { "epoch": 0.7489473684210526, "grad_norm": 1.4796887636184692, "learning_rate": 2.9607872447128142e-05, "loss": 1.0487, "step": 1423 }, { "epoch": 0.7494736842105263, "grad_norm": 1.0981144905090332, "learning_rate": 2.960730766227087e-05, "loss": 0.5913, "step": 1424 }, { "epoch": 0.75, "grad_norm": 1.8127541542053223, "learning_rate": 2.9606742476368464e-05, "loss": 0.8062, "step": 1425 }, { "epoch": 0.75, "eval_loss": 1.0191138982772827, "eval_runtime": 12.9816, "eval_samples_per_second": 7.703, "eval_steps_per_second": 7.703, "step": 1425 }, { "epoch": 0.7505263157894737, "grad_norm": 2.0109336376190186, "learning_rate": 2.9606176889436435e-05, "loss": 1.5174, "step": 1426 }, { "epoch": 0.7510526315789474, "grad_norm": 1.7353535890579224, "learning_rate": 2.9605610901490312e-05, "loss": 1.0043, "step": 1427 }, { "epoch": 0.751578947368421, "grad_norm": 2.5789828300476074, "learning_rate": 2.960504451254564e-05, "loss": 1.056, "step": 1428 }, { "epoch": 0.7521052631578947, "grad_norm": 10.612245559692383, "learning_rate": 2.960447772261796e-05, "loss": 1.3747, "step": 1429 }, { "epoch": 0.7526315789473684, "grad_norm": 1.5246334075927734, "learning_rate": 2.960391053172285e-05, "loss": 1.2934, "step": 1430 }, { "epoch": 0.7531578947368421, "grad_norm": 1.2563579082489014, "learning_rate": 2.9603342939875863e-05, "loss": 1.1843, "step": 1431 }, { "epoch": 0.7536842105263157, "grad_norm": 2.347818613052368, "learning_rate": 2.960277494709259e-05, "loss": 0.9651, "step": 1432 }, { "epoch": 0.7542105263157894, "grad_norm": 1.2420134544372559, "learning_rate": 2.960220655338863e-05, "loss": 0.8367, "step": 1433 }, { "epoch": 0.7547368421052632, "grad_norm": 1.5213626623153687, "learning_rate": 2.9601637758779577e-05, "loss": 1.2834, "step": 1434 }, { "epoch": 0.7552631578947369, "grad_norm": 3.097379684448242, "learning_rate": 2.9601068563281063e-05, "loss": 1.1477, "step": 1435 }, { "epoch": 0.7557894736842106, "grad_norm": 1.0866178274154663, "learning_rate": 2.9600498966908702e-05, "loss": 1.1422, "step": 1436 }, { "epoch": 0.7563157894736842, "grad_norm": 1.3330771923065186, "learning_rate": 2.959992896967814e-05, "loss": 1.2408, "step": 1437 }, { "epoch": 0.7568421052631579, "grad_norm": 2.1840741634368896, "learning_rate": 2.959935857160502e-05, "loss": 0.7088, "step": 1438 }, { "epoch": 0.7573684210526316, "grad_norm": 1.8869200944900513, "learning_rate": 2.9598787772705006e-05, "loss": 1.0891, "step": 1439 }, { "epoch": 0.7578947368421053, "grad_norm": 1.4808701276779175, "learning_rate": 2.959821657299377e-05, "loss": 1.7629, "step": 1440 }, { "epoch": 0.758421052631579, "grad_norm": 1.3917648792266846, "learning_rate": 2.959764497248699e-05, "loss": 1.3897, "step": 1441 }, { "epoch": 0.7589473684210526, "grad_norm": 1.1825001239776611, "learning_rate": 2.9597072971200366e-05, "loss": 1.1223, "step": 1442 }, { "epoch": 0.7594736842105263, "grad_norm": 1.76896071434021, "learning_rate": 2.9596500569149603e-05, "loss": 1.6206, "step": 1443 }, { "epoch": 0.76, "grad_norm": 1.2272676229476929, "learning_rate": 2.9595927766350406e-05, "loss": 1.1395, "step": 1444 }, { "epoch": 0.7605263157894737, "grad_norm": 4.527194499969482, "learning_rate": 2.959535456281851e-05, "loss": 0.4346, "step": 1445 }, { "epoch": 0.7610526315789473, "grad_norm": 1.3327759504318237, "learning_rate": 2.959478095856965e-05, "loss": 0.9762, "step": 1446 }, { "epoch": 0.761578947368421, "grad_norm": 2.397428512573242, "learning_rate": 2.959420695361958e-05, "loss": 0.7909, "step": 1447 }, { "epoch": 0.7621052631578947, "grad_norm": 1.3565646409988403, "learning_rate": 2.9593632547984047e-05, "loss": 0.7872, "step": 1448 }, { "epoch": 0.7626315789473684, "grad_norm": 0.9641793370246887, "learning_rate": 2.9593057741678832e-05, "loss": 0.8944, "step": 1449 }, { "epoch": 0.7631578947368421, "grad_norm": 2.6557819843292236, "learning_rate": 2.959248253471971e-05, "loss": 0.8311, "step": 1450 }, { "epoch": 0.7636842105263157, "grad_norm": 4.478829860687256, "learning_rate": 2.9591906927122477e-05, "loss": 0.4445, "step": 1451 }, { "epoch": 0.7642105263157895, "grad_norm": 2.393716335296631, "learning_rate": 2.9591330918902935e-05, "loss": 1.7126, "step": 1452 }, { "epoch": 0.7647368421052632, "grad_norm": 1.553025722503662, "learning_rate": 2.95907545100769e-05, "loss": 1.1763, "step": 1453 }, { "epoch": 0.7652631578947369, "grad_norm": 1.0929442644119263, "learning_rate": 2.9590177700660193e-05, "loss": 1.14, "step": 1454 }, { "epoch": 0.7657894736842106, "grad_norm": 7.241302013397217, "learning_rate": 2.9589600490668655e-05, "loss": 1.111, "step": 1455 }, { "epoch": 0.7663157894736842, "grad_norm": 0.972644567489624, "learning_rate": 2.9589022880118133e-05, "loss": 1.1578, "step": 1456 }, { "epoch": 0.7668421052631579, "grad_norm": 7.614607810974121, "learning_rate": 2.9588444869024484e-05, "loss": 1.0112, "step": 1457 }, { "epoch": 0.7673684210526316, "grad_norm": 1.8197392225265503, "learning_rate": 2.9587866457403577e-05, "loss": 1.2878, "step": 1458 }, { "epoch": 0.7678947368421053, "grad_norm": 1.1428899765014648, "learning_rate": 2.9587287645271293e-05, "loss": 0.7216, "step": 1459 }, { "epoch": 0.7684210526315789, "grad_norm": 1.5418213605880737, "learning_rate": 2.958670843264353e-05, "loss": 0.9176, "step": 1460 }, { "epoch": 0.7689473684210526, "grad_norm": 6.577853679656982, "learning_rate": 2.9586128819536172e-05, "loss": 1.3075, "step": 1461 }, { "epoch": 0.7694736842105263, "grad_norm": 1.7671222686767578, "learning_rate": 2.958554880596515e-05, "loss": 1.0772, "step": 1462 }, { "epoch": 0.77, "grad_norm": 1.2378417253494263, "learning_rate": 2.9584968391946378e-05, "loss": 1.048, "step": 1463 }, { "epoch": 0.7705263157894737, "grad_norm": 1.9807356595993042, "learning_rate": 2.9584387577495803e-05, "loss": 1.6231, "step": 1464 }, { "epoch": 0.7710526315789473, "grad_norm": 1.2167612314224243, "learning_rate": 2.958380636262936e-05, "loss": 1.5038, "step": 1465 }, { "epoch": 0.771578947368421, "grad_norm": 1.2333711385726929, "learning_rate": 2.9583224747363008e-05, "loss": 0.6763, "step": 1466 }, { "epoch": 0.7721052631578947, "grad_norm": 1.9440076351165771, "learning_rate": 2.958264273171272e-05, "loss": 1.8108, "step": 1467 }, { "epoch": 0.7726315789473684, "grad_norm": 1.7781847715377808, "learning_rate": 2.958206031569447e-05, "loss": 1.5469, "step": 1468 }, { "epoch": 0.7731578947368422, "grad_norm": 1.5457514524459839, "learning_rate": 2.9581477499324254e-05, "loss": 1.1067, "step": 1469 }, { "epoch": 0.7736842105263158, "grad_norm": 6.0052103996276855, "learning_rate": 2.9580894282618073e-05, "loss": 0.5737, "step": 1470 }, { "epoch": 0.7742105263157895, "grad_norm": 1.827090859413147, "learning_rate": 2.9580310665591933e-05, "loss": 1.1473, "step": 1471 }, { "epoch": 0.7747368421052632, "grad_norm": 5.207357883453369, "learning_rate": 2.9579726648261862e-05, "loss": 2.123, "step": 1472 }, { "epoch": 0.7752631578947369, "grad_norm": 4.914318084716797, "learning_rate": 2.957914223064389e-05, "loss": 1.4473, "step": 1473 }, { "epoch": 0.7757894736842105, "grad_norm": 4.295709609985352, "learning_rate": 2.9578557412754067e-05, "loss": 0.1821, "step": 1474 }, { "epoch": 0.7763157894736842, "grad_norm": 14.05141830444336, "learning_rate": 2.9577972194608453e-05, "loss": 1.7767, "step": 1475 }, { "epoch": 0.7768421052631579, "grad_norm": 3.5937631130218506, "learning_rate": 2.9577386576223105e-05, "loss": 0.8665, "step": 1476 }, { "epoch": 0.7773684210526316, "grad_norm": 1.2602375745773315, "learning_rate": 2.9576800557614103e-05, "loss": 1.6036, "step": 1477 }, { "epoch": 0.7778947368421053, "grad_norm": 6.486451148986816, "learning_rate": 2.9576214138797544e-05, "loss": 0.8866, "step": 1478 }, { "epoch": 0.7784210526315789, "grad_norm": 2.9001505374908447, "learning_rate": 2.9575627319789523e-05, "loss": 1.085, "step": 1479 }, { "epoch": 0.7789473684210526, "grad_norm": 1.1088868379592896, "learning_rate": 2.957504010060615e-05, "loss": 1.0509, "step": 1480 }, { "epoch": 0.7794736842105263, "grad_norm": 2.7946763038635254, "learning_rate": 2.9574452481263553e-05, "loss": 0.1451, "step": 1481 }, { "epoch": 0.78, "grad_norm": 1.2279927730560303, "learning_rate": 2.9573864461777856e-05, "loss": 0.9693, "step": 1482 }, { "epoch": 0.7805263157894737, "grad_norm": 9.112462997436523, "learning_rate": 2.957327604216521e-05, "loss": 0.4725, "step": 1483 }, { "epoch": 0.7810526315789473, "grad_norm": 4.834194183349609, "learning_rate": 2.957268722244177e-05, "loss": 0.7559, "step": 1484 }, { "epoch": 0.781578947368421, "grad_norm": 1.3210901021957397, "learning_rate": 2.9572098002623697e-05, "loss": 0.9653, "step": 1485 }, { "epoch": 0.7821052631578947, "grad_norm": 6.407845973968506, "learning_rate": 2.9571508382727173e-05, "loss": 2.3573, "step": 1486 }, { "epoch": 0.7826315789473685, "grad_norm": 1.6584391593933105, "learning_rate": 2.9570918362768386e-05, "loss": 1.6321, "step": 1487 }, { "epoch": 0.783157894736842, "grad_norm": 1.5938304662704468, "learning_rate": 2.9570327942763535e-05, "loss": 1.1299, "step": 1488 }, { "epoch": 0.7836842105263158, "grad_norm": 0.9389225840568542, "learning_rate": 2.9569737122728823e-05, "loss": 0.9162, "step": 1489 }, { "epoch": 0.7842105263157895, "grad_norm": 2.557969331741333, "learning_rate": 2.956914590268048e-05, "loss": 1.1852, "step": 1490 }, { "epoch": 0.7847368421052632, "grad_norm": 1.1017169952392578, "learning_rate": 2.9568554282634733e-05, "loss": 1.3037, "step": 1491 }, { "epoch": 0.7852631578947369, "grad_norm": 1.8029855489730835, "learning_rate": 2.956796226260783e-05, "loss": 1.9646, "step": 1492 }, { "epoch": 0.7857894736842105, "grad_norm": 10.63049030303955, "learning_rate": 2.9567369842616015e-05, "loss": 0.8632, "step": 1493 }, { "epoch": 0.7863157894736842, "grad_norm": 6.249732494354248, "learning_rate": 2.9566777022675563e-05, "loss": 0.5931, "step": 1494 }, { "epoch": 0.7868421052631579, "grad_norm": 3.90535306930542, "learning_rate": 2.956618380280275e-05, "loss": 1.2675, "step": 1495 }, { "epoch": 0.7873684210526316, "grad_norm": 5.921347618103027, "learning_rate": 2.9565590183013855e-05, "loss": 0.2349, "step": 1496 }, { "epoch": 0.7878947368421053, "grad_norm": 0.9792536497116089, "learning_rate": 2.9564996163325186e-05, "loss": 1.1103, "step": 1497 }, { "epoch": 0.7884210526315789, "grad_norm": 3.9059622287750244, "learning_rate": 2.956440174375304e-05, "loss": 0.0963, "step": 1498 }, { "epoch": 0.7889473684210526, "grad_norm": 2.304387092590332, "learning_rate": 2.9563806924313746e-05, "loss": 1.1423, "step": 1499 }, { "epoch": 0.7894736842105263, "grad_norm": 6.4439263343811035, "learning_rate": 2.956321170502363e-05, "loss": 1.7772, "step": 1500 }, { "epoch": 0.79, "grad_norm": 2.323302745819092, "learning_rate": 2.956261608589904e-05, "loss": 0.9714, "step": 1501 }, { "epoch": 0.7905263157894736, "grad_norm": 2.1680712699890137, "learning_rate": 2.956202006695632e-05, "loss": 1.3609, "step": 1502 }, { "epoch": 0.7910526315789473, "grad_norm": 1.4925416707992554, "learning_rate": 2.9561423648211842e-05, "loss": 1.1193, "step": 1503 }, { "epoch": 0.791578947368421, "grad_norm": 2.1561765670776367, "learning_rate": 2.956082682968197e-05, "loss": 1.3798, "step": 1504 }, { "epoch": 0.7921052631578948, "grad_norm": 2.159775495529175, "learning_rate": 2.9560229611383104e-05, "loss": 1.2499, "step": 1505 }, { "epoch": 0.7926315789473685, "grad_norm": 2.9001150131225586, "learning_rate": 2.955963199333163e-05, "loss": 1.2439, "step": 1506 }, { "epoch": 0.7931578947368421, "grad_norm": 20.27045440673828, "learning_rate": 2.9559033975543962e-05, "loss": 1.3729, "step": 1507 }, { "epoch": 0.7936842105263158, "grad_norm": 1.0026558637619019, "learning_rate": 2.9558435558036507e-05, "loss": 0.8089, "step": 1508 }, { "epoch": 0.7942105263157895, "grad_norm": 2.685729503631592, "learning_rate": 2.955783674082571e-05, "loss": 0.9878, "step": 1509 }, { "epoch": 0.7947368421052632, "grad_norm": 15.882140159606934, "learning_rate": 2.9557237523928005e-05, "loss": 2.2313, "step": 1510 }, { "epoch": 0.7952631578947369, "grad_norm": 1.5289243459701538, "learning_rate": 2.9556637907359845e-05, "loss": 0.6709, "step": 1511 }, { "epoch": 0.7957894736842105, "grad_norm": 1.3777986764907837, "learning_rate": 2.9556037891137686e-05, "loss": 0.8706, "step": 1512 }, { "epoch": 0.7963157894736842, "grad_norm": 3.52500319480896, "learning_rate": 2.9555437475278013e-05, "loss": 2.0162, "step": 1513 }, { "epoch": 0.7968421052631579, "grad_norm": 1.4890594482421875, "learning_rate": 2.95548366597973e-05, "loss": 0.8976, "step": 1514 }, { "epoch": 0.7973684210526316, "grad_norm": 3.3992414474487305, "learning_rate": 2.9554235444712045e-05, "loss": 0.5914, "step": 1515 }, { "epoch": 0.7978947368421052, "grad_norm": 4.6464152336120605, "learning_rate": 2.9553633830038757e-05, "loss": 2.0883, "step": 1516 }, { "epoch": 0.7984210526315789, "grad_norm": 2.2460274696350098, "learning_rate": 2.955303181579395e-05, "loss": 1.3262, "step": 1517 }, { "epoch": 0.7989473684210526, "grad_norm": 1.4003204107284546, "learning_rate": 2.955242940199416e-05, "loss": 1.3034, "step": 1518 }, { "epoch": 0.7994736842105263, "grad_norm": 1.23296058177948, "learning_rate": 2.955182658865592e-05, "loss": 1.2704, "step": 1519 }, { "epoch": 0.8, "grad_norm": 1.1843321323394775, "learning_rate": 2.9551223375795778e-05, "loss": 1.0394, "step": 1520 }, { "epoch": 0.8005263157894736, "grad_norm": 1.3480000495910645, "learning_rate": 2.95506197634303e-05, "loss": 1.1087, "step": 1521 }, { "epoch": 0.8010526315789473, "grad_norm": 1.0556117296218872, "learning_rate": 2.9550015751576055e-05, "loss": 0.9642, "step": 1522 }, { "epoch": 0.8015789473684211, "grad_norm": 5.54053258895874, "learning_rate": 2.9549411340249627e-05, "loss": 1.3033, "step": 1523 }, { "epoch": 0.8021052631578948, "grad_norm": 3.8652968406677246, "learning_rate": 2.9548806529467617e-05, "loss": 1.0023, "step": 1524 }, { "epoch": 0.8026315789473685, "grad_norm": 3.5290725231170654, "learning_rate": 2.954820131924662e-05, "loss": 0.5537, "step": 1525 }, { "epoch": 0.8031578947368421, "grad_norm": 0.9234074950218201, "learning_rate": 2.9547595709603255e-05, "loss": 0.9543, "step": 1526 }, { "epoch": 0.8036842105263158, "grad_norm": 13.975532531738281, "learning_rate": 2.9546989700554154e-05, "loss": 1.2983, "step": 1527 }, { "epoch": 0.8042105263157895, "grad_norm": 1.407110333442688, "learning_rate": 2.9546383292115947e-05, "loss": 1.2283, "step": 1528 }, { "epoch": 0.8047368421052632, "grad_norm": 1.1215810775756836, "learning_rate": 2.9545776484305293e-05, "loss": 0.9641, "step": 1529 }, { "epoch": 0.8052631578947368, "grad_norm": 2.0094728469848633, "learning_rate": 2.9545169277138845e-05, "loss": 1.0771, "step": 1530 }, { "epoch": 0.8057894736842105, "grad_norm": 3.301666021347046, "learning_rate": 2.9544561670633272e-05, "loss": 0.5507, "step": 1531 }, { "epoch": 0.8063157894736842, "grad_norm": 2.8523879051208496, "learning_rate": 2.954395366480526e-05, "loss": 0.414, "step": 1532 }, { "epoch": 0.8068421052631579, "grad_norm": 5.4080986976623535, "learning_rate": 2.9543345259671505e-05, "loss": 0.9862, "step": 1533 }, { "epoch": 0.8073684210526316, "grad_norm": 3.0157690048217773, "learning_rate": 2.9542736455248702e-05, "loss": 1.1812, "step": 1534 }, { "epoch": 0.8078947368421052, "grad_norm": 2.6960086822509766, "learning_rate": 2.9542127251553573e-05, "loss": 0.8758, "step": 1535 }, { "epoch": 0.8084210526315789, "grad_norm": 1.4676748514175415, "learning_rate": 2.954151764860284e-05, "loss": 1.2065, "step": 1536 }, { "epoch": 0.8089473684210526, "grad_norm": 1.893027424812317, "learning_rate": 2.9540907646413247e-05, "loss": 1.7415, "step": 1537 }, { "epoch": 0.8094736842105263, "grad_norm": 2.425906181335449, "learning_rate": 2.954029724500153e-05, "loss": 1.0721, "step": 1538 }, { "epoch": 0.81, "grad_norm": 4.737662315368652, "learning_rate": 2.953968644438445e-05, "loss": 1.0819, "step": 1539 }, { "epoch": 0.8105263157894737, "grad_norm": 2.0641164779663086, "learning_rate": 2.9539075244578793e-05, "loss": 1.6513, "step": 1540 }, { "epoch": 0.8110526315789474, "grad_norm": 1.7806847095489502, "learning_rate": 2.9538463645601317e-05, "loss": 1.0087, "step": 1541 }, { "epoch": 0.8115789473684211, "grad_norm": 1.2224940061569214, "learning_rate": 2.9537851647468827e-05, "loss": 1.0176, "step": 1542 }, { "epoch": 0.8121052631578948, "grad_norm": 3.8275363445281982, "learning_rate": 2.953723925019812e-05, "loss": 0.2529, "step": 1543 }, { "epoch": 0.8126315789473684, "grad_norm": 4.72785758972168, "learning_rate": 2.9536626453806008e-05, "loss": 1.2509, "step": 1544 }, { "epoch": 0.8131578947368421, "grad_norm": 1.0455762147903442, "learning_rate": 2.9536013258309323e-05, "loss": 0.766, "step": 1545 }, { "epoch": 0.8136842105263158, "grad_norm": 1.5553256273269653, "learning_rate": 2.9535399663724893e-05, "loss": 1.1175, "step": 1546 }, { "epoch": 0.8142105263157895, "grad_norm": 2.9578685760498047, "learning_rate": 2.953478567006957e-05, "loss": 0.6569, "step": 1547 }, { "epoch": 0.8147368421052632, "grad_norm": 1.4119994640350342, "learning_rate": 2.9534171277360207e-05, "loss": 1.2938, "step": 1548 }, { "epoch": 0.8152631578947368, "grad_norm": 2.524916887283325, "learning_rate": 2.9533556485613673e-05, "loss": 0.4687, "step": 1549 }, { "epoch": 0.8157894736842105, "grad_norm": 2.7491307258605957, "learning_rate": 2.9532941294846848e-05, "loss": 0.9021, "step": 1550 }, { "epoch": 0.8163157894736842, "grad_norm": 3.837688446044922, "learning_rate": 2.9532325705076622e-05, "loss": 0.7552, "step": 1551 }, { "epoch": 0.8168421052631579, "grad_norm": 1.9643511772155762, "learning_rate": 2.9531709716319895e-05, "loss": 1.2493, "step": 1552 }, { "epoch": 0.8173684210526316, "grad_norm": 2.230646848678589, "learning_rate": 2.953109332859358e-05, "loss": 0.1658, "step": 1553 }, { "epoch": 0.8178947368421052, "grad_norm": 4.156702995300293, "learning_rate": 2.9530476541914602e-05, "loss": 0.4678, "step": 1554 }, { "epoch": 0.8184210526315789, "grad_norm": 10.191547393798828, "learning_rate": 2.9529859356299894e-05, "loss": 0.6664, "step": 1555 }, { "epoch": 0.8189473684210526, "grad_norm": 1.6715644598007202, "learning_rate": 2.9529241771766396e-05, "loss": 0.929, "step": 1556 }, { "epoch": 0.8194736842105264, "grad_norm": 1.5630298852920532, "learning_rate": 2.9528623788331067e-05, "loss": 1.5926, "step": 1557 }, { "epoch": 0.82, "grad_norm": 0.9629634618759155, "learning_rate": 2.9528005406010877e-05, "loss": 0.7064, "step": 1558 }, { "epoch": 0.8205263157894737, "grad_norm": 2.5397849082946777, "learning_rate": 2.9527386624822805e-05, "loss": 1.0266, "step": 1559 }, { "epoch": 0.8210526315789474, "grad_norm": 0.8805807828903198, "learning_rate": 2.952676744478383e-05, "loss": 0.1851, "step": 1560 }, { "epoch": 0.8215789473684211, "grad_norm": 1.243837594985962, "learning_rate": 2.952614786591096e-05, "loss": 1.4598, "step": 1561 }, { "epoch": 0.8221052631578948, "grad_norm": 2.9784929752349854, "learning_rate": 2.9525527888221203e-05, "loss": 1.4596, "step": 1562 }, { "epoch": 0.8226315789473684, "grad_norm": 1.0616602897644043, "learning_rate": 2.9524907511731582e-05, "loss": 1.151, "step": 1563 }, { "epoch": 0.8231578947368421, "grad_norm": 3.9415078163146973, "learning_rate": 2.9524286736459125e-05, "loss": 0.7022, "step": 1564 }, { "epoch": 0.8236842105263158, "grad_norm": 4.667412757873535, "learning_rate": 2.9523665562420882e-05, "loss": 1.614, "step": 1565 }, { "epoch": 0.8242105263157895, "grad_norm": 2.3108322620391846, "learning_rate": 2.95230439896339e-05, "loss": 0.8213, "step": 1566 }, { "epoch": 0.8247368421052632, "grad_norm": 1.3090465068817139, "learning_rate": 2.9522422018115254e-05, "loss": 0.9995, "step": 1567 }, { "epoch": 0.8252631578947368, "grad_norm": 2.5572314262390137, "learning_rate": 2.952179964788201e-05, "loss": 0.3968, "step": 1568 }, { "epoch": 0.8257894736842105, "grad_norm": 4.847593784332275, "learning_rate": 2.9521176878951262e-05, "loss": 1.5573, "step": 1569 }, { "epoch": 0.8263157894736842, "grad_norm": 3.584441661834717, "learning_rate": 2.9520553711340107e-05, "loss": 1.6182, "step": 1570 }, { "epoch": 0.8268421052631579, "grad_norm": 0.06894690543413162, "learning_rate": 2.951993014506565e-05, "loss": 0.0021, "step": 1571 }, { "epoch": 0.8273684210526315, "grad_norm": 1.0614222288131714, "learning_rate": 2.951930618014502e-05, "loss": 1.1364, "step": 1572 }, { "epoch": 0.8278947368421052, "grad_norm": 1.2423781156539917, "learning_rate": 2.9518681816595337e-05, "loss": 0.7434, "step": 1573 }, { "epoch": 0.828421052631579, "grad_norm": 1.0514869689941406, "learning_rate": 2.9518057054433753e-05, "loss": 0.9144, "step": 1574 }, { "epoch": 0.8289473684210527, "grad_norm": 6.573333263397217, "learning_rate": 2.951743189367741e-05, "loss": 0.6118, "step": 1575 }, { "epoch": 0.8294736842105264, "grad_norm": 6.8987016677856445, "learning_rate": 2.9516806334343482e-05, "loss": 0.7301, "step": 1576 }, { "epoch": 0.83, "grad_norm": 1.210350513458252, "learning_rate": 2.951618037644914e-05, "loss": 0.9962, "step": 1577 }, { "epoch": 0.8305263157894737, "grad_norm": 4.0031328201293945, "learning_rate": 2.9515554020011567e-05, "loss": 1.1273, "step": 1578 }, { "epoch": 0.8310526315789474, "grad_norm": 6.773041725158691, "learning_rate": 2.9514927265047968e-05, "loss": 2.3973, "step": 1579 }, { "epoch": 0.8315789473684211, "grad_norm": 2.425973892211914, "learning_rate": 2.951430011157554e-05, "loss": 0.6053, "step": 1580 }, { "epoch": 0.8321052631578948, "grad_norm": 6.728580951690674, "learning_rate": 2.951367255961151e-05, "loss": 0.9388, "step": 1581 }, { "epoch": 0.8326315789473684, "grad_norm": 1.9585034847259521, "learning_rate": 2.95130446091731e-05, "loss": 1.0964, "step": 1582 }, { "epoch": 0.8331578947368421, "grad_norm": 1.406274676322937, "learning_rate": 2.9512416260277554e-05, "loss": 1.4249, "step": 1583 }, { "epoch": 0.8336842105263158, "grad_norm": 20.80072593688965, "learning_rate": 2.9511787512942125e-05, "loss": 0.8256, "step": 1584 }, { "epoch": 0.8342105263157895, "grad_norm": 1.4816763401031494, "learning_rate": 2.9511158367184078e-05, "loss": 0.6075, "step": 1585 }, { "epoch": 0.8347368421052631, "grad_norm": 6.253325939178467, "learning_rate": 2.9510528823020683e-05, "loss": 0.124, "step": 1586 }, { "epoch": 0.8352631578947368, "grad_norm": 1.8239108324050903, "learning_rate": 2.9509898880469215e-05, "loss": 0.7881, "step": 1587 }, { "epoch": 0.8357894736842105, "grad_norm": 9.914002418518066, "learning_rate": 2.9509268539546985e-05, "loss": 0.3637, "step": 1588 }, { "epoch": 0.8363157894736842, "grad_norm": 2.900563955307007, "learning_rate": 2.9508637800271293e-05, "loss": 0.9361, "step": 1589 }, { "epoch": 0.8368421052631579, "grad_norm": 1.4834281206130981, "learning_rate": 2.9508006662659447e-05, "loss": 0.4473, "step": 1590 }, { "epoch": 0.8373684210526315, "grad_norm": 2.494849443435669, "learning_rate": 2.9507375126728787e-05, "loss": 1.2612, "step": 1591 }, { "epoch": 0.8378947368421052, "grad_norm": 1.2127383947372437, "learning_rate": 2.950674319249665e-05, "loss": 0.8488, "step": 1592 }, { "epoch": 0.838421052631579, "grad_norm": 1.7087959051132202, "learning_rate": 2.950611085998038e-05, "loss": 0.1118, "step": 1593 }, { "epoch": 0.8389473684210527, "grad_norm": 1.0232149362564087, "learning_rate": 2.9505478129197343e-05, "loss": 0.0376, "step": 1594 }, { "epoch": 0.8394736842105263, "grad_norm": 1.3383780717849731, "learning_rate": 2.950484500016491e-05, "loss": 1.1654, "step": 1595 }, { "epoch": 0.84, "grad_norm": 1.3777598142623901, "learning_rate": 2.950421147290046e-05, "loss": 0.8772, "step": 1596 }, { "epoch": 0.8405263157894737, "grad_norm": 3.801234722137451, "learning_rate": 2.950357754742139e-05, "loss": 1.8463, "step": 1597 }, { "epoch": 0.8410526315789474, "grad_norm": 1.922048568725586, "learning_rate": 2.9502943223745104e-05, "loss": 1.6611, "step": 1598 }, { "epoch": 0.8415789473684211, "grad_norm": 2.133612632751465, "learning_rate": 2.9502308501889016e-05, "loss": 2.0384, "step": 1599 }, { "epoch": 0.8421052631578947, "grad_norm": 3.9198570251464844, "learning_rate": 2.950167338187056e-05, "loss": 1.6621, "step": 1600 }, { "epoch": 0.8426315789473684, "grad_norm": 2.9979448318481445, "learning_rate": 2.950103786370716e-05, "loss": 0.2946, "step": 1601 }, { "epoch": 0.8431578947368421, "grad_norm": 1.4680919647216797, "learning_rate": 2.950040194741627e-05, "loss": 1.1285, "step": 1602 }, { "epoch": 0.8436842105263158, "grad_norm": 2.139376640319824, "learning_rate": 2.9499765633015354e-05, "loss": 0.8158, "step": 1603 }, { "epoch": 0.8442105263157895, "grad_norm": 2.2096030712127686, "learning_rate": 2.9499128920521875e-05, "loss": 0.0758, "step": 1604 }, { "epoch": 0.8447368421052631, "grad_norm": 2.995516300201416, "learning_rate": 2.949849180995332e-05, "loss": 0.8989, "step": 1605 }, { "epoch": 0.8452631578947368, "grad_norm": 3.639786958694458, "learning_rate": 2.9497854301327175e-05, "loss": 1.5573, "step": 1606 }, { "epoch": 0.8457894736842105, "grad_norm": 9.909292221069336, "learning_rate": 2.9497216394660948e-05, "loss": 0.7076, "step": 1607 }, { "epoch": 0.8463157894736842, "grad_norm": 8.76042366027832, "learning_rate": 2.949657808997215e-05, "loss": 1.8939, "step": 1608 }, { "epoch": 0.8468421052631578, "grad_norm": 8.321390151977539, "learning_rate": 2.9495939387278303e-05, "loss": 0.3237, "step": 1609 }, { "epoch": 0.8473684210526315, "grad_norm": 2.2840054035186768, "learning_rate": 2.949530028659695e-05, "loss": 1.3387, "step": 1610 }, { "epoch": 0.8478947368421053, "grad_norm": 1.5071499347686768, "learning_rate": 2.9494660787945634e-05, "loss": 1.1994, "step": 1611 }, { "epoch": 0.848421052631579, "grad_norm": 8.956307411193848, "learning_rate": 2.9494020891341912e-05, "loss": 1.5923, "step": 1612 }, { "epoch": 0.8489473684210527, "grad_norm": 1.6805635690689087, "learning_rate": 2.949338059680335e-05, "loss": 1.6323, "step": 1613 }, { "epoch": 0.8494736842105263, "grad_norm": 0.6971753835678101, "learning_rate": 2.9492739904347533e-05, "loss": 0.0236, "step": 1614 }, { "epoch": 0.85, "grad_norm": 1.6537336111068726, "learning_rate": 2.9492098813992045e-05, "loss": 1.2005, "step": 1615 }, { "epoch": 0.8505263157894737, "grad_norm": 1.5291582345962524, "learning_rate": 2.9491457325754495e-05, "loss": 1.1125, "step": 1616 }, { "epoch": 0.8510526315789474, "grad_norm": 2.4934237003326416, "learning_rate": 2.949081543965249e-05, "loss": 1.3744, "step": 1617 }, { "epoch": 0.8515789473684211, "grad_norm": 2.8750171661376953, "learning_rate": 2.949017315570365e-05, "loss": 1.6325, "step": 1618 }, { "epoch": 0.8521052631578947, "grad_norm": 0.9199529886245728, "learning_rate": 2.9489530473925615e-05, "loss": 1.0891, "step": 1619 }, { "epoch": 0.8526315789473684, "grad_norm": 4.175845146179199, "learning_rate": 2.9488887394336025e-05, "loss": 0.306, "step": 1620 }, { "epoch": 0.8531578947368421, "grad_norm": 2.5200769901275635, "learning_rate": 2.948824391695254e-05, "loss": 0.2277, "step": 1621 }, { "epoch": 0.8536842105263158, "grad_norm": 1.024670124053955, "learning_rate": 2.9487600041792825e-05, "loss": 1.073, "step": 1622 }, { "epoch": 0.8542105263157894, "grad_norm": 3.6856977939605713, "learning_rate": 2.9486955768874555e-05, "loss": 0.9715, "step": 1623 }, { "epoch": 0.8547368421052631, "grad_norm": 2.900958776473999, "learning_rate": 2.9486311098215425e-05, "loss": 1.142, "step": 1624 }, { "epoch": 0.8552631578947368, "grad_norm": 1.7244210243225098, "learning_rate": 2.948566602983313e-05, "loss": 1.1344, "step": 1625 }, { "epoch": 0.8557894736842105, "grad_norm": 5.9757232666015625, "learning_rate": 2.948502056374538e-05, "loss": 0.8719, "step": 1626 }, { "epoch": 0.8563157894736843, "grad_norm": 3.8142871856689453, "learning_rate": 2.94843746999699e-05, "loss": 0.8588, "step": 1627 }, { "epoch": 0.8568421052631578, "grad_norm": 5.312744617462158, "learning_rate": 2.9483728438524417e-05, "loss": 0.9706, "step": 1628 }, { "epoch": 0.8573684210526316, "grad_norm": 0.9603210687637329, "learning_rate": 2.9483081779426678e-05, "loss": 0.8257, "step": 1629 }, { "epoch": 0.8578947368421053, "grad_norm": 12.436172485351562, "learning_rate": 2.9482434722694434e-05, "loss": 1.303, "step": 1630 }, { "epoch": 0.858421052631579, "grad_norm": 2.7977070808410645, "learning_rate": 2.9481787268345456e-05, "loss": 1.4104, "step": 1631 }, { "epoch": 0.8589473684210527, "grad_norm": 1.3737499713897705, "learning_rate": 2.9481139416397512e-05, "loss": 1.7754, "step": 1632 }, { "epoch": 0.8594736842105263, "grad_norm": 4.523530960083008, "learning_rate": 2.9480491166868396e-05, "loss": 1.9328, "step": 1633 }, { "epoch": 0.86, "grad_norm": 14.998103141784668, "learning_rate": 2.9479842519775903e-05, "loss": 1.3404, "step": 1634 }, { "epoch": 0.8605263157894737, "grad_norm": 3.4863827228546143, "learning_rate": 2.9479193475137834e-05, "loss": 1.7372, "step": 1635 }, { "epoch": 0.8610526315789474, "grad_norm": 3.7734429836273193, "learning_rate": 2.9478544032972024e-05, "loss": 2.3958, "step": 1636 }, { "epoch": 0.861578947368421, "grad_norm": 1.744239091873169, "learning_rate": 2.9477894193296295e-05, "loss": 0.9227, "step": 1637 }, { "epoch": 0.8621052631578947, "grad_norm": 2.1991264820098877, "learning_rate": 2.9477243956128484e-05, "loss": 1.5509, "step": 1638 }, { "epoch": 0.8626315789473684, "grad_norm": 13.812848091125488, "learning_rate": 2.9476593321486455e-05, "loss": 1.1571, "step": 1639 }, { "epoch": 0.8631578947368421, "grad_norm": 5.264194488525391, "learning_rate": 2.9475942289388056e-05, "loss": 1.2813, "step": 1640 }, { "epoch": 0.8636842105263158, "grad_norm": 1.047844648361206, "learning_rate": 2.9475290859851173e-05, "loss": 0.9731, "step": 1641 }, { "epoch": 0.8642105263157894, "grad_norm": 6.97605562210083, "learning_rate": 2.9474639032893685e-05, "loss": 1.1533, "step": 1642 }, { "epoch": 0.8647368421052631, "grad_norm": 1.112858772277832, "learning_rate": 2.9473986808533495e-05, "loss": 1.062, "step": 1643 }, { "epoch": 0.8652631578947368, "grad_norm": 1.1823147535324097, "learning_rate": 2.9473334186788503e-05, "loss": 1.3267, "step": 1644 }, { "epoch": 0.8657894736842106, "grad_norm": 1.9988354444503784, "learning_rate": 2.947268116767663e-05, "loss": 0.8981, "step": 1645 }, { "epoch": 0.8663157894736843, "grad_norm": 10.701164245605469, "learning_rate": 2.9472027751215803e-05, "loss": 1.2634, "step": 1646 }, { "epoch": 0.8668421052631579, "grad_norm": 8.62368106842041, "learning_rate": 2.9471373937423963e-05, "loss": 0.6866, "step": 1647 }, { "epoch": 0.8673684210526316, "grad_norm": 7.085811138153076, "learning_rate": 2.947071972631906e-05, "loss": 0.4653, "step": 1648 }, { "epoch": 0.8678947368421053, "grad_norm": 0.7952293753623962, "learning_rate": 2.9470065117919057e-05, "loss": 0.688, "step": 1649 }, { "epoch": 0.868421052631579, "grad_norm": 0.9955103993415833, "learning_rate": 2.9469410112241925e-05, "loss": 0.8748, "step": 1650 }, { "epoch": 0.8689473684210526, "grad_norm": 1.0168907642364502, "learning_rate": 2.9468754709305643e-05, "loss": 1.3661, "step": 1651 }, { "epoch": 0.8694736842105263, "grad_norm": 5.310661315917969, "learning_rate": 2.9468098909128212e-05, "loss": 0.8655, "step": 1652 }, { "epoch": 0.87, "grad_norm": 5.678101539611816, "learning_rate": 2.9467442711727637e-05, "loss": 0.914, "step": 1653 }, { "epoch": 0.8705263157894737, "grad_norm": 7.018301963806152, "learning_rate": 2.9466786117121928e-05, "loss": 1.3955, "step": 1654 }, { "epoch": 0.8710526315789474, "grad_norm": 1.5973879098892212, "learning_rate": 2.9466129125329114e-05, "loss": 1.1749, "step": 1655 }, { "epoch": 0.871578947368421, "grad_norm": 1.043853521347046, "learning_rate": 2.9465471736367234e-05, "loss": 0.0401, "step": 1656 }, { "epoch": 0.8721052631578947, "grad_norm": 1.4394428730010986, "learning_rate": 2.9464813950254336e-05, "loss": 0.0699, "step": 1657 }, { "epoch": 0.8726315789473684, "grad_norm": 1.1438318490982056, "learning_rate": 2.9464155767008485e-05, "loss": 1.5743, "step": 1658 }, { "epoch": 0.8731578947368421, "grad_norm": 7.7294392585754395, "learning_rate": 2.9463497186647747e-05, "loss": 0.961, "step": 1659 }, { "epoch": 0.8736842105263158, "grad_norm": 2.75779390335083, "learning_rate": 2.9462838209190198e-05, "loss": 0.7825, "step": 1660 }, { "epoch": 0.8742105263157894, "grad_norm": 1.5052375793457031, "learning_rate": 2.9462178834653937e-05, "loss": 1.0688, "step": 1661 }, { "epoch": 0.8747368421052631, "grad_norm": 4.883263111114502, "learning_rate": 2.9461519063057064e-05, "loss": 0.4308, "step": 1662 }, { "epoch": 0.8752631578947369, "grad_norm": 2.8982460498809814, "learning_rate": 2.9460858894417694e-05, "loss": 0.4369, "step": 1663 }, { "epoch": 0.8757894736842106, "grad_norm": 38.408145904541016, "learning_rate": 2.9460198328753955e-05, "loss": 2.1035, "step": 1664 }, { "epoch": 0.8763157894736842, "grad_norm": 1.7316091060638428, "learning_rate": 2.9459537366083983e-05, "loss": 0.9198, "step": 1665 }, { "epoch": 0.8768421052631579, "grad_norm": 0.7877925634384155, "learning_rate": 2.945887600642592e-05, "loss": 0.5887, "step": 1666 }, { "epoch": 0.8773684210526316, "grad_norm": 1.3625367879867554, "learning_rate": 2.9458214249797924e-05, "loss": 1.1236, "step": 1667 }, { "epoch": 0.8778947368421053, "grad_norm": 1.776649832725525, "learning_rate": 2.9457552096218168e-05, "loss": 1.2667, "step": 1668 }, { "epoch": 0.878421052631579, "grad_norm": 5.151728630065918, "learning_rate": 2.945688954570483e-05, "loss": 1.5045, "step": 1669 }, { "epoch": 0.8789473684210526, "grad_norm": 1.2965806722640991, "learning_rate": 2.9456226598276097e-05, "loss": 1.1484, "step": 1670 }, { "epoch": 0.8794736842105263, "grad_norm": 1.1582598686218262, "learning_rate": 2.9455563253950176e-05, "loss": 1.1762, "step": 1671 }, { "epoch": 0.88, "grad_norm": 1.36055588722229, "learning_rate": 2.945489951274527e-05, "loss": 1.2704, "step": 1672 }, { "epoch": 0.8805263157894737, "grad_norm": 5.386001110076904, "learning_rate": 2.9454235374679612e-05, "loss": 1.0296, "step": 1673 }, { "epoch": 0.8810526315789474, "grad_norm": 4.373623371124268, "learning_rate": 2.9453570839771432e-05, "loss": 1.3767, "step": 1674 }, { "epoch": 0.881578947368421, "grad_norm": 1.0014320611953735, "learning_rate": 2.9452905908038975e-05, "loss": 1.3597, "step": 1675 }, { "epoch": 0.8821052631578947, "grad_norm": 4.0518412590026855, "learning_rate": 2.9452240579500496e-05, "loss": 0.7262, "step": 1676 }, { "epoch": 0.8826315789473684, "grad_norm": 2.685443878173828, "learning_rate": 2.9451574854174265e-05, "loss": 1.2386, "step": 1677 }, { "epoch": 0.8831578947368421, "grad_norm": 2.57368540763855, "learning_rate": 2.9450908732078553e-05, "loss": 1.0309, "step": 1678 }, { "epoch": 0.8836842105263157, "grad_norm": 1.2405550479888916, "learning_rate": 2.9450242213231654e-05, "loss": 1.1075, "step": 1679 }, { "epoch": 0.8842105263157894, "grad_norm": 1.6623353958129883, "learning_rate": 2.9449575297651865e-05, "loss": 0.9361, "step": 1680 }, { "epoch": 0.8847368421052632, "grad_norm": 14.208858489990234, "learning_rate": 2.9448907985357498e-05, "loss": 2.1596, "step": 1681 }, { "epoch": 0.8852631578947369, "grad_norm": 1.5413142442703247, "learning_rate": 2.944824027636687e-05, "loss": 1.4468, "step": 1682 }, { "epoch": 0.8857894736842106, "grad_norm": 0.9851499795913696, "learning_rate": 2.9447572170698324e-05, "loss": 0.9406, "step": 1683 }, { "epoch": 0.8863157894736842, "grad_norm": 0.8356818556785583, "learning_rate": 2.9446903668370188e-05, "loss": 0.0297, "step": 1684 }, { "epoch": 0.8868421052631579, "grad_norm": 2.6945416927337646, "learning_rate": 2.944623476940082e-05, "loss": 1.2829, "step": 1685 }, { "epoch": 0.8873684210526316, "grad_norm": 33.185028076171875, "learning_rate": 2.9445565473808593e-05, "loss": 0.9756, "step": 1686 }, { "epoch": 0.8878947368421053, "grad_norm": 1.7505784034729004, "learning_rate": 2.9444895781611876e-05, "loss": 1.2576, "step": 1687 }, { "epoch": 0.888421052631579, "grad_norm": 4.850620269775391, "learning_rate": 2.944422569282906e-05, "loss": 0.1032, "step": 1688 }, { "epoch": 0.8889473684210526, "grad_norm": 1.0093870162963867, "learning_rate": 2.9443555207478536e-05, "loss": 0.9847, "step": 1689 }, { "epoch": 0.8894736842105263, "grad_norm": 2.135535955429077, "learning_rate": 2.9442884325578714e-05, "loss": 1.3696, "step": 1690 }, { "epoch": 0.89, "grad_norm": 3.9265260696411133, "learning_rate": 2.9442213047148012e-05, "loss": 0.9913, "step": 1691 }, { "epoch": 0.8905263157894737, "grad_norm": 4.452930927276611, "learning_rate": 2.944154137220487e-05, "loss": 0.7837, "step": 1692 }, { "epoch": 0.8910526315789473, "grad_norm": 2.796504259109497, "learning_rate": 2.944086930076771e-05, "loss": 0.442, "step": 1693 }, { "epoch": 0.891578947368421, "grad_norm": 2.876086950302124, "learning_rate": 2.9440196832855004e-05, "loss": 1.2997, "step": 1694 }, { "epoch": 0.8921052631578947, "grad_norm": 1.2075906991958618, "learning_rate": 2.94395239684852e-05, "loss": 0.8753, "step": 1695 }, { "epoch": 0.8926315789473684, "grad_norm": 7.354285717010498, "learning_rate": 2.9438850707676786e-05, "loss": 0.9609, "step": 1696 }, { "epoch": 0.8931578947368422, "grad_norm": 5.216385364532471, "learning_rate": 2.943817705044823e-05, "loss": 1.3972, "step": 1697 }, { "epoch": 0.8936842105263157, "grad_norm": 1.4113494157791138, "learning_rate": 2.9437502996818035e-05, "loss": 1.4202, "step": 1698 }, { "epoch": 0.8942105263157895, "grad_norm": 6.46826696395874, "learning_rate": 2.9436828546804707e-05, "loss": 0.6637, "step": 1699 }, { "epoch": 0.8947368421052632, "grad_norm": 1.4000974893569946, "learning_rate": 2.943615370042677e-05, "loss": 0.0493, "step": 1700 }, { "epoch": 0.8952631578947369, "grad_norm": 1.1557705402374268, "learning_rate": 2.943547845770274e-05, "loss": 0.4349, "step": 1701 }, { "epoch": 0.8957894736842106, "grad_norm": 8.672079086303711, "learning_rate": 2.943480281865116e-05, "loss": 1.1025, "step": 1702 }, { "epoch": 0.8963157894736842, "grad_norm": 1.1264904737472534, "learning_rate": 2.943412678329058e-05, "loss": 1.0514, "step": 1703 }, { "epoch": 0.8968421052631579, "grad_norm": 0.8493797183036804, "learning_rate": 2.9433450351639567e-05, "loss": 0.7548, "step": 1704 }, { "epoch": 0.8973684210526316, "grad_norm": 3.7704265117645264, "learning_rate": 2.9432773523716683e-05, "loss": 1.2445, "step": 1705 }, { "epoch": 0.8978947368421053, "grad_norm": 1.2486798763275146, "learning_rate": 2.9432096299540518e-05, "loss": 1.1246, "step": 1706 }, { "epoch": 0.8984210526315789, "grad_norm": 1.134177803993225, "learning_rate": 2.9431418679129655e-05, "loss": 0.8451, "step": 1707 }, { "epoch": 0.8989473684210526, "grad_norm": 1.9805290699005127, "learning_rate": 2.9430740662502712e-05, "loss": 0.4611, "step": 1708 }, { "epoch": 0.8994736842105263, "grad_norm": 2.323180675506592, "learning_rate": 2.9430062249678297e-05, "loss": 1.2539, "step": 1709 }, { "epoch": 0.9, "grad_norm": 1.190372347831726, "learning_rate": 2.942938344067503e-05, "loss": 0.992, "step": 1710 }, { "epoch": 0.9005263157894737, "grad_norm": 2.458120822906494, "learning_rate": 2.9428704235511557e-05, "loss": 1.1641, "step": 1711 }, { "epoch": 0.9010526315789473, "grad_norm": 1.1921024322509766, "learning_rate": 2.942802463420652e-05, "loss": 0.9713, "step": 1712 }, { "epoch": 0.901578947368421, "grad_norm": 1.0940788984298706, "learning_rate": 2.942734463677858e-05, "loss": 0.8756, "step": 1713 }, { "epoch": 0.9021052631578947, "grad_norm": 1.3359793424606323, "learning_rate": 2.9426664243246404e-05, "loss": 1.3792, "step": 1714 }, { "epoch": 0.9026315789473685, "grad_norm": 2.6136605739593506, "learning_rate": 2.9425983453628677e-05, "loss": 0.954, "step": 1715 }, { "epoch": 0.9031578947368422, "grad_norm": 1.5081348419189453, "learning_rate": 2.942530226794409e-05, "loss": 1.3498, "step": 1716 }, { "epoch": 0.9036842105263158, "grad_norm": 3.4692232608795166, "learning_rate": 2.942462068621134e-05, "loss": 1.0917, "step": 1717 }, { "epoch": 0.9042105263157895, "grad_norm": 4.044422149658203, "learning_rate": 2.942393870844914e-05, "loss": 1.806, "step": 1718 }, { "epoch": 0.9047368421052632, "grad_norm": 1.875767707824707, "learning_rate": 2.9423256334676215e-05, "loss": 1.1397, "step": 1719 }, { "epoch": 0.9052631578947369, "grad_norm": 2.141014337539673, "learning_rate": 2.9422573564911305e-05, "loss": 1.2411, "step": 1720 }, { "epoch": 0.9057894736842105, "grad_norm": 1.6898494958877563, "learning_rate": 2.9421890399173153e-05, "loss": 1.4801, "step": 1721 }, { "epoch": 0.9063157894736842, "grad_norm": 2.024906635284424, "learning_rate": 2.942120683748051e-05, "loss": 0.317, "step": 1722 }, { "epoch": 0.9068421052631579, "grad_norm": 18.252716064453125, "learning_rate": 2.9420522879852148e-05, "loss": 0.7255, "step": 1723 }, { "epoch": 0.9073684210526316, "grad_norm": 2.799224853515625, "learning_rate": 2.9419838526306845e-05, "loss": 0.7641, "step": 1724 }, { "epoch": 0.9078947368421053, "grad_norm": 1.1127315759658813, "learning_rate": 2.941915377686339e-05, "loss": 1.0762, "step": 1725 }, { "epoch": 0.9084210526315789, "grad_norm": 2.6971869468688965, "learning_rate": 2.9418468631540578e-05, "loss": 1.15, "step": 1726 }, { "epoch": 0.9089473684210526, "grad_norm": 1.2686923742294312, "learning_rate": 2.9417783090357224e-05, "loss": 0.9524, "step": 1727 }, { "epoch": 0.9094736842105263, "grad_norm": 3.2218449115753174, "learning_rate": 2.9417097153332152e-05, "loss": 1.7397, "step": 1728 }, { "epoch": 0.91, "grad_norm": 1.8627907037734985, "learning_rate": 2.941641082048419e-05, "loss": 1.2889, "step": 1729 }, { "epoch": 0.9105263157894737, "grad_norm": 3.6902270317077637, "learning_rate": 2.9415724091832184e-05, "loss": 1.9215, "step": 1730 }, { "epoch": 0.9110526315789473, "grad_norm": 1.4535077810287476, "learning_rate": 2.9415036967394988e-05, "loss": 1.0473, "step": 1731 }, { "epoch": 0.911578947368421, "grad_norm": 1.3268829584121704, "learning_rate": 2.9414349447191466e-05, "loss": 1.0177, "step": 1732 }, { "epoch": 0.9121052631578948, "grad_norm": 1.7916074991226196, "learning_rate": 2.9413661531240493e-05, "loss": 1.0315, "step": 1733 }, { "epoch": 0.9126315789473685, "grad_norm": 1.504307508468628, "learning_rate": 2.941297321956096e-05, "loss": 1.1304, "step": 1734 }, { "epoch": 0.9131578947368421, "grad_norm": 4.602481365203857, "learning_rate": 2.9412284512171756e-05, "loss": 1.52, "step": 1735 }, { "epoch": 0.9136842105263158, "grad_norm": 4.820444107055664, "learning_rate": 2.94115954090918e-05, "loss": 1.8141, "step": 1736 }, { "epoch": 0.9142105263157895, "grad_norm": 1.313570261001587, "learning_rate": 2.9410905910340004e-05, "loss": 0.741, "step": 1737 }, { "epoch": 0.9147368421052632, "grad_norm": 1.3603005409240723, "learning_rate": 2.9410216015935304e-05, "loss": 1.313, "step": 1738 }, { "epoch": 0.9152631578947369, "grad_norm": 4.094880104064941, "learning_rate": 2.9409525725896636e-05, "loss": 1.0756, "step": 1739 }, { "epoch": 0.9157894736842105, "grad_norm": 1.1733410358428955, "learning_rate": 2.9408835040242953e-05, "loss": 0.9995, "step": 1740 }, { "epoch": 0.9163157894736842, "grad_norm": 3.4846761226654053, "learning_rate": 2.9408143958993218e-05, "loss": 0.9578, "step": 1741 }, { "epoch": 0.9168421052631579, "grad_norm": 1.4159212112426758, "learning_rate": 2.9407452482166412e-05, "loss": 1.1337, "step": 1742 }, { "epoch": 0.9173684210526316, "grad_norm": 7.647956371307373, "learning_rate": 2.9406760609781507e-05, "loss": 1.4257, "step": 1743 }, { "epoch": 0.9178947368421052, "grad_norm": 1.0013185739517212, "learning_rate": 2.9406068341857505e-05, "loss": 1.2189, "step": 1744 }, { "epoch": 0.9184210526315789, "grad_norm": 0.9043682813644409, "learning_rate": 2.9405375678413417e-05, "loss": 0.6736, "step": 1745 }, { "epoch": 0.9189473684210526, "grad_norm": 0.8028733134269714, "learning_rate": 2.940468261946825e-05, "loss": 0.9183, "step": 1746 }, { "epoch": 0.9194736842105263, "grad_norm": 11.042693138122559, "learning_rate": 2.9403989165041044e-05, "loss": 1.6583, "step": 1747 }, { "epoch": 0.92, "grad_norm": 3.432318687438965, "learning_rate": 2.940329531515082e-05, "loss": 0.7686, "step": 1748 }, { "epoch": 0.9205263157894736, "grad_norm": 6.241179466247559, "learning_rate": 2.9402601069816645e-05, "loss": 0.654, "step": 1749 }, { "epoch": 0.9210526315789473, "grad_norm": 4.325395584106445, "learning_rate": 2.9401906429057574e-05, "loss": 0.3556, "step": 1750 }, { "epoch": 0.921578947368421, "grad_norm": 1.5290063619613647, "learning_rate": 2.9401211392892677e-05, "loss": 1.601, "step": 1751 }, { "epoch": 0.9221052631578948, "grad_norm": 1.4590153694152832, "learning_rate": 2.9400515961341035e-05, "loss": 1.0342, "step": 1752 }, { "epoch": 0.9226315789473685, "grad_norm": 2.1654052734375, "learning_rate": 2.939982013442174e-05, "loss": 1.6038, "step": 1753 }, { "epoch": 0.9231578947368421, "grad_norm": 11.959127426147461, "learning_rate": 2.9399123912153908e-05, "loss": 0.6332, "step": 1754 }, { "epoch": 0.9236842105263158, "grad_norm": 2.1623222827911377, "learning_rate": 2.939842729455664e-05, "loss": 0.1993, "step": 1755 }, { "epoch": 0.9242105263157895, "grad_norm": 2.1751246452331543, "learning_rate": 2.9397730281649067e-05, "loss": 1.1215, "step": 1756 }, { "epoch": 0.9247368421052632, "grad_norm": 3.4143033027648926, "learning_rate": 2.9397032873450323e-05, "loss": 0.9459, "step": 1757 }, { "epoch": 0.9252631578947368, "grad_norm": 1.3410007953643799, "learning_rate": 2.9396335069979562e-05, "loss": 0.9964, "step": 1758 }, { "epoch": 0.9257894736842105, "grad_norm": 1.164724588394165, "learning_rate": 2.9395636871255933e-05, "loss": 0.3303, "step": 1759 }, { "epoch": 0.9263157894736842, "grad_norm": 1.904927372932434, "learning_rate": 2.9394938277298613e-05, "loss": 0.7966, "step": 1760 }, { "epoch": 0.9268421052631579, "grad_norm": 2.1621692180633545, "learning_rate": 2.9394239288126782e-05, "loss": 1.0655, "step": 1761 }, { "epoch": 0.9273684210526316, "grad_norm": 1.9396727085113525, "learning_rate": 2.939353990375962e-05, "loss": 1.1009, "step": 1762 }, { "epoch": 0.9278947368421052, "grad_norm": 2.1531522274017334, "learning_rate": 2.9392840124216342e-05, "loss": 0.9206, "step": 1763 }, { "epoch": 0.9284210526315789, "grad_norm": 1.473966360092163, "learning_rate": 2.9392139949516154e-05, "loss": 1.0062, "step": 1764 }, { "epoch": 0.9289473684210526, "grad_norm": 5.68750524520874, "learning_rate": 2.939143937967828e-05, "loss": 1.3941, "step": 1765 }, { "epoch": 0.9294736842105263, "grad_norm": 1.1354111433029175, "learning_rate": 2.9390738414721954e-05, "loss": 1.3447, "step": 1766 }, { "epoch": 0.93, "grad_norm": 4.276787757873535, "learning_rate": 2.939003705466642e-05, "loss": 1.534, "step": 1767 }, { "epoch": 0.9305263157894736, "grad_norm": 1.2840847969055176, "learning_rate": 2.938933529953094e-05, "loss": 0.7145, "step": 1768 }, { "epoch": 0.9310526315789474, "grad_norm": 1.7286664247512817, "learning_rate": 2.9388633149334777e-05, "loss": 0.1145, "step": 1769 }, { "epoch": 0.9315789473684211, "grad_norm": 6.341940879821777, "learning_rate": 2.9387930604097205e-05, "loss": 2.5274, "step": 1770 }, { "epoch": 0.9321052631578948, "grad_norm": 1.1878658533096313, "learning_rate": 2.938722766383751e-05, "loss": 0.5424, "step": 1771 }, { "epoch": 0.9326315789473684, "grad_norm": 3.3078019618988037, "learning_rate": 2.9386524328575003e-05, "loss": 0.2485, "step": 1772 }, { "epoch": 0.9331578947368421, "grad_norm": 1.6980602741241455, "learning_rate": 2.938582059832899e-05, "loss": 1.1616, "step": 1773 }, { "epoch": 0.9336842105263158, "grad_norm": 3.3054215908050537, "learning_rate": 2.9385116473118785e-05, "loss": 1.3687, "step": 1774 }, { "epoch": 0.9342105263157895, "grad_norm": 3.2444324493408203, "learning_rate": 2.9384411952963724e-05, "loss": 1.7401, "step": 1775 }, { "epoch": 0.9347368421052632, "grad_norm": 1.1763970851898193, "learning_rate": 2.9383707037883153e-05, "loss": 1.25, "step": 1776 }, { "epoch": 0.9352631578947368, "grad_norm": 1.8481565713882446, "learning_rate": 2.938300172789642e-05, "loss": 0.904, "step": 1777 }, { "epoch": 0.9357894736842105, "grad_norm": 1.5394039154052734, "learning_rate": 2.9382296023022895e-05, "loss": 1.2871, "step": 1778 }, { "epoch": 0.9363157894736842, "grad_norm": 2.9562089443206787, "learning_rate": 2.9381589923281952e-05, "loss": 0.9019, "step": 1779 }, { "epoch": 0.9368421052631579, "grad_norm": 2.452955961227417, "learning_rate": 2.9380883428692972e-05, "loss": 0.6759, "step": 1780 }, { "epoch": 0.9373684210526316, "grad_norm": 5.483946800231934, "learning_rate": 2.9380176539275355e-05, "loss": 1.4204, "step": 1781 }, { "epoch": 0.9378947368421052, "grad_norm": 9.330098152160645, "learning_rate": 2.9379469255048513e-05, "loss": 2.1166, "step": 1782 }, { "epoch": 0.9384210526315789, "grad_norm": 5.44893741607666, "learning_rate": 2.9378761576031858e-05, "loss": 1.667, "step": 1783 }, { "epoch": 0.9389473684210526, "grad_norm": 21.2198543548584, "learning_rate": 2.937805350224482e-05, "loss": 1.3647, "step": 1784 }, { "epoch": 0.9394736842105263, "grad_norm": 1.1891385316848755, "learning_rate": 2.9377345033706843e-05, "loss": 1.1004, "step": 1785 }, { "epoch": 0.94, "grad_norm": 3.092072010040283, "learning_rate": 2.937663617043738e-05, "loss": 0.3973, "step": 1786 }, { "epoch": 0.9405263157894737, "grad_norm": 4.912240505218506, "learning_rate": 2.9375926912455884e-05, "loss": 0.4901, "step": 1787 }, { "epoch": 0.9410526315789474, "grad_norm": 1.5781091451644897, "learning_rate": 2.9375217259781833e-05, "loss": 1.2677, "step": 1788 }, { "epoch": 0.9415789473684211, "grad_norm": 16.19452667236328, "learning_rate": 2.937450721243471e-05, "loss": 1.1125, "step": 1789 }, { "epoch": 0.9421052631578948, "grad_norm": 1.818876028060913, "learning_rate": 2.9373796770434015e-05, "loss": 1.052, "step": 1790 }, { "epoch": 0.9426315789473684, "grad_norm": 1.8112890720367432, "learning_rate": 2.9373085933799242e-05, "loss": 0.8413, "step": 1791 }, { "epoch": 0.9431578947368421, "grad_norm": 0.8109291195869446, "learning_rate": 2.937237470254992e-05, "loss": 0.4724, "step": 1792 }, { "epoch": 0.9436842105263158, "grad_norm": 1.2814196348190308, "learning_rate": 2.9371663076705566e-05, "loss": 1.2765, "step": 1793 }, { "epoch": 0.9442105263157895, "grad_norm": 47.906097412109375, "learning_rate": 2.937095105628572e-05, "loss": 2.2085, "step": 1794 }, { "epoch": 0.9447368421052632, "grad_norm": 1.119391918182373, "learning_rate": 2.937023864130993e-05, "loss": 1.2151, "step": 1795 }, { "epoch": 0.9452631578947368, "grad_norm": 2.7573742866516113, "learning_rate": 2.936952583179776e-05, "loss": 1.4973, "step": 1796 }, { "epoch": 0.9457894736842105, "grad_norm": 1.0621460676193237, "learning_rate": 2.9368812627768777e-05, "loss": 1.7011, "step": 1797 }, { "epoch": 0.9463157894736842, "grad_norm": 1.82633638381958, "learning_rate": 2.9368099029242564e-05, "loss": 0.6421, "step": 1798 }, { "epoch": 0.9468421052631579, "grad_norm": 2.238363742828369, "learning_rate": 2.9367385036238707e-05, "loss": 1.4712, "step": 1799 }, { "epoch": 0.9473684210526315, "grad_norm": 1.1244910955429077, "learning_rate": 2.9366670648776818e-05, "loss": 0.7498, "step": 1800 }, { "epoch": 0.9478947368421052, "grad_norm": 2.550694704055786, "learning_rate": 2.9365955866876503e-05, "loss": 0.8432, "step": 1801 }, { "epoch": 0.9484210526315789, "grad_norm": 0.9619981050491333, "learning_rate": 2.9365240690557387e-05, "loss": 0.9428, "step": 1802 }, { "epoch": 0.9489473684210527, "grad_norm": 3.0906801223754883, "learning_rate": 2.9364525119839107e-05, "loss": 0.254, "step": 1803 }, { "epoch": 0.9494736842105264, "grad_norm": 17.610000610351562, "learning_rate": 2.936380915474131e-05, "loss": 2.4892, "step": 1804 }, { "epoch": 0.95, "grad_norm": 1.0261893272399902, "learning_rate": 2.9363092795283654e-05, "loss": 1.0611, "step": 1805 }, { "epoch": 0.9505263157894737, "grad_norm": 1.3554835319519043, "learning_rate": 2.9362376041485807e-05, "loss": 0.9824, "step": 1806 }, { "epoch": 0.9510526315789474, "grad_norm": 1.1118865013122559, "learning_rate": 2.936165889336744e-05, "loss": 0.9515, "step": 1807 }, { "epoch": 0.9515789473684211, "grad_norm": 9.150864601135254, "learning_rate": 2.936094135094825e-05, "loss": 0.5341, "step": 1808 }, { "epoch": 0.9521052631578948, "grad_norm": 3.7893803119659424, "learning_rate": 2.936022341424794e-05, "loss": 1.7552, "step": 1809 }, { "epoch": 0.9526315789473684, "grad_norm": 1.512068271636963, "learning_rate": 2.9359505083286215e-05, "loss": 1.0817, "step": 1810 }, { "epoch": 0.9531578947368421, "grad_norm": 2.110664129257202, "learning_rate": 2.935878635808279e-05, "loss": 1.1328, "step": 1811 }, { "epoch": 0.9536842105263158, "grad_norm": 6.667463779449463, "learning_rate": 2.9358067238657414e-05, "loss": 1.9689, "step": 1812 }, { "epoch": 0.9542105263157895, "grad_norm": 8.632956504821777, "learning_rate": 2.9357347725029814e-05, "loss": 0.961, "step": 1813 }, { "epoch": 0.9547368421052631, "grad_norm": 1.0220603942871094, "learning_rate": 2.935662781721976e-05, "loss": 0.7937, "step": 1814 }, { "epoch": 0.9552631578947368, "grad_norm": 1.1829009056091309, "learning_rate": 2.9355907515247008e-05, "loss": 0.5874, "step": 1815 }, { "epoch": 0.9557894736842105, "grad_norm": 1.6473325490951538, "learning_rate": 2.9355186819131334e-05, "loss": 1.3057, "step": 1816 }, { "epoch": 0.9563157894736842, "grad_norm": 1.158306360244751, "learning_rate": 2.9354465728892528e-05, "loss": 0.0314, "step": 1817 }, { "epoch": 0.9568421052631579, "grad_norm": 10.786005973815918, "learning_rate": 2.9353744244550382e-05, "loss": 2.2784, "step": 1818 }, { "epoch": 0.9573684210526315, "grad_norm": 7.915785312652588, "learning_rate": 2.935302236612471e-05, "loss": 0.8294, "step": 1819 }, { "epoch": 0.9578947368421052, "grad_norm": 1.1970584392547607, "learning_rate": 2.9352300093635335e-05, "loss": 1.3563, "step": 1820 }, { "epoch": 0.958421052631579, "grad_norm": 1.5558509826660156, "learning_rate": 2.9351577427102075e-05, "loss": 1.0525, "step": 1821 }, { "epoch": 0.9589473684210527, "grad_norm": 0.9184319972991943, "learning_rate": 2.935085436654478e-05, "loss": 0.668, "step": 1822 }, { "epoch": 0.9594736842105264, "grad_norm": 1.1146199703216553, "learning_rate": 2.93501309119833e-05, "loss": 1.2854, "step": 1823 }, { "epoch": 0.96, "grad_norm": 1.9025349617004395, "learning_rate": 2.9349407063437496e-05, "loss": 0.8864, "step": 1824 }, { "epoch": 0.9605263157894737, "grad_norm": 6.680036544799805, "learning_rate": 2.934868282092724e-05, "loss": 1.3019, "step": 1825 }, { "epoch": 0.9610526315789474, "grad_norm": 2.815171241760254, "learning_rate": 2.934795818447242e-05, "loss": 0.3377, "step": 1826 }, { "epoch": 0.9615789473684211, "grad_norm": 1.282981276512146, "learning_rate": 2.934723315409293e-05, "loss": 1.0778, "step": 1827 }, { "epoch": 0.9621052631578947, "grad_norm": 6.144766330718994, "learning_rate": 2.9346507729808676e-05, "loss": 0.1954, "step": 1828 }, { "epoch": 0.9626315789473684, "grad_norm": 1.2909563779830933, "learning_rate": 2.9345781911639576e-05, "loss": 1.141, "step": 1829 }, { "epoch": 0.9631578947368421, "grad_norm": 10.239936828613281, "learning_rate": 2.9345055699605546e-05, "loss": 3.1448, "step": 1830 }, { "epoch": 0.9636842105263158, "grad_norm": 3.629612922668457, "learning_rate": 2.9344329093726542e-05, "loss": 0.5131, "step": 1831 }, { "epoch": 0.9642105263157895, "grad_norm": 1.5074138641357422, "learning_rate": 2.93436020940225e-05, "loss": 1.7405, "step": 1832 }, { "epoch": 0.9647368421052631, "grad_norm": 2.027520179748535, "learning_rate": 2.9342874700513387e-05, "loss": 0.6478, "step": 1833 }, { "epoch": 0.9652631578947368, "grad_norm": 2.1978070735931396, "learning_rate": 2.934214691321917e-05, "loss": 0.7294, "step": 1834 }, { "epoch": 0.9657894736842105, "grad_norm": 2.9730889797210693, "learning_rate": 2.9341418732159826e-05, "loss": 0.7626, "step": 1835 }, { "epoch": 0.9663157894736842, "grad_norm": 1.2914584875106812, "learning_rate": 2.9340690157355358e-05, "loss": 1.3947, "step": 1836 }, { "epoch": 0.966842105263158, "grad_norm": 7.379958629608154, "learning_rate": 2.9339961188825765e-05, "loss": 2.4647, "step": 1837 }, { "epoch": 0.9673684210526315, "grad_norm": 2.926400899887085, "learning_rate": 2.9339231826591057e-05, "loss": 0.5296, "step": 1838 }, { "epoch": 0.9678947368421053, "grad_norm": 1.7127336263656616, "learning_rate": 2.9338502070671258e-05, "loss": 0.8867, "step": 1839 }, { "epoch": 0.968421052631579, "grad_norm": 5.599946975708008, "learning_rate": 2.933777192108641e-05, "loss": 1.0288, "step": 1840 }, { "epoch": 0.9689473684210527, "grad_norm": 1.7110023498535156, "learning_rate": 2.9337041377856562e-05, "loss": 1.5123, "step": 1841 }, { "epoch": 0.9694736842105263, "grad_norm": 6.067218780517578, "learning_rate": 2.9336310441001757e-05, "loss": 1.374, "step": 1842 }, { "epoch": 0.97, "grad_norm": 13.339165687561035, "learning_rate": 2.9335579110542075e-05, "loss": 2.0621, "step": 1843 }, { "epoch": 0.9705263157894737, "grad_norm": 1.5139007568359375, "learning_rate": 2.9334847386497587e-05, "loss": 1.1468, "step": 1844 }, { "epoch": 0.9710526315789474, "grad_norm": 2.751978874206543, "learning_rate": 2.9334115268888392e-05, "loss": 1.4863, "step": 1845 }, { "epoch": 0.9715789473684211, "grad_norm": 0.912716805934906, "learning_rate": 2.933338275773458e-05, "loss": 0.575, "step": 1846 }, { "epoch": 0.9721052631578947, "grad_norm": 6.437097072601318, "learning_rate": 2.933264985305627e-05, "loss": 1.4573, "step": 1847 }, { "epoch": 0.9726315789473684, "grad_norm": 2.406679391860962, "learning_rate": 2.933191655487358e-05, "loss": 1.426, "step": 1848 }, { "epoch": 0.9731578947368421, "grad_norm": 1.209294319152832, "learning_rate": 2.9331182863206647e-05, "loss": 1.1083, "step": 1849 }, { "epoch": 0.9736842105263158, "grad_norm": 1.4055339097976685, "learning_rate": 2.933044877807561e-05, "loss": 1.1797, "step": 1850 }, { "epoch": 0.9742105263157895, "grad_norm": 0.443554550409317, "learning_rate": 2.9329714299500624e-05, "loss": 0.0177, "step": 1851 }, { "epoch": 0.9747368421052631, "grad_norm": 1.1881070137023926, "learning_rate": 2.9328979427501854e-05, "loss": 1.2222, "step": 1852 }, { "epoch": 0.9752631578947368, "grad_norm": 1.7724149227142334, "learning_rate": 2.9328244162099475e-05, "loss": 1.0751, "step": 1853 }, { "epoch": 0.9757894736842105, "grad_norm": 3.7436909675598145, "learning_rate": 2.932750850331368e-05, "loss": 0.5386, "step": 1854 }, { "epoch": 0.9763157894736842, "grad_norm": 1.2736313343048096, "learning_rate": 2.932677245116466e-05, "loss": 1.151, "step": 1855 }, { "epoch": 0.9768421052631578, "grad_norm": 10.034126281738281, "learning_rate": 2.932603600567263e-05, "loss": 0.5635, "step": 1856 }, { "epoch": 0.9773684210526316, "grad_norm": 1.5949655771255493, "learning_rate": 2.9325299166857802e-05, "loss": 1.3225, "step": 1857 }, { "epoch": 0.9778947368421053, "grad_norm": 5.0193376541137695, "learning_rate": 2.9324561934740407e-05, "loss": 0.7483, "step": 1858 }, { "epoch": 0.978421052631579, "grad_norm": 1.0509535074234009, "learning_rate": 2.932382430934069e-05, "loss": 0.9718, "step": 1859 }, { "epoch": 0.9789473684210527, "grad_norm": 2.3995654582977295, "learning_rate": 2.9323086290678897e-05, "loss": 0.7383, "step": 1860 }, { "epoch": 0.9794736842105263, "grad_norm": 1.2402573823928833, "learning_rate": 2.9322347878775294e-05, "loss": 1.2607, "step": 1861 }, { "epoch": 0.98, "grad_norm": 0.9786499738693237, "learning_rate": 2.9321609073650157e-05, "loss": 1.065, "step": 1862 }, { "epoch": 0.9805263157894737, "grad_norm": 6.080226898193359, "learning_rate": 2.9320869875323767e-05, "loss": 0.3911, "step": 1863 }, { "epoch": 0.9810526315789474, "grad_norm": 5.234240531921387, "learning_rate": 2.9320130283816417e-05, "loss": 0.958, "step": 1864 }, { "epoch": 0.9815789473684211, "grad_norm": 48.88657760620117, "learning_rate": 2.9319390299148417e-05, "loss": 2.8281, "step": 1865 }, { "epoch": 0.9821052631578947, "grad_norm": 1.2519406080245972, "learning_rate": 2.9318649921340076e-05, "loss": 0.5246, "step": 1866 }, { "epoch": 0.9826315789473684, "grad_norm": 2.0961368083953857, "learning_rate": 2.931790915041173e-05, "loss": 1.0672, "step": 1867 }, { "epoch": 0.9831578947368421, "grad_norm": 3.2622575759887695, "learning_rate": 2.9317167986383705e-05, "loss": 0.9251, "step": 1868 }, { "epoch": 0.9836842105263158, "grad_norm": 1.264971375465393, "learning_rate": 2.9316426429276366e-05, "loss": 0.2829, "step": 1869 }, { "epoch": 0.9842105263157894, "grad_norm": 16.512882232666016, "learning_rate": 2.9315684479110062e-05, "loss": 1.0379, "step": 1870 }, { "epoch": 0.9847368421052631, "grad_norm": 3.8302979469299316, "learning_rate": 2.931494213590516e-05, "loss": 0.7015, "step": 1871 }, { "epoch": 0.9852631578947368, "grad_norm": 2.289673089981079, "learning_rate": 2.9314199399682053e-05, "loss": 1.1638, "step": 1872 }, { "epoch": 0.9857894736842105, "grad_norm": 1.4307857751846313, "learning_rate": 2.9313456270461123e-05, "loss": 0.0371, "step": 1873 }, { "epoch": 0.9863157894736843, "grad_norm": 4.602456569671631, "learning_rate": 2.9312712748262774e-05, "loss": 2.1234, "step": 1874 }, { "epoch": 0.9868421052631579, "grad_norm": 2.3993990421295166, "learning_rate": 2.9311968833107423e-05, "loss": 0.5891, "step": 1875 }, { "epoch": 0.9873684210526316, "grad_norm": 31.689926147460938, "learning_rate": 2.9311224525015494e-05, "loss": 1.496, "step": 1876 }, { "epoch": 0.9878947368421053, "grad_norm": 2.8490824699401855, "learning_rate": 2.9310479824007416e-05, "loss": 0.3687, "step": 1877 }, { "epoch": 0.988421052631579, "grad_norm": 1.2507219314575195, "learning_rate": 2.930973473010365e-05, "loss": 0.9543, "step": 1878 }, { "epoch": 0.9889473684210527, "grad_norm": 2.8183679580688477, "learning_rate": 2.930898924332463e-05, "loss": 0.1816, "step": 1879 }, { "epoch": 0.9894736842105263, "grad_norm": 1.2869808673858643, "learning_rate": 2.9308243363690844e-05, "loss": 0.5637, "step": 1880 }, { "epoch": 0.99, "grad_norm": 4.482398986816406, "learning_rate": 2.9307497091222753e-05, "loss": 0.2045, "step": 1881 }, { "epoch": 0.9905263157894737, "grad_norm": 4.65263557434082, "learning_rate": 2.930675042594086e-05, "loss": 1.2288, "step": 1882 }, { "epoch": 0.9910526315789474, "grad_norm": 1.563451886177063, "learning_rate": 2.9306003367865662e-05, "loss": 1.599, "step": 1883 }, { "epoch": 0.991578947368421, "grad_norm": 1.2689063549041748, "learning_rate": 2.9305255917017665e-05, "loss": 1.0688, "step": 1884 }, { "epoch": 0.9921052631578947, "grad_norm": 1.954530119895935, "learning_rate": 2.930450807341739e-05, "loss": 0.2562, "step": 1885 }, { "epoch": 0.9926315789473684, "grad_norm": 9.376235008239746, "learning_rate": 2.9303759837085375e-05, "loss": 0.9329, "step": 1886 }, { "epoch": 0.9931578947368421, "grad_norm": 4.2806477546691895, "learning_rate": 2.9303011208042158e-05, "loss": 1.4794, "step": 1887 }, { "epoch": 0.9936842105263158, "grad_norm": 3.1993582248687744, "learning_rate": 2.9302262186308297e-05, "loss": 0.9944, "step": 1888 }, { "epoch": 0.9942105263157894, "grad_norm": 1.6474570035934448, "learning_rate": 2.9301512771904347e-05, "loss": 1.0777, "step": 1889 }, { "epoch": 0.9947368421052631, "grad_norm": 1.2916871309280396, "learning_rate": 2.9300762964850895e-05, "loss": 1.3304, "step": 1890 }, { "epoch": 0.9952631578947368, "grad_norm": 1.1107717752456665, "learning_rate": 2.930001276516852e-05, "loss": 1.1093, "step": 1891 }, { "epoch": 0.9957894736842106, "grad_norm": 1.7554007768630981, "learning_rate": 2.9299262172877817e-05, "loss": 1.1471, "step": 1892 }, { "epoch": 0.9963157894736843, "grad_norm": 4.788026332855225, "learning_rate": 2.9298511187999404e-05, "loss": 0.49, "step": 1893 }, { "epoch": 0.9968421052631579, "grad_norm": 5.148394584655762, "learning_rate": 2.929775981055389e-05, "loss": 0.2753, "step": 1894 }, { "epoch": 0.9973684210526316, "grad_norm": 1.212622046470642, "learning_rate": 2.9297008040561907e-05, "loss": 1.0885, "step": 1895 }, { "epoch": 0.9978947368421053, "grad_norm": 1.1759798526763916, "learning_rate": 2.9296255878044094e-05, "loss": 0.8806, "step": 1896 }, { "epoch": 0.998421052631579, "grad_norm": 1.2078953981399536, "learning_rate": 2.9295503323021103e-05, "loss": 1.0733, "step": 1897 }, { "epoch": 0.9989473684210526, "grad_norm": 5.538965225219727, "learning_rate": 2.9294750375513598e-05, "loss": 2.357, "step": 1898 }, { "epoch": 0.9994736842105263, "grad_norm": 0.9785202741622925, "learning_rate": 2.9293997035542244e-05, "loss": 0.7298, "step": 1899 }, { "epoch": 1.0, "grad_norm": 1.185262680053711, "learning_rate": 2.929324330312773e-05, "loss": 1.5887, "step": 1900 }, { "epoch": 1.0, "eval_loss": 1.0015404224395752, "eval_runtime": 12.876, "eval_samples_per_second": 7.766, "eval_steps_per_second": 7.766, "step": 1900 } ], "logging_steps": 1, "max_steps": 19000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1900, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6932541497704448e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }