diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13373 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 475, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005263157894736842, + "grad_norm": 1.3774428367614746, + "learning_rate": 7.5e-07, + "loss": 0.7461, + "step": 1 + }, + { + "epoch": 0.0005263157894736842, + "eval_loss": 1.3115713596343994, + "eval_runtime": 13.3879, + "eval_samples_per_second": 7.469, + "eval_steps_per_second": 7.469, + "step": 1 + }, + { + "epoch": 0.0010526315789473684, + "grad_norm": 0.40974903106689453, + "learning_rate": 1.5e-06, + "loss": 1.2756, + "step": 2 + }, + { + "epoch": 0.0015789473684210526, + "grad_norm": 0.5177225470542908, + "learning_rate": 2.25e-06, + "loss": 1.7994, + "step": 3 + }, + { + "epoch": 0.002105263157894737, + "grad_norm": 0.5397021174430847, + "learning_rate": 3e-06, + "loss": 1.3129, + "step": 4 + }, + { + "epoch": 0.002631578947368421, + "grad_norm": 0.37644293904304504, + "learning_rate": 3.75e-06, + "loss": 1.2963, + "step": 5 + }, + { + "epoch": 0.003157894736842105, + "grad_norm": 0.6736767292022705, + "learning_rate": 4.5e-06, + "loss": 1.0839, + "step": 6 + }, + { + "epoch": 0.0036842105263157894, + "grad_norm": 0.9168409109115601, + "learning_rate": 5.25e-06, + "loss": 1.2321, + "step": 7 + }, + { + "epoch": 0.004210526315789474, + "grad_norm": 0.5211862325668335, + "learning_rate": 6e-06, + "loss": 1.567, + "step": 8 + }, + { + "epoch": 0.004736842105263158, + "grad_norm": 0.6043412685394287, + "learning_rate": 6.750000000000001e-06, + "loss": 1.236, + "step": 9 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 0.9153957366943359, + "learning_rate": 7.5e-06, + "loss": 1.9274, + "step": 10 + }, + { + "epoch": 0.005789473684210527, + "grad_norm": 0.3442760407924652, + "learning_rate": 8.25e-06, + "loss": 1.7819, + "step": 11 + }, + { + "epoch": 0.00631578947368421, + "grad_norm": 3.2623486518859863, + "learning_rate": 9e-06, + "loss": 3.5503, + "step": 12 + }, + { + "epoch": 0.006842105263157895, + "grad_norm": 0.5294060111045837, + "learning_rate": 9.75e-06, + "loss": 1.0113, + "step": 13 + }, + { + "epoch": 0.007368421052631579, + "grad_norm": 0.7050935626029968, + "learning_rate": 1.05e-05, + "loss": 0.8552, + "step": 14 + }, + { + "epoch": 0.007894736842105263, + "grad_norm": 0.3988778591156006, + "learning_rate": 1.125e-05, + "loss": 1.2478, + "step": 15 + }, + { + "epoch": 0.008421052631578947, + "grad_norm": 1.1330819129943848, + "learning_rate": 1.2e-05, + "loss": 0.5481, + "step": 16 + }, + { + "epoch": 0.008947368421052631, + "grad_norm": 0.3928195834159851, + "learning_rate": 1.275e-05, + "loss": 1.2705, + "step": 17 + }, + { + "epoch": 0.009473684210526316, + "grad_norm": 0.5050056576728821, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.0001, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 0.40563464164733887, + "learning_rate": 1.4249999999999999e-05, + "loss": 1.3443, + "step": 19 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 0.802213728427887, + "learning_rate": 1.5e-05, + "loss": 1.6883, + "step": 20 + }, + { + "epoch": 0.011052631578947368, + "grad_norm": 0.6669230461120605, + "learning_rate": 1.575e-05, + "loss": 1.8126, + "step": 21 + }, + { + "epoch": 0.011578947368421053, + "grad_norm": 0.7659299373626709, + "learning_rate": 1.65e-05, + "loss": 0.892, + "step": 22 + }, + { + "epoch": 0.012105263157894737, + "grad_norm": 0.42965367436408997, + "learning_rate": 1.725e-05, + "loss": 1.5116, + "step": 23 + }, + { + "epoch": 0.01263157894736842, + "grad_norm": 0.7102161049842834, + "learning_rate": 1.8e-05, + "loss": 0.6337, + "step": 24 + }, + { + "epoch": 0.013157894736842105, + "grad_norm": 1.3091274499893188, + "learning_rate": 1.8750000000000002e-05, + "loss": 1.0829, + "step": 25 + }, + { + "epoch": 0.01368421052631579, + "grad_norm": 4.387722969055176, + "learning_rate": 1.95e-05, + "loss": 0.7469, + "step": 26 + }, + { + "epoch": 0.014210526315789474, + "grad_norm": 1.5182483196258545, + "learning_rate": 2.025e-05, + "loss": 0.7235, + "step": 27 + }, + { + "epoch": 0.014736842105263158, + "grad_norm": 4.085071563720703, + "learning_rate": 2.1e-05, + "loss": 2.2212, + "step": 28 + }, + { + "epoch": 0.015263157894736841, + "grad_norm": 0.7777957916259766, + "learning_rate": 2.175e-05, + "loss": 1.114, + "step": 29 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 2.115023136138916, + "learning_rate": 2.25e-05, + "loss": 0.3802, + "step": 30 + }, + { + "epoch": 0.01631578947368421, + "grad_norm": 6.118307113647461, + "learning_rate": 2.3250000000000003e-05, + "loss": 1.2828, + "step": 31 + }, + { + "epoch": 0.016842105263157894, + "grad_norm": 1.7932595014572144, + "learning_rate": 2.4e-05, + "loss": 1.1272, + "step": 32 + }, + { + "epoch": 0.017368421052631578, + "grad_norm": 1.0127062797546387, + "learning_rate": 2.475e-05, + "loss": 1.3309, + "step": 33 + }, + { + "epoch": 0.017894736842105262, + "grad_norm": 0.7217763066291809, + "learning_rate": 2.55e-05, + "loss": 1.2299, + "step": 34 + }, + { + "epoch": 0.018421052631578946, + "grad_norm": 4.896430492401123, + "learning_rate": 2.625e-05, + "loss": 0.8715, + "step": 35 + }, + { + "epoch": 0.018947368421052633, + "grad_norm": 1.978946328163147, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.078, + "step": 36 + }, + { + "epoch": 0.019473684210526317, + "grad_norm": 4.97585916519165, + "learning_rate": 2.7750000000000004e-05, + "loss": 1.4184, + "step": 37 + }, + { + "epoch": 0.02, + "grad_norm": 2.3499672412872314, + "learning_rate": 2.8499999999999998e-05, + "loss": 1.373, + "step": 38 + }, + { + "epoch": 0.020526315789473684, + "grad_norm": 3.067195415496826, + "learning_rate": 2.925e-05, + "loss": 0.4262, + "step": 39 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 11.364575386047363, + "learning_rate": 3e-05, + "loss": 1.8748, + "step": 40 + }, + { + "epoch": 0.02157894736842105, + "grad_norm": 1.4466383457183838, + "learning_rate": 2.999999979408673e-05, + "loss": 1.0143, + "step": 41 + }, + { + "epoch": 0.022105263157894735, + "grad_norm": 1.8457032442092896, + "learning_rate": 2.9999999176346915e-05, + "loss": 1.176, + "step": 42 + }, + { + "epoch": 0.022631578947368423, + "grad_norm": 1.4849117994308472, + "learning_rate": 2.9999998146780576e-05, + "loss": 1.1838, + "step": 43 + }, + { + "epoch": 0.023157894736842106, + "grad_norm": 1.6621780395507812, + "learning_rate": 2.9999996705387744e-05, + "loss": 1.0695, + "step": 44 + }, + { + "epoch": 0.02368421052631579, + "grad_norm": 2.543567180633545, + "learning_rate": 2.9999994852168458e-05, + "loss": 1.2559, + "step": 45 + }, + { + "epoch": 0.024210526315789474, + "grad_norm": 2.46481990814209, + "learning_rate": 2.999999258712277e-05, + "loss": 1.2556, + "step": 46 + }, + { + "epoch": 0.024736842105263158, + "grad_norm": 3.249171733856201, + "learning_rate": 2.999998991025073e-05, + "loss": 0.1304, + "step": 47 + }, + { + "epoch": 0.02526315789473684, + "grad_norm": 8.809581756591797, + "learning_rate": 2.9999986821552427e-05, + "loss": 3.5048, + "step": 48 + }, + { + "epoch": 0.025789473684210525, + "grad_norm": 2.1939406394958496, + "learning_rate": 2.999998332102794e-05, + "loss": 1.024, + "step": 49 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 5.13323450088501, + "learning_rate": 2.9999979408677368e-05, + "loss": 0.7764, + "step": 50 + }, + { + "epoch": 0.026842105263157896, + "grad_norm": 1.4928818941116333, + "learning_rate": 2.999997508450081e-05, + "loss": 1.3143, + "step": 51 + }, + { + "epoch": 0.02736842105263158, + "grad_norm": 5.20947265625, + "learning_rate": 2.999997034849839e-05, + "loss": 2.3539, + "step": 52 + }, + { + "epoch": 0.027894736842105264, + "grad_norm": 1.5518254041671753, + "learning_rate": 2.999996520067024e-05, + "loss": 1.0206, + "step": 53 + }, + { + "epoch": 0.028421052631578948, + "grad_norm": 1.5228031873703003, + "learning_rate": 2.9999959641016498e-05, + "loss": 1.1746, + "step": 54 + }, + { + "epoch": 0.02894736842105263, + "grad_norm": 4.204188823699951, + "learning_rate": 2.999995366953732e-05, + "loss": 1.6748, + "step": 55 + }, + { + "epoch": 0.029473684210526315, + "grad_norm": 12.567036628723145, + "learning_rate": 2.999994728623287e-05, + "loss": 1.6706, + "step": 56 + }, + { + "epoch": 0.03, + "grad_norm": 2.302267074584961, + "learning_rate": 2.9999940491103316e-05, + "loss": 1.2736, + "step": 57 + }, + { + "epoch": 0.030526315789473683, + "grad_norm": 2.4936375617980957, + "learning_rate": 2.999993328414885e-05, + "loss": 1.1823, + "step": 58 + }, + { + "epoch": 0.03105263157894737, + "grad_norm": 2.828848361968994, + "learning_rate": 2.9999925665369675e-05, + "loss": 2.1706, + "step": 59 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 3.864727258682251, + "learning_rate": 2.999991763476599e-05, + "loss": 1.4049, + "step": 60 + }, + { + "epoch": 0.032105263157894734, + "grad_norm": 2.246717691421509, + "learning_rate": 2.9999909192338023e-05, + "loss": 1.7979, + "step": 61 + }, + { + "epoch": 0.03263157894736842, + "grad_norm": 2.686671257019043, + "learning_rate": 2.9999900338086e-05, + "loss": 1.2399, + "step": 62 + }, + { + "epoch": 0.03315789473684211, + "grad_norm": 4.649468898773193, + "learning_rate": 2.9999891072010173e-05, + "loss": 1.6921, + "step": 63 + }, + { + "epoch": 0.03368421052631579, + "grad_norm": 6.839999675750732, + "learning_rate": 2.9999881394110785e-05, + "loss": 1.2485, + "step": 64 + }, + { + "epoch": 0.034210526315789476, + "grad_norm": 2.245439052581787, + "learning_rate": 2.9999871304388115e-05, + "loss": 1.8246, + "step": 65 + }, + { + "epoch": 0.034736842105263156, + "grad_norm": 3.225515127182007, + "learning_rate": 2.9999860802842423e-05, + "loss": 1.4347, + "step": 66 + }, + { + "epoch": 0.035263157894736843, + "grad_norm": 5.565622806549072, + "learning_rate": 2.9999849889474012e-05, + "loss": 1.2189, + "step": 67 + }, + { + "epoch": 0.035789473684210524, + "grad_norm": 10.95042896270752, + "learning_rate": 2.9999838564283172e-05, + "loss": 0.4937, + "step": 68 + }, + { + "epoch": 0.03631578947368421, + "grad_norm": 2.187350034713745, + "learning_rate": 2.9999826827270223e-05, + "loss": 1.4298, + "step": 69 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 2.14497447013855, + "learning_rate": 2.999981467843548e-05, + "loss": 1.1944, + "step": 70 + }, + { + "epoch": 0.03736842105263158, + "grad_norm": 1.1492475271224976, + "learning_rate": 2.9999802117779277e-05, + "loss": 0.6928, + "step": 71 + }, + { + "epoch": 0.037894736842105266, + "grad_norm": 4.218234062194824, + "learning_rate": 2.9999789145301967e-05, + "loss": 0.0767, + "step": 72 + }, + { + "epoch": 0.038421052631578946, + "grad_norm": 7.308555603027344, + "learning_rate": 2.9999775761003895e-05, + "loss": 1.2755, + "step": 73 + }, + { + "epoch": 0.03894736842105263, + "grad_norm": 2.922872543334961, + "learning_rate": 2.9999761964885436e-05, + "loss": 1.1238, + "step": 74 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 2.961275339126587, + "learning_rate": 2.9999747756946967e-05, + "loss": 1.0357, + "step": 75 + }, + { + "epoch": 0.04, + "grad_norm": 12.174813270568848, + "learning_rate": 2.9999733137188872e-05, + "loss": 0.7883, + "step": 76 + }, + { + "epoch": 0.04052631578947368, + "grad_norm": 2.3447883129119873, + "learning_rate": 2.9999718105611564e-05, + "loss": 3.1787, + "step": 77 + }, + { + "epoch": 0.04105263157894737, + "grad_norm": 1.4516093730926514, + "learning_rate": 2.9999702662215446e-05, + "loss": 1.1768, + "step": 78 + }, + { + "epoch": 0.041578947368421056, + "grad_norm": 2.680931568145752, + "learning_rate": 2.9999686807000945e-05, + "loss": 1.6191, + "step": 79 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 2.4612650871276855, + "learning_rate": 2.99996705399685e-05, + "loss": 0.9811, + "step": 80 + }, + { + "epoch": 0.04263157894736842, + "grad_norm": 10.88182258605957, + "learning_rate": 2.999965386111855e-05, + "loss": 1.5144, + "step": 81 + }, + { + "epoch": 0.0431578947368421, + "grad_norm": 1.9869924783706665, + "learning_rate": 2.9999636770451562e-05, + "loss": 0.7623, + "step": 82 + }, + { + "epoch": 0.04368421052631579, + "grad_norm": 25.559877395629883, + "learning_rate": 2.9999619267968e-05, + "loss": 0.4775, + "step": 83 + }, + { + "epoch": 0.04421052631578947, + "grad_norm": 2.2357089519500732, + "learning_rate": 2.9999601353668344e-05, + "loss": 0.9626, + "step": 84 + }, + { + "epoch": 0.04473684210526316, + "grad_norm": 2.5922367572784424, + "learning_rate": 2.9999583027553084e-05, + "loss": 1.161, + "step": 85 + }, + { + "epoch": 0.045263157894736845, + "grad_norm": 3.0082902908325195, + "learning_rate": 2.999956428962273e-05, + "loss": 0.6429, + "step": 86 + }, + { + "epoch": 0.045789473684210526, + "grad_norm": 1.8411225080490112, + "learning_rate": 2.9999545139877787e-05, + "loss": 1.2799, + "step": 87 + }, + { + "epoch": 0.04631578947368421, + "grad_norm": 61.66618347167969, + "learning_rate": 2.999952557831879e-05, + "loss": 3.875, + "step": 88 + }, + { + "epoch": 0.04684210526315789, + "grad_norm": 1.83926522731781, + "learning_rate": 2.9999505604946272e-05, + "loss": 1.1743, + "step": 89 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 3.020488977432251, + "learning_rate": 2.9999485219760786e-05, + "loss": 2.1196, + "step": 90 + }, + { + "epoch": 0.04789473684210526, + "grad_norm": 3.429885149002075, + "learning_rate": 2.999946442276288e-05, + "loss": 2.4292, + "step": 91 + }, + { + "epoch": 0.04842105263157895, + "grad_norm": 2.8225109577178955, + "learning_rate": 2.9999443213953137e-05, + "loss": 1.2811, + "step": 92 + }, + { + "epoch": 0.04894736842105263, + "grad_norm": 3.0931406021118164, + "learning_rate": 2.9999421593332133e-05, + "loss": 0.8613, + "step": 93 + }, + { + "epoch": 0.049473684210526316, + "grad_norm": 1.8076380491256714, + "learning_rate": 2.999939956090046e-05, + "loss": 0.9606, + "step": 94 + }, + { + "epoch": 0.05, + "grad_norm": 4.066877365112305, + "learning_rate": 2.999937711665873e-05, + "loss": 1.0512, + "step": 95 + }, + { + "epoch": 0.05052631578947368, + "grad_norm": 1.641595482826233, + "learning_rate": 2.9999354260607556e-05, + "loss": 0.7993, + "step": 96 + }, + { + "epoch": 0.05105263157894737, + "grad_norm": 9.759380340576172, + "learning_rate": 2.9999330992747566e-05, + "loss": 0.7986, + "step": 97 + }, + { + "epoch": 0.05157894736842105, + "grad_norm": 1.395878791809082, + "learning_rate": 2.999930731307939e-05, + "loss": 1.129, + "step": 98 + }, + { + "epoch": 0.05210526315789474, + "grad_norm": 5.108509540557861, + "learning_rate": 2.9999283221603697e-05, + "loss": 0.0843, + "step": 99 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 4.187808513641357, + "learning_rate": 2.999925871832113e-05, + "loss": 1.9898, + "step": 100 + }, + { + "epoch": 0.053157894736842105, + "grad_norm": 2.7410473823547363, + "learning_rate": 2.9999233803232368e-05, + "loss": 1.4947, + "step": 101 + }, + { + "epoch": 0.05368421052631579, + "grad_norm": 3.0927412509918213, + "learning_rate": 2.99992084763381e-05, + "loss": 0.5582, + "step": 102 + }, + { + "epoch": 0.05421052631578947, + "grad_norm": 28.332054138183594, + "learning_rate": 2.9999182737639015e-05, + "loss": 2.1906, + "step": 103 + }, + { + "epoch": 0.05473684210526316, + "grad_norm": 1.7883048057556152, + "learning_rate": 2.9999156587135824e-05, + "loss": 1.2287, + "step": 104 + }, + { + "epoch": 0.05526315789473684, + "grad_norm": 3.060166597366333, + "learning_rate": 2.999913002482924e-05, + "loss": 1.476, + "step": 105 + }, + { + "epoch": 0.05578947368421053, + "grad_norm": 1.8946037292480469, + "learning_rate": 2.9999103050719998e-05, + "loss": 0.8644, + "step": 106 + }, + { + "epoch": 0.05631578947368421, + "grad_norm": 2.7290570735931396, + "learning_rate": 2.9999075664808832e-05, + "loss": 0.1352, + "step": 107 + }, + { + "epoch": 0.056842105263157895, + "grad_norm": 6.147144794464111, + "learning_rate": 2.9999047867096502e-05, + "loss": 0.2661, + "step": 108 + }, + { + "epoch": 0.057368421052631575, + "grad_norm": 1.951554298400879, + "learning_rate": 2.999901965758377e-05, + "loss": 1.4944, + "step": 109 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 2.1169261932373047, + "learning_rate": 2.99989910362714e-05, + "loss": 1.0947, + "step": 110 + }, + { + "epoch": 0.05842105263157895, + "grad_norm": 1.7119637727737427, + "learning_rate": 2.9998962003160186e-05, + "loss": 1.2013, + "step": 111 + }, + { + "epoch": 0.05894736842105263, + "grad_norm": 1.4529640674591064, + "learning_rate": 2.9998932558250927e-05, + "loss": 1.1557, + "step": 112 + }, + { + "epoch": 0.05947368421052632, + "grad_norm": 2.9316415786743164, + "learning_rate": 2.9998902701544427e-05, + "loss": 1.3037, + "step": 113 + }, + { + "epoch": 0.06, + "grad_norm": 10.239468574523926, + "learning_rate": 2.999887243304151e-05, + "loss": 1.135, + "step": 114 + }, + { + "epoch": 0.060526315789473685, + "grad_norm": 2.1971309185028076, + "learning_rate": 2.9998841752743002e-05, + "loss": 1.227, + "step": 115 + }, + { + "epoch": 0.061052631578947365, + "grad_norm": 4.265197277069092, + "learning_rate": 2.999881066064975e-05, + "loss": 0.4307, + "step": 116 + }, + { + "epoch": 0.06157894736842105, + "grad_norm": 9.106600761413574, + "learning_rate": 2.9998779156762604e-05, + "loss": 1.7782, + "step": 117 + }, + { + "epoch": 0.06210526315789474, + "grad_norm": 5.628964424133301, + "learning_rate": 2.9998747241082433e-05, + "loss": 1.1438, + "step": 118 + }, + { + "epoch": 0.06263157894736843, + "grad_norm": 33.68876647949219, + "learning_rate": 2.9998714913610106e-05, + "loss": 1.8069, + "step": 119 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 2.011239767074585, + "learning_rate": 2.9998682174346518e-05, + "loss": 1.168, + "step": 120 + }, + { + "epoch": 0.06368421052631579, + "grad_norm": 2.534121036529541, + "learning_rate": 2.9998649023292564e-05, + "loss": 1.0316, + "step": 121 + }, + { + "epoch": 0.06421052631578947, + "grad_norm": 7.767317295074463, + "learning_rate": 2.9998615460449155e-05, + "loss": 2.0673, + "step": 122 + }, + { + "epoch": 0.06473684210526316, + "grad_norm": 2.559743642807007, + "learning_rate": 2.9998581485817213e-05, + "loss": 1.1438, + "step": 123 + }, + { + "epoch": 0.06526315789473684, + "grad_norm": 5.9045186042785645, + "learning_rate": 2.9998547099397673e-05, + "loss": 0.527, + "step": 124 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 2.093289375305176, + "learning_rate": 2.9998512301191472e-05, + "loss": 1.4109, + "step": 125 + }, + { + "epoch": 0.06631578947368422, + "grad_norm": 3.366138458251953, + "learning_rate": 2.9998477091199575e-05, + "loss": 1.536, + "step": 126 + }, + { + "epoch": 0.0668421052631579, + "grad_norm": 13.098956108093262, + "learning_rate": 2.9998441469422938e-05, + "loss": 1.4469, + "step": 127 + }, + { + "epoch": 0.06736842105263158, + "grad_norm": 1.7754340171813965, + "learning_rate": 2.999840543586255e-05, + "loss": 1.2916, + "step": 128 + }, + { + "epoch": 0.06789473684210526, + "grad_norm": 2.7748563289642334, + "learning_rate": 2.9998368990519393e-05, + "loss": 1.0124, + "step": 129 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 16.237028121948242, + "learning_rate": 2.9998332133394467e-05, + "loss": 0.7667, + "step": 130 + }, + { + "epoch": 0.06894736842105263, + "grad_norm": 1.7698700428009033, + "learning_rate": 2.9998294864488786e-05, + "loss": 1.7484, + "step": 131 + }, + { + "epoch": 0.06947368421052631, + "grad_norm": 2.1112704277038574, + "learning_rate": 2.9998257183803378e-05, + "loss": 1.3866, + "step": 132 + }, + { + "epoch": 0.07, + "grad_norm": 1.5450336933135986, + "learning_rate": 2.999821909133927e-05, + "loss": 1.1476, + "step": 133 + }, + { + "epoch": 0.07052631578947369, + "grad_norm": 2.787191390991211, + "learning_rate": 2.9998180587097518e-05, + "loss": 0.7392, + "step": 134 + }, + { + "epoch": 0.07105263157894737, + "grad_norm": 7.418491840362549, + "learning_rate": 2.999814167107916e-05, + "loss": 1.0287, + "step": 135 + }, + { + "epoch": 0.07157894736842105, + "grad_norm": 2.0604071617126465, + "learning_rate": 2.9998102343285288e-05, + "loss": 0.9736, + "step": 136 + }, + { + "epoch": 0.07210526315789474, + "grad_norm": 3.1856720447540283, + "learning_rate": 2.9998062603716966e-05, + "loss": 1.1029, + "step": 137 + }, + { + "epoch": 0.07263157894736842, + "grad_norm": 44.8514289855957, + "learning_rate": 2.9998022452375286e-05, + "loss": 2.1055, + "step": 138 + }, + { + "epoch": 0.0731578947368421, + "grad_norm": 2.2738101482391357, + "learning_rate": 2.999798188926136e-05, + "loss": 1.135, + "step": 139 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 4.192337512969971, + "learning_rate": 2.9997940914376287e-05, + "loss": 0.9468, + "step": 140 + }, + { + "epoch": 0.07421052631578948, + "grad_norm": 1.970901370048523, + "learning_rate": 2.9997899527721208e-05, + "loss": 1.0999, + "step": 141 + }, + { + "epoch": 0.07473684210526316, + "grad_norm": 7.665932655334473, + "learning_rate": 2.999785772929725e-05, + "loss": 0.8222, + "step": 142 + }, + { + "epoch": 0.07526315789473684, + "grad_norm": 1.8983104228973389, + "learning_rate": 2.9997815519105562e-05, + "loss": 1.2967, + "step": 143 + }, + { + "epoch": 0.07578947368421053, + "grad_norm": 1.8003042936325073, + "learning_rate": 2.9997772897147302e-05, + "loss": 1.4046, + "step": 144 + }, + { + "epoch": 0.07631578947368421, + "grad_norm": 9.573161125183105, + "learning_rate": 2.999772986342364e-05, + "loss": 0.5591, + "step": 145 + }, + { + "epoch": 0.07684210526315789, + "grad_norm": 4.940485954284668, + "learning_rate": 2.9997686417935764e-05, + "loss": 1.9492, + "step": 146 + }, + { + "epoch": 0.07736842105263157, + "grad_norm": 3.1047379970550537, + "learning_rate": 2.9997642560684854e-05, + "loss": 1.631, + "step": 147 + }, + { + "epoch": 0.07789473684210527, + "grad_norm": 2.6074681282043457, + "learning_rate": 2.999759829167213e-05, + "loss": 1.4144, + "step": 148 + }, + { + "epoch": 0.07842105263157895, + "grad_norm": 6.4516448974609375, + "learning_rate": 2.9997553610898793e-05, + "loss": 0.7626, + "step": 149 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 5.060396671295166, + "learning_rate": 2.999750851836608e-05, + "loss": 1.457, + "step": 150 + }, + { + "epoch": 0.07947368421052632, + "grad_norm": 2.47971510887146, + "learning_rate": 2.9997463014075222e-05, + "loss": 1.3472, + "step": 151 + }, + { + "epoch": 0.08, + "grad_norm": 2.023242473602295, + "learning_rate": 2.999741709802747e-05, + "loss": 1.045, + "step": 152 + }, + { + "epoch": 0.08052631578947368, + "grad_norm": 2.3610341548919678, + "learning_rate": 2.999737077022409e-05, + "loss": 1.0414, + "step": 153 + }, + { + "epoch": 0.08105263157894736, + "grad_norm": 6.400054931640625, + "learning_rate": 2.9997324030666347e-05, + "loss": 2.2875, + "step": 154 + }, + { + "epoch": 0.08157894736842106, + "grad_norm": 3.4902946949005127, + "learning_rate": 2.999727687935553e-05, + "loss": 0.851, + "step": 155 + }, + { + "epoch": 0.08210526315789474, + "grad_norm": 17.07858657836914, + "learning_rate": 2.9997229316292928e-05, + "loss": 1.6888, + "step": 156 + }, + { + "epoch": 0.08263157894736842, + "grad_norm": 0.5993421077728271, + "learning_rate": 2.999718134147985e-05, + "loss": 0.0141, + "step": 157 + }, + { + "epoch": 0.08315789473684211, + "grad_norm": 2.3327324390411377, + "learning_rate": 2.9997132954917615e-05, + "loss": 1.4322, + "step": 158 + }, + { + "epoch": 0.08368421052631579, + "grad_norm": 2.2541940212249756, + "learning_rate": 2.9997084156607543e-05, + "loss": 1.0867, + "step": 159 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 5.866251468658447, + "learning_rate": 2.9997034946550984e-05, + "loss": 1.4221, + "step": 160 + }, + { + "epoch": 0.08473684210526315, + "grad_norm": 4.001148223876953, + "learning_rate": 2.9996985324749288e-05, + "loss": 1.6462, + "step": 161 + }, + { + "epoch": 0.08526315789473685, + "grad_norm": 10.700677871704102, + "learning_rate": 2.9996935291203805e-05, + "loss": 1.456, + "step": 162 + }, + { + "epoch": 0.08578947368421053, + "grad_norm": 1.5370553731918335, + "learning_rate": 2.9996884845915925e-05, + "loss": 0.4811, + "step": 163 + }, + { + "epoch": 0.0863157894736842, + "grad_norm": 4.019623756408691, + "learning_rate": 2.9996833988887026e-05, + "loss": 0.0901, + "step": 164 + }, + { + "epoch": 0.0868421052631579, + "grad_norm": 1.621778130531311, + "learning_rate": 2.9996782720118502e-05, + "loss": 1.005, + "step": 165 + }, + { + "epoch": 0.08736842105263158, + "grad_norm": 2.1922006607055664, + "learning_rate": 2.999673103961176e-05, + "loss": 1.1868, + "step": 166 + }, + { + "epoch": 0.08789473684210526, + "grad_norm": 2.594514846801758, + "learning_rate": 2.999667894736823e-05, + "loss": 1.2468, + "step": 167 + }, + { + "epoch": 0.08842105263157894, + "grad_norm": 1.5204415321350098, + "learning_rate": 2.9996626443389325e-05, + "loss": 1.098, + "step": 168 + }, + { + "epoch": 0.08894736842105264, + "grad_norm": 14.031874656677246, + "learning_rate": 2.9996573527676498e-05, + "loss": 1.7613, + "step": 169 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 2.578813076019287, + "learning_rate": 2.99965202002312e-05, + "loss": 0.8607, + "step": 170 + }, + { + "epoch": 0.09, + "grad_norm": 4.223609447479248, + "learning_rate": 2.9996466461054897e-05, + "loss": 0.3025, + "step": 171 + }, + { + "epoch": 0.09052631578947369, + "grad_norm": 3.067014694213867, + "learning_rate": 2.9996412310149058e-05, + "loss": 1.1488, + "step": 172 + }, + { + "epoch": 0.09105263157894737, + "grad_norm": 1.7577857971191406, + "learning_rate": 2.9996357747515174e-05, + "loss": 1.3616, + "step": 173 + }, + { + "epoch": 0.09157894736842105, + "grad_norm": 4.0033278465271, + "learning_rate": 2.9996302773154742e-05, + "loss": 0.5106, + "step": 174 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 2.1187245845794678, + "learning_rate": 2.999624738706927e-05, + "loss": 1.1571, + "step": 175 + }, + { + "epoch": 0.09263157894736843, + "grad_norm": 10.947097778320312, + "learning_rate": 2.9996191589260284e-05, + "loss": 1.2541, + "step": 176 + }, + { + "epoch": 0.0931578947368421, + "grad_norm": 1.7811756134033203, + "learning_rate": 2.9996135379729314e-05, + "loss": 1.5256, + "step": 177 + }, + { + "epoch": 0.09368421052631579, + "grad_norm": 2.26837420463562, + "learning_rate": 2.9996078758477898e-05, + "loss": 0.9947, + "step": 178 + }, + { + "epoch": 0.09421052631578947, + "grad_norm": 4.844338417053223, + "learning_rate": 2.9996021725507595e-05, + "loss": 1.7111, + "step": 179 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 1.7822426557540894, + "learning_rate": 2.9995964280819967e-05, + "loss": 0.824, + "step": 180 + }, + { + "epoch": 0.09526315789473684, + "grad_norm": 2.7125401496887207, + "learning_rate": 2.99959064244166e-05, + "loss": 1.2433, + "step": 181 + }, + { + "epoch": 0.09578947368421052, + "grad_norm": 1.5130306482315063, + "learning_rate": 2.9995848156299076e-05, + "loss": 1.2715, + "step": 182 + }, + { + "epoch": 0.09631578947368422, + "grad_norm": 8.074607849121094, + "learning_rate": 2.999578947646899e-05, + "loss": 1.9851, + "step": 183 + }, + { + "epoch": 0.0968421052631579, + "grad_norm": 5.693285942077637, + "learning_rate": 2.999573038492796e-05, + "loss": 1.5789, + "step": 184 + }, + { + "epoch": 0.09736842105263158, + "grad_norm": 1.201459288597107, + "learning_rate": 2.9995670881677607e-05, + "loss": 1.1029, + "step": 185 + }, + { + "epoch": 0.09789473684210526, + "grad_norm": 2.4890646934509277, + "learning_rate": 2.9995610966719565e-05, + "loss": 1.1719, + "step": 186 + }, + { + "epoch": 0.09842105263157895, + "grad_norm": 6.686047077178955, + "learning_rate": 2.999555064005548e-05, + "loss": 1.8928, + "step": 187 + }, + { + "epoch": 0.09894736842105263, + "grad_norm": 1.3713222742080688, + "learning_rate": 2.9995489901687004e-05, + "loss": 0.7986, + "step": 188 + }, + { + "epoch": 0.09947368421052631, + "grad_norm": 1.3539314270019531, + "learning_rate": 2.999542875161581e-05, + "loss": 1.0582, + "step": 189 + }, + { + "epoch": 0.1, + "grad_norm": 7.031521797180176, + "learning_rate": 2.999536718984357e-05, + "loss": 1.2458, + "step": 190 + }, + { + "epoch": 0.10052631578947369, + "grad_norm": 2.317741870880127, + "learning_rate": 2.9995305216371986e-05, + "loss": 1.7422, + "step": 191 + }, + { + "epoch": 0.10105263157894737, + "grad_norm": 5.038298606872559, + "learning_rate": 2.9995242831202744e-05, + "loss": 1.2954, + "step": 192 + }, + { + "epoch": 0.10157894736842105, + "grad_norm": 16.174734115600586, + "learning_rate": 2.9995180034337566e-05, + "loss": 1.0241, + "step": 193 + }, + { + "epoch": 0.10210526315789474, + "grad_norm": 9.38874626159668, + "learning_rate": 2.9995116825778177e-05, + "loss": 1.0844, + "step": 194 + }, + { + "epoch": 0.10263157894736842, + "grad_norm": 2.1543664932250977, + "learning_rate": 2.9995053205526305e-05, + "loss": 1.1873, + "step": 195 + }, + { + "epoch": 0.1031578947368421, + "grad_norm": 3.171243906021118, + "learning_rate": 2.9994989173583712e-05, + "loss": 0.5879, + "step": 196 + }, + { + "epoch": 0.1036842105263158, + "grad_norm": 5.367387294769287, + "learning_rate": 2.9994924729952135e-05, + "loss": 1.1287, + "step": 197 + }, + { + "epoch": 0.10421052631578948, + "grad_norm": 6.392814636230469, + "learning_rate": 2.9994859874633358e-05, + "loss": 0.2617, + "step": 198 + }, + { + "epoch": 0.10473684210526316, + "grad_norm": 8.996861457824707, + "learning_rate": 2.999479460762916e-05, + "loss": 0.5161, + "step": 199 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 2.239112138748169, + "learning_rate": 2.9994728928941327e-05, + "loss": 1.4558, + "step": 200 + }, + { + "epoch": 0.10578947368421053, + "grad_norm": 1.4390355348587036, + "learning_rate": 2.9994662838571673e-05, + "loss": 1.2274, + "step": 201 + }, + { + "epoch": 0.10631578947368421, + "grad_norm": 20.712223052978516, + "learning_rate": 2.9994596336521997e-05, + "loss": 1.6812, + "step": 202 + }, + { + "epoch": 0.10684210526315789, + "grad_norm": 3.6506576538085938, + "learning_rate": 2.9994529422794136e-05, + "loss": 1.7615, + "step": 203 + }, + { + "epoch": 0.10736842105263159, + "grad_norm": 4.244356155395508, + "learning_rate": 2.9994462097389922e-05, + "loss": 2.1721, + "step": 204 + }, + { + "epoch": 0.10789473684210527, + "grad_norm": 5.399370193481445, + "learning_rate": 2.999439436031121e-05, + "loss": 1.2347, + "step": 205 + }, + { + "epoch": 0.10842105263157895, + "grad_norm": 13.014787673950195, + "learning_rate": 2.999432621155985e-05, + "loss": 1.0546, + "step": 206 + }, + { + "epoch": 0.10894736842105263, + "grad_norm": 6.201412200927734, + "learning_rate": 2.9994257651137727e-05, + "loss": 1.6362, + "step": 207 + }, + { + "epoch": 0.10947368421052632, + "grad_norm": 8.548736572265625, + "learning_rate": 2.999418867904671e-05, + "loss": 0.4087, + "step": 208 + }, + { + "epoch": 0.11, + "grad_norm": 1.0881563425064087, + "learning_rate": 2.9994119295288696e-05, + "loss": 0.0171, + "step": 209 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 5.160693168640137, + "learning_rate": 2.999404949986559e-05, + "loss": 0.7526, + "step": 210 + }, + { + "epoch": 0.11105263157894738, + "grad_norm": 3.1135430335998535, + "learning_rate": 2.999397929277931e-05, + "loss": 1.3041, + "step": 211 + }, + { + "epoch": 0.11157894736842106, + "grad_norm": 3.0570948123931885, + "learning_rate": 2.9993908674031787e-05, + "loss": 1.0342, + "step": 212 + }, + { + "epoch": 0.11210526315789474, + "grad_norm": 2.296029806137085, + "learning_rate": 2.9993837643624953e-05, + "loss": 1.2628, + "step": 213 + }, + { + "epoch": 0.11263157894736842, + "grad_norm": 34.04128646850586, + "learning_rate": 2.9993766201560764e-05, + "loss": 2.0594, + "step": 214 + }, + { + "epoch": 0.11315789473684211, + "grad_norm": 4.997979164123535, + "learning_rate": 2.999369434784118e-05, + "loss": 0.1955, + "step": 215 + }, + { + "epoch": 0.11368421052631579, + "grad_norm": 1.8177436590194702, + "learning_rate": 2.9993622082468165e-05, + "loss": 1.3383, + "step": 216 + }, + { + "epoch": 0.11421052631578947, + "grad_norm": 4.622316360473633, + "learning_rate": 2.9993549405443715e-05, + "loss": 0.4904, + "step": 217 + }, + { + "epoch": 0.11473684210526315, + "grad_norm": 1.736038327217102, + "learning_rate": 2.9993476316769822e-05, + "loss": 1.1311, + "step": 218 + }, + { + "epoch": 0.11526315789473685, + "grad_norm": 1.4792280197143555, + "learning_rate": 2.9993402816448487e-05, + "loss": 1.6061, + "step": 219 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 20.820289611816406, + "learning_rate": 2.9993328904481737e-05, + "loss": 3.1151, + "step": 220 + }, + { + "epoch": 0.1163157894736842, + "grad_norm": 3.1863715648651123, + "learning_rate": 2.999325458087159e-05, + "loss": 0.2512, + "step": 221 + }, + { + "epoch": 0.1168421052631579, + "grad_norm": 7.053295135498047, + "learning_rate": 2.99931798456201e-05, + "loss": 2.0884, + "step": 222 + }, + { + "epoch": 0.11736842105263158, + "grad_norm": 2.6936466693878174, + "learning_rate": 2.999310469872931e-05, + "loss": 1.0639, + "step": 223 + }, + { + "epoch": 0.11789473684210526, + "grad_norm": 6.108619213104248, + "learning_rate": 2.9993029140201288e-05, + "loss": 1.5187, + "step": 224 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 1.4771653413772583, + "learning_rate": 2.99929531700381e-05, + "loss": 0.8693, + "step": 225 + }, + { + "epoch": 0.11894736842105263, + "grad_norm": 5.0803632736206055, + "learning_rate": 2.999287678824184e-05, + "loss": 1.6351, + "step": 226 + }, + { + "epoch": 0.11947368421052632, + "grad_norm": 2.738546848297119, + "learning_rate": 2.9992799994814602e-05, + "loss": 0.076, + "step": 227 + }, + { + "epoch": 0.12, + "grad_norm": 1.8112925291061401, + "learning_rate": 2.9992722789758496e-05, + "loss": 1.2248, + "step": 228 + }, + { + "epoch": 0.12052631578947369, + "grad_norm": 2.918973684310913, + "learning_rate": 2.999264517307564e-05, + "loss": 1.0747, + "step": 229 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 26.450199127197266, + "learning_rate": 2.9992567144768167e-05, + "loss": 1.0269, + "step": 230 + }, + { + "epoch": 0.12157894736842105, + "grad_norm": 6.718221187591553, + "learning_rate": 2.9992488704838215e-05, + "loss": 1.3762, + "step": 231 + }, + { + "epoch": 0.12210526315789473, + "grad_norm": 12.591816902160645, + "learning_rate": 2.9992409853287942e-05, + "loss": 2.1439, + "step": 232 + }, + { + "epoch": 0.12263157894736842, + "grad_norm": 3.2106196880340576, + "learning_rate": 2.9992330590119516e-05, + "loss": 1.549, + "step": 233 + }, + { + "epoch": 0.1231578947368421, + "grad_norm": 3.1923153400421143, + "learning_rate": 2.9992250915335096e-05, + "loss": 0.052, + "step": 234 + }, + { + "epoch": 0.12368421052631579, + "grad_norm": 2.389961004257202, + "learning_rate": 2.999217082893689e-05, + "loss": 1.6529, + "step": 235 + }, + { + "epoch": 0.12421052631578948, + "grad_norm": 7.870512008666992, + "learning_rate": 2.9992090330927092e-05, + "loss": 0.9921, + "step": 236 + }, + { + "epoch": 0.12473684210526316, + "grad_norm": 7.302090167999268, + "learning_rate": 2.9992009421307904e-05, + "loss": 2.3439, + "step": 237 + }, + { + "epoch": 0.12526315789473685, + "grad_norm": 4.087291240692139, + "learning_rate": 2.9991928100081557e-05, + "loss": 0.301, + "step": 238 + }, + { + "epoch": 0.12578947368421053, + "grad_norm": 1.479772925376892, + "learning_rate": 2.9991846367250273e-05, + "loss": 1.1892, + "step": 239 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 2.7664432525634766, + "learning_rate": 2.9991764222816304e-05, + "loss": 0.0283, + "step": 240 + }, + { + "epoch": 0.1268421052631579, + "grad_norm": 8.476157188415527, + "learning_rate": 2.9991681666781904e-05, + "loss": 1.4744, + "step": 241 + }, + { + "epoch": 0.12736842105263158, + "grad_norm": 6.929798126220703, + "learning_rate": 2.9991598699149337e-05, + "loss": 0.122, + "step": 242 + }, + { + "epoch": 0.12789473684210526, + "grad_norm": 11.882857322692871, + "learning_rate": 2.9991515319920885e-05, + "loss": 2.2842, + "step": 243 + }, + { + "epoch": 0.12842105263157894, + "grad_norm": 3.38356351852417, + "learning_rate": 2.999143152909883e-05, + "loss": 0.5131, + "step": 244 + }, + { + "epoch": 0.12894736842105264, + "grad_norm": 2.857849359512329, + "learning_rate": 2.999134732668548e-05, + "loss": 1.4122, + "step": 245 + }, + { + "epoch": 0.12947368421052632, + "grad_norm": 5.270646572113037, + "learning_rate": 2.9991262712683142e-05, + "loss": 0.0852, + "step": 246 + }, + { + "epoch": 0.13, + "grad_norm": 7.410548210144043, + "learning_rate": 2.9991177687094145e-05, + "loss": 0.3834, + "step": 247 + }, + { + "epoch": 0.13052631578947368, + "grad_norm": 9.727782249450684, + "learning_rate": 2.9991092249920818e-05, + "loss": 1.3174, + "step": 248 + }, + { + "epoch": 0.13105263157894737, + "grad_norm": 2.9647574424743652, + "learning_rate": 2.9991006401165505e-05, + "loss": 1.0558, + "step": 249 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 6.312364101409912, + "learning_rate": 2.999092014083057e-05, + "loss": 1.5202, + "step": 250 + }, + { + "epoch": 0.13210526315789473, + "grad_norm": 6.21821928024292, + "learning_rate": 2.9990833468918376e-05, + "loss": 1.3892, + "step": 251 + }, + { + "epoch": 0.13263157894736843, + "grad_norm": 1.7169901132583618, + "learning_rate": 2.999074638543131e-05, + "loss": 1.3552, + "step": 252 + }, + { + "epoch": 0.13315789473684211, + "grad_norm": 1.8053324222564697, + "learning_rate": 2.9990658890371753e-05, + "loss": 1.3176, + "step": 253 + }, + { + "epoch": 0.1336842105263158, + "grad_norm": 2.0673868656158447, + "learning_rate": 2.9990570983742105e-05, + "loss": 1.1511, + "step": 254 + }, + { + "epoch": 0.13421052631578947, + "grad_norm": 13.948058128356934, + "learning_rate": 2.9990482665544792e-05, + "loss": 1.8145, + "step": 255 + }, + { + "epoch": 0.13473684210526315, + "grad_norm": 14.18269157409668, + "learning_rate": 2.999039393578223e-05, + "loss": 2.3278, + "step": 256 + }, + { + "epoch": 0.13526315789473684, + "grad_norm": 6.532087802886963, + "learning_rate": 2.9990304794456857e-05, + "loss": 1.6241, + "step": 257 + }, + { + "epoch": 0.13578947368421052, + "grad_norm": 5.4354987144470215, + "learning_rate": 2.9990215241571124e-05, + "loss": 1.094, + "step": 258 + }, + { + "epoch": 0.13631578947368422, + "grad_norm": 4.302351474761963, + "learning_rate": 2.9990125277127487e-05, + "loss": 0.8513, + "step": 259 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 5.297351837158203, + "learning_rate": 2.9990034901128414e-05, + "loss": 2.1124, + "step": 260 + }, + { + "epoch": 0.13736842105263158, + "grad_norm": 1.7843605279922485, + "learning_rate": 2.9989944113576387e-05, + "loss": 0.9198, + "step": 261 + }, + { + "epoch": 0.13789473684210526, + "grad_norm": 3.7895467281341553, + "learning_rate": 2.9989852914473898e-05, + "loss": 1.4874, + "step": 262 + }, + { + "epoch": 0.13842105263157894, + "grad_norm": 12.728391647338867, + "learning_rate": 2.9989761303823453e-05, + "loss": 1.3333, + "step": 263 + }, + { + "epoch": 0.13894736842105262, + "grad_norm": 4.476593017578125, + "learning_rate": 2.9989669281627566e-05, + "loss": 0.95, + "step": 264 + }, + { + "epoch": 0.1394736842105263, + "grad_norm": 2.2026288509368896, + "learning_rate": 2.998957684788877e-05, + "loss": 0.0549, + "step": 265 + }, + { + "epoch": 0.14, + "grad_norm": 2.320668935775757, + "learning_rate": 2.998948400260959e-05, + "loss": 1.6905, + "step": 266 + }, + { + "epoch": 0.1405263157894737, + "grad_norm": 7.45945930480957, + "learning_rate": 2.9989390745792585e-05, + "loss": 0.9003, + "step": 267 + }, + { + "epoch": 0.14105263157894737, + "grad_norm": 1.9993150234222412, + "learning_rate": 2.998929707744031e-05, + "loss": 1.774, + "step": 268 + }, + { + "epoch": 0.14157894736842105, + "grad_norm": 2.8484346866607666, + "learning_rate": 2.998920299755534e-05, + "loss": 0.6502, + "step": 269 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 1.472853183746338, + "learning_rate": 2.9989108506140254e-05, + "loss": 1.0543, + "step": 270 + }, + { + "epoch": 0.14263157894736841, + "grad_norm": 2.5155389308929443, + "learning_rate": 2.9989013603197653e-05, + "loss": 1.6436, + "step": 271 + }, + { + "epoch": 0.1431578947368421, + "grad_norm": 3.893866777420044, + "learning_rate": 2.998891828873014e-05, + "loss": 0.9275, + "step": 272 + }, + { + "epoch": 0.1436842105263158, + "grad_norm": 5.068791389465332, + "learning_rate": 2.9988822562740325e-05, + "loss": 1.3801, + "step": 273 + }, + { + "epoch": 0.14421052631578948, + "grad_norm": 3.420543909072876, + "learning_rate": 2.9988726425230842e-05, + "loss": 1.4276, + "step": 274 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 3.8543524742126465, + "learning_rate": 2.9988629876204337e-05, + "loss": 0.1829, + "step": 275 + }, + { + "epoch": 0.14526315789473684, + "grad_norm": 2.018007755279541, + "learning_rate": 2.9988532915663446e-05, + "loss": 0.9781, + "step": 276 + }, + { + "epoch": 0.14578947368421052, + "grad_norm": 7.19284200668335, + "learning_rate": 2.9988435543610846e-05, + "loss": 1.5077, + "step": 277 + }, + { + "epoch": 0.1463157894736842, + "grad_norm": 4.468955039978027, + "learning_rate": 2.99883377600492e-05, + "loss": 1.7061, + "step": 278 + }, + { + "epoch": 0.14684210526315788, + "grad_norm": 2.5749592781066895, + "learning_rate": 2.9988239564981193e-05, + "loss": 1.1039, + "step": 279 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 1.5888959169387817, + "learning_rate": 2.9988140958409528e-05, + "loss": 1.132, + "step": 280 + }, + { + "epoch": 0.14789473684210527, + "grad_norm": 1.6728452444076538, + "learning_rate": 2.9988041940336906e-05, + "loss": 1.239, + "step": 281 + }, + { + "epoch": 0.14842105263157895, + "grad_norm": 1.9604003429412842, + "learning_rate": 2.9987942510766047e-05, + "loss": 1.2393, + "step": 282 + }, + { + "epoch": 0.14894736842105263, + "grad_norm": 12.022218704223633, + "learning_rate": 2.9987842669699687e-05, + "loss": 0.808, + "step": 283 + }, + { + "epoch": 0.14947368421052631, + "grad_norm": 1.27422034740448, + "learning_rate": 2.998774241714056e-05, + "loss": 1.1194, + "step": 284 + }, + { + "epoch": 0.15, + "grad_norm": 1.4916560649871826, + "learning_rate": 2.9987641753091416e-05, + "loss": 0.0263, + "step": 285 + }, + { + "epoch": 0.15052631578947367, + "grad_norm": 6.565960884094238, + "learning_rate": 2.9987540677555027e-05, + "loss": 1.2911, + "step": 286 + }, + { + "epoch": 0.15105263157894736, + "grad_norm": 1.564158320426941, + "learning_rate": 2.9987439190534163e-05, + "loss": 0.6841, + "step": 287 + }, + { + "epoch": 0.15157894736842106, + "grad_norm": 2.0669963359832764, + "learning_rate": 2.998733729203161e-05, + "loss": 0.9868, + "step": 288 + }, + { + "epoch": 0.15210526315789474, + "grad_norm": 7.10345983505249, + "learning_rate": 2.9987234982050168e-05, + "loss": 0.6493, + "step": 289 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 2.2110228538513184, + "learning_rate": 2.9987132260592645e-05, + "loss": 1.225, + "step": 290 + }, + { + "epoch": 0.1531578947368421, + "grad_norm": 2.3035542964935303, + "learning_rate": 2.9987029127661857e-05, + "loss": 1.3038, + "step": 291 + }, + { + "epoch": 0.15368421052631578, + "grad_norm": 2.7143070697784424, + "learning_rate": 2.9986925583260644e-05, + "loss": 1.0185, + "step": 292 + }, + { + "epoch": 0.15421052631578946, + "grad_norm": 1.633699655532837, + "learning_rate": 2.9986821627391845e-05, + "loss": 1.1361, + "step": 293 + }, + { + "epoch": 0.15473684210526314, + "grad_norm": 2.975994825363159, + "learning_rate": 2.9986717260058314e-05, + "loss": 1.8912, + "step": 294 + }, + { + "epoch": 0.15526315789473685, + "grad_norm": 2.5356616973876953, + "learning_rate": 2.9986612481262916e-05, + "loss": 1.2134, + "step": 295 + }, + { + "epoch": 0.15578947368421053, + "grad_norm": 1.880893588066101, + "learning_rate": 2.9986507291008524e-05, + "loss": 0.0698, + "step": 296 + }, + { + "epoch": 0.1563157894736842, + "grad_norm": 8.554359436035156, + "learning_rate": 2.9986401689298032e-05, + "loss": 1.3118, + "step": 297 + }, + { + "epoch": 0.1568421052631579, + "grad_norm": 1.8531193733215332, + "learning_rate": 2.9986295676134337e-05, + "loss": 1.1979, + "step": 298 + }, + { + "epoch": 0.15736842105263157, + "grad_norm": 44.23398971557617, + "learning_rate": 2.998618925152035e-05, + "loss": 2.0334, + "step": 299 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 4.069817066192627, + "learning_rate": 2.9986082415458993e-05, + "loss": 1.2089, + "step": 300 + }, + { + "epoch": 0.15842105263157893, + "grad_norm": 1.7818050384521484, + "learning_rate": 2.9985975167953198e-05, + "loss": 1.0545, + "step": 301 + }, + { + "epoch": 0.15894736842105264, + "grad_norm": 2.3045952320098877, + "learning_rate": 2.9985867509005906e-05, + "loss": 1.1889, + "step": 302 + }, + { + "epoch": 0.15947368421052632, + "grad_norm": 4.53651237487793, + "learning_rate": 2.9985759438620082e-05, + "loss": 0.78, + "step": 303 + }, + { + "epoch": 0.16, + "grad_norm": 2.4460010528564453, + "learning_rate": 2.9985650956798686e-05, + "loss": 1.3093, + "step": 304 + }, + { + "epoch": 0.16052631578947368, + "grad_norm": 4.067452907562256, + "learning_rate": 2.99855420635447e-05, + "loss": 0.7694, + "step": 305 + }, + { + "epoch": 0.16105263157894736, + "grad_norm": 1.7187111377716064, + "learning_rate": 2.9985432758861114e-05, + "loss": 1.1114, + "step": 306 + }, + { + "epoch": 0.16157894736842104, + "grad_norm": 5.563487529754639, + "learning_rate": 2.9985323042750924e-05, + "loss": 0.2845, + "step": 307 + }, + { + "epoch": 0.16210526315789472, + "grad_norm": 13.657363891601562, + "learning_rate": 2.9985212915217146e-05, + "loss": 0.7273, + "step": 308 + }, + { + "epoch": 0.16263157894736843, + "grad_norm": 44.487571716308594, + "learning_rate": 2.9985102376262803e-05, + "loss": 1.0723, + "step": 309 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 3.729584217071533, + "learning_rate": 2.998499142589093e-05, + "loss": 1.0203, + "step": 310 + }, + { + "epoch": 0.1636842105263158, + "grad_norm": 3.7574782371520996, + "learning_rate": 2.9984880064104575e-05, + "loss": 1.1488, + "step": 311 + }, + { + "epoch": 0.16421052631578947, + "grad_norm": 10.508902549743652, + "learning_rate": 2.998476829090679e-05, + "loss": 0.3696, + "step": 312 + }, + { + "epoch": 0.16473684210526315, + "grad_norm": 7.688244342803955, + "learning_rate": 2.9984656106300647e-05, + "loss": 1.8374, + "step": 313 + }, + { + "epoch": 0.16526315789473683, + "grad_norm": 2.0016376972198486, + "learning_rate": 2.9984543510289227e-05, + "loss": 0.8432, + "step": 314 + }, + { + "epoch": 0.16578947368421051, + "grad_norm": 5.203074932098389, + "learning_rate": 2.998443050287562e-05, + "loss": 1.2669, + "step": 315 + }, + { + "epoch": 0.16631578947368422, + "grad_norm": 1.451202630996704, + "learning_rate": 2.9984317084062928e-05, + "loss": 1.0893, + "step": 316 + }, + { + "epoch": 0.1668421052631579, + "grad_norm": 1.7263648509979248, + "learning_rate": 2.998420325385427e-05, + "loss": 1.219, + "step": 317 + }, + { + "epoch": 0.16736842105263158, + "grad_norm": 8.742837905883789, + "learning_rate": 2.9984089012252765e-05, + "loss": 0.3662, + "step": 318 + }, + { + "epoch": 0.16789473684210526, + "grad_norm": 4.271378040313721, + "learning_rate": 2.9983974359261556e-05, + "loss": 0.9902, + "step": 319 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 1.9590401649475098, + "learning_rate": 2.998385929488378e-05, + "loss": 1.0124, + "step": 320 + }, + { + "epoch": 0.16894736842105262, + "grad_norm": 1.6438995599746704, + "learning_rate": 2.998374381912261e-05, + "loss": 1.0227, + "step": 321 + }, + { + "epoch": 0.1694736842105263, + "grad_norm": 8.173205375671387, + "learning_rate": 2.99836279319812e-05, + "loss": 0.3485, + "step": 322 + }, + { + "epoch": 0.17, + "grad_norm": 3.0618059635162354, + "learning_rate": 2.9983511633462746e-05, + "loss": 1.213, + "step": 323 + }, + { + "epoch": 0.1705263157894737, + "grad_norm": 2.2398312091827393, + "learning_rate": 2.9983394923570436e-05, + "loss": 0.7417, + "step": 324 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 2.8603358268737793, + "learning_rate": 2.9983277802307475e-05, + "loss": 0.7479, + "step": 325 + }, + { + "epoch": 0.17157894736842105, + "grad_norm": 6.809089660644531, + "learning_rate": 2.9983160269677074e-05, + "loss": 0.1203, + "step": 326 + }, + { + "epoch": 0.17210526315789473, + "grad_norm": 1.7018848657608032, + "learning_rate": 2.998304232568247e-05, + "loss": 0.7828, + "step": 327 + }, + { + "epoch": 0.1726315789473684, + "grad_norm": 1.4800243377685547, + "learning_rate": 2.9982923970326887e-05, + "loss": 0.0606, + "step": 328 + }, + { + "epoch": 0.1731578947368421, + "grad_norm": 3.1752805709838867, + "learning_rate": 2.998280520361359e-05, + "loss": 0.5135, + "step": 329 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 1.6199371814727783, + "learning_rate": 2.9982686025545824e-05, + "loss": 1.2659, + "step": 330 + }, + { + "epoch": 0.17421052631578948, + "grad_norm": 8.967512130737305, + "learning_rate": 2.9982566436126873e-05, + "loss": 1.8805, + "step": 331 + }, + { + "epoch": 0.17473684210526316, + "grad_norm": 20.32599449157715, + "learning_rate": 2.9982446435360016e-05, + "loss": 0.6175, + "step": 332 + }, + { + "epoch": 0.17526315789473684, + "grad_norm": 12.868733406066895, + "learning_rate": 2.9982326023248548e-05, + "loss": 1.1088, + "step": 333 + }, + { + "epoch": 0.17578947368421052, + "grad_norm": 9.508431434631348, + "learning_rate": 2.9982205199795773e-05, + "loss": 1.2409, + "step": 334 + }, + { + "epoch": 0.1763157894736842, + "grad_norm": 1.3848768472671509, + "learning_rate": 2.998208396500501e-05, + "loss": 1.1361, + "step": 335 + }, + { + "epoch": 0.17684210526315788, + "grad_norm": 1.4120675325393677, + "learning_rate": 2.998196231887959e-05, + "loss": 0.8192, + "step": 336 + }, + { + "epoch": 0.1773684210526316, + "grad_norm": 1.9072043895721436, + "learning_rate": 2.998184026142285e-05, + "loss": 1.1288, + "step": 337 + }, + { + "epoch": 0.17789473684210527, + "grad_norm": 17.120038986206055, + "learning_rate": 2.9981717792638143e-05, + "loss": 0.6453, + "step": 338 + }, + { + "epoch": 0.17842105263157895, + "grad_norm": 1.8352843523025513, + "learning_rate": 2.998159491252883e-05, + "loss": 1.7777, + "step": 339 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 1.8334228992462158, + "learning_rate": 2.9981471621098277e-05, + "loss": 1.1382, + "step": 340 + }, + { + "epoch": 0.1794736842105263, + "grad_norm": 2.4187421798706055, + "learning_rate": 2.9981347918349885e-05, + "loss": 1.1575, + "step": 341 + }, + { + "epoch": 0.18, + "grad_norm": 7.076684951782227, + "learning_rate": 2.998122380428704e-05, + "loss": 0.7906, + "step": 342 + }, + { + "epoch": 0.18052631578947367, + "grad_norm": 3.0878186225891113, + "learning_rate": 2.9981099278913147e-05, + "loss": 1.2896, + "step": 343 + }, + { + "epoch": 0.18105263157894738, + "grad_norm": 3.1805942058563232, + "learning_rate": 2.9980974342231633e-05, + "loss": 0.4216, + "step": 344 + }, + { + "epoch": 0.18157894736842106, + "grad_norm": 1.9451438188552856, + "learning_rate": 2.9980848994245926e-05, + "loss": 1.4878, + "step": 345 + }, + { + "epoch": 0.18210526315789474, + "grad_norm": 1.4234017133712769, + "learning_rate": 2.9980723234959464e-05, + "loss": 0.8428, + "step": 346 + }, + { + "epoch": 0.18263157894736842, + "grad_norm": 6.759546756744385, + "learning_rate": 2.99805970643757e-05, + "loss": 1.7714, + "step": 347 + }, + { + "epoch": 0.1831578947368421, + "grad_norm": 7.0381550788879395, + "learning_rate": 2.9980470482498105e-05, + "loss": 0.4636, + "step": 348 + }, + { + "epoch": 0.18368421052631578, + "grad_norm": 6.027357578277588, + "learning_rate": 2.9980343489330143e-05, + "loss": 1.5307, + "step": 349 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 4.1133012771606445, + "learning_rate": 2.998021608487531e-05, + "loss": 1.3673, + "step": 350 + }, + { + "epoch": 0.18473684210526317, + "grad_norm": 4.925607204437256, + "learning_rate": 2.9980088269137097e-05, + "loss": 2.2288, + "step": 351 + }, + { + "epoch": 0.18526315789473685, + "grad_norm": 2.568140983581543, + "learning_rate": 2.997996004211902e-05, + "loss": 1.3601, + "step": 352 + }, + { + "epoch": 0.18578947368421053, + "grad_norm": 2.9798226356506348, + "learning_rate": 2.9979831403824593e-05, + "loss": 1.1923, + "step": 353 + }, + { + "epoch": 0.1863157894736842, + "grad_norm": 1.192610263824463, + "learning_rate": 2.997970235425735e-05, + "loss": 0.0341, + "step": 354 + }, + { + "epoch": 0.1868421052631579, + "grad_norm": 1.2151992321014404, + "learning_rate": 2.997957289342084e-05, + "loss": 0.9861, + "step": 355 + }, + { + "epoch": 0.18736842105263157, + "grad_norm": 1.9701948165893555, + "learning_rate": 2.9979443021318607e-05, + "loss": 0.0926, + "step": 356 + }, + { + "epoch": 0.18789473684210525, + "grad_norm": 1.1996488571166992, + "learning_rate": 2.9979312737954225e-05, + "loss": 0.8892, + "step": 357 + }, + { + "epoch": 0.18842105263157893, + "grad_norm": 3.7606606483459473, + "learning_rate": 2.9979182043331268e-05, + "loss": 1.3173, + "step": 358 + }, + { + "epoch": 0.18894736842105264, + "grad_norm": 4.111024856567383, + "learning_rate": 2.9979050937453324e-05, + "loss": 1.2348, + "step": 359 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 16.119956970214844, + "learning_rate": 2.997891942032399e-05, + "loss": 1.7936, + "step": 360 + }, + { + "epoch": 0.19, + "grad_norm": 9.241058349609375, + "learning_rate": 2.9978787491946886e-05, + "loss": 0.2741, + "step": 361 + }, + { + "epoch": 0.19052631578947368, + "grad_norm": 1.398040533065796, + "learning_rate": 2.9978655152325623e-05, + "loss": 1.1073, + "step": 362 + }, + { + "epoch": 0.19105263157894736, + "grad_norm": 6.744707107543945, + "learning_rate": 2.997852240146384e-05, + "loss": 0.9413, + "step": 363 + }, + { + "epoch": 0.19157894736842104, + "grad_norm": 1.4302072525024414, + "learning_rate": 2.997838923936518e-05, + "loss": 1.1093, + "step": 364 + }, + { + "epoch": 0.19210526315789472, + "grad_norm": 1.3738486766815186, + "learning_rate": 2.99782556660333e-05, + "loss": 1.3803, + "step": 365 + }, + { + "epoch": 0.19263157894736843, + "grad_norm": 1.2413164377212524, + "learning_rate": 2.9978121681471868e-05, + "loss": 0.9807, + "step": 366 + }, + { + "epoch": 0.1931578947368421, + "grad_norm": 1.4329386949539185, + "learning_rate": 2.997798728568456e-05, + "loss": 1.3757, + "step": 367 + }, + { + "epoch": 0.1936842105263158, + "grad_norm": 1.2162076234817505, + "learning_rate": 2.9977852478675068e-05, + "loss": 0.7532, + "step": 368 + }, + { + "epoch": 0.19421052631578947, + "grad_norm": 2.649707078933716, + "learning_rate": 2.9977717260447092e-05, + "loss": 1.0267, + "step": 369 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 6.033254146575928, + "learning_rate": 2.9977581631004343e-05, + "loss": 2.927, + "step": 370 + }, + { + "epoch": 0.19526315789473683, + "grad_norm": 1.8854405879974365, + "learning_rate": 2.9977445590350548e-05, + "loss": 1.3284, + "step": 371 + }, + { + "epoch": 0.1957894736842105, + "grad_norm": 1.3157703876495361, + "learning_rate": 2.9977309138489443e-05, + "loss": 1.6263, + "step": 372 + }, + { + "epoch": 0.19631578947368422, + "grad_norm": 6.529455184936523, + "learning_rate": 2.997717227542477e-05, + "loss": 0.5972, + "step": 373 + }, + { + "epoch": 0.1968421052631579, + "grad_norm": 5.371290683746338, + "learning_rate": 2.997703500116029e-05, + "loss": 0.2108, + "step": 374 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 11.562923431396484, + "learning_rate": 2.9976897315699767e-05, + "loss": 0.5319, + "step": 375 + }, + { + "epoch": 0.19789473684210526, + "grad_norm": 1.7732006311416626, + "learning_rate": 2.9976759219046988e-05, + "loss": 1.3096, + "step": 376 + }, + { + "epoch": 0.19842105263157894, + "grad_norm": 2.1219842433929443, + "learning_rate": 2.997662071120574e-05, + "loss": 1.5544, + "step": 377 + }, + { + "epoch": 0.19894736842105262, + "grad_norm": 2.662497043609619, + "learning_rate": 2.9976481792179827e-05, + "loss": 1.3716, + "step": 378 + }, + { + "epoch": 0.1994736842105263, + "grad_norm": 1.3942060470581055, + "learning_rate": 2.997634246197306e-05, + "loss": 1.8652, + "step": 379 + }, + { + "epoch": 0.2, + "grad_norm": 1.5858086347579956, + "learning_rate": 2.9976202720589273e-05, + "loss": 0.8704, + "step": 380 + }, + { + "epoch": 0.2005263157894737, + "grad_norm": 4.291787624359131, + "learning_rate": 2.997606256803229e-05, + "loss": 0.5378, + "step": 381 + }, + { + "epoch": 0.20105263157894737, + "grad_norm": 1.4939614534378052, + "learning_rate": 2.9975922004305972e-05, + "loss": 1.5748, + "step": 382 + }, + { + "epoch": 0.20157894736842105, + "grad_norm": 3.255887269973755, + "learning_rate": 2.9975781029414168e-05, + "loss": 0.5039, + "step": 383 + }, + { + "epoch": 0.20210526315789473, + "grad_norm": 1.5569735765457153, + "learning_rate": 2.9975639643360756e-05, + "loss": 1.2235, + "step": 384 + }, + { + "epoch": 0.2026315789473684, + "grad_norm": 10.412734985351562, + "learning_rate": 2.997549784614961e-05, + "loss": 1.7202, + "step": 385 + }, + { + "epoch": 0.2031578947368421, + "grad_norm": 1.6121785640716553, + "learning_rate": 2.997535563778463e-05, + "loss": 1.4258, + "step": 386 + }, + { + "epoch": 0.2036842105263158, + "grad_norm": 7.74535608291626, + "learning_rate": 2.9975213018269714e-05, + "loss": 0.3529, + "step": 387 + }, + { + "epoch": 0.20421052631578948, + "grad_norm": 1.4278053045272827, + "learning_rate": 2.997506998760878e-05, + "loss": 1.1017, + "step": 388 + }, + { + "epoch": 0.20473684210526316, + "grad_norm": 4.179811477661133, + "learning_rate": 2.9974926545805762e-05, + "loss": 0.3317, + "step": 389 + }, + { + "epoch": 0.20526315789473684, + "grad_norm": 5.233457088470459, + "learning_rate": 2.997478269286459e-05, + "loss": 2.0176, + "step": 390 + }, + { + "epoch": 0.20578947368421052, + "grad_norm": 10.88487720489502, + "learning_rate": 2.9974638428789216e-05, + "loss": 1.885, + "step": 391 + }, + { + "epoch": 0.2063157894736842, + "grad_norm": 1.2455674409866333, + "learning_rate": 2.9974493753583597e-05, + "loss": 1.3968, + "step": 392 + }, + { + "epoch": 0.20684210526315788, + "grad_norm": 1.7053565979003906, + "learning_rate": 2.9974348667251715e-05, + "loss": 1.0096, + "step": 393 + }, + { + "epoch": 0.2073684210526316, + "grad_norm": 4.857005596160889, + "learning_rate": 2.9974203169797543e-05, + "loss": 0.5129, + "step": 394 + }, + { + "epoch": 0.20789473684210527, + "grad_norm": 1.7618751525878906, + "learning_rate": 2.997405726122508e-05, + "loss": 0.8299, + "step": 395 + }, + { + "epoch": 0.20842105263157895, + "grad_norm": 6.38074254989624, + "learning_rate": 2.9973910941538333e-05, + "loss": 1.711, + "step": 396 + }, + { + "epoch": 0.20894736842105263, + "grad_norm": 4.114274024963379, + "learning_rate": 2.9973764210741312e-05, + "loss": 0.2308, + "step": 397 + }, + { + "epoch": 0.2094736842105263, + "grad_norm": 1.7079466581344604, + "learning_rate": 2.9973617068838056e-05, + "loss": 1.3165, + "step": 398 + }, + { + "epoch": 0.21, + "grad_norm": 2.909496307373047, + "learning_rate": 2.9973469515832604e-05, + "loss": 0.4798, + "step": 399 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 1.0828051567077637, + "learning_rate": 2.9973321551728995e-05, + "loss": 0.751, + "step": 400 + }, + { + "epoch": 0.21105263157894738, + "grad_norm": 2.6114165782928467, + "learning_rate": 2.9973173176531305e-05, + "loss": 1.226, + "step": 401 + }, + { + "epoch": 0.21157894736842106, + "grad_norm": 2.137087106704712, + "learning_rate": 2.99730243902436e-05, + "loss": 1.0014, + "step": 402 + }, + { + "epoch": 0.21210526315789474, + "grad_norm": 34.939544677734375, + "learning_rate": 2.9972875192869965e-05, + "loss": 1.3549, + "step": 403 + }, + { + "epoch": 0.21263157894736842, + "grad_norm": 9.31963062286377, + "learning_rate": 2.9972725584414506e-05, + "loss": 1.6069, + "step": 404 + }, + { + "epoch": 0.2131578947368421, + "grad_norm": 1.6685644388198853, + "learning_rate": 2.9972575564881316e-05, + "loss": 0.6431, + "step": 405 + }, + { + "epoch": 0.21368421052631578, + "grad_norm": 18.59819984436035, + "learning_rate": 2.9972425134274522e-05, + "loss": 3.0248, + "step": 406 + }, + { + "epoch": 0.21421052631578946, + "grad_norm": 5.35085391998291, + "learning_rate": 2.9972274292598255e-05, + "loss": 1.6838, + "step": 407 + }, + { + "epoch": 0.21473684210526317, + "grad_norm": 2.953678846359253, + "learning_rate": 2.9972123039856654e-05, + "loss": 0.9957, + "step": 408 + }, + { + "epoch": 0.21526315789473685, + "grad_norm": 1.5719188451766968, + "learning_rate": 2.997197137605387e-05, + "loss": 1.0414, + "step": 409 + }, + { + "epoch": 0.21578947368421053, + "grad_norm": 5.360487937927246, + "learning_rate": 2.997181930119407e-05, + "loss": 1.5796, + "step": 410 + }, + { + "epoch": 0.2163157894736842, + "grad_norm": 1.611450433731079, + "learning_rate": 2.997166681528143e-05, + "loss": 1.025, + "step": 411 + }, + { + "epoch": 0.2168421052631579, + "grad_norm": 2.5470244884490967, + "learning_rate": 2.997151391832013e-05, + "loss": 1.0886, + "step": 412 + }, + { + "epoch": 0.21736842105263157, + "grad_norm": 1.2462735176086426, + "learning_rate": 2.9971360610314374e-05, + "loss": 0.5111, + "step": 413 + }, + { + "epoch": 0.21789473684210525, + "grad_norm": 1.5405879020690918, + "learning_rate": 2.9971206891268367e-05, + "loss": 0.9845, + "step": 414 + }, + { + "epoch": 0.21842105263157896, + "grad_norm": 1.282374620437622, + "learning_rate": 2.9971052761186338e-05, + "loss": 0.4869, + "step": 415 + }, + { + "epoch": 0.21894736842105264, + "grad_norm": 1.9443198442459106, + "learning_rate": 2.9970898220072505e-05, + "loss": 0.0399, + "step": 416 + }, + { + "epoch": 0.21947368421052632, + "grad_norm": 1.685855746269226, + "learning_rate": 2.9970743267931123e-05, + "loss": 1.3679, + "step": 417 + }, + { + "epoch": 0.22, + "grad_norm": 2.6044466495513916, + "learning_rate": 2.997058790476644e-05, + "loss": 1.4172, + "step": 418 + }, + { + "epoch": 0.22052631578947368, + "grad_norm": 3.274233818054199, + "learning_rate": 2.9970432130582727e-05, + "loss": 1.5899, + "step": 419 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 7.699522972106934, + "learning_rate": 2.9970275945384252e-05, + "loss": 1.5596, + "step": 420 + }, + { + "epoch": 0.22157894736842104, + "grad_norm": 1.1986091136932373, + "learning_rate": 2.997011934917531e-05, + "loss": 1.1682, + "step": 421 + }, + { + "epoch": 0.22210526315789475, + "grad_norm": 56.02631378173828, + "learning_rate": 2.9969962341960196e-05, + "loss": 1.7715, + "step": 422 + }, + { + "epoch": 0.22263157894736843, + "grad_norm": 2.560245990753174, + "learning_rate": 2.996980492374323e-05, + "loss": 0.9785, + "step": 423 + }, + { + "epoch": 0.2231578947368421, + "grad_norm": 11.875818252563477, + "learning_rate": 2.9969647094528718e-05, + "loss": 1.4328, + "step": 424 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 1.9637598991394043, + "learning_rate": 2.9969488854321007e-05, + "loss": 2.0752, + "step": 425 + }, + { + "epoch": 0.22421052631578947, + "grad_norm": 1.1943355798721313, + "learning_rate": 2.9969330203124433e-05, + "loss": 0.9533, + "step": 426 + }, + { + "epoch": 0.22473684210526315, + "grad_norm": 3.17193341255188, + "learning_rate": 2.996917114094336e-05, + "loss": 0.3756, + "step": 427 + }, + { + "epoch": 0.22526315789473683, + "grad_norm": 6.2271928787231445, + "learning_rate": 2.9969011667782152e-05, + "loss": 1.5742, + "step": 428 + }, + { + "epoch": 0.22578947368421054, + "grad_norm": 3.3042328357696533, + "learning_rate": 2.9968851783645182e-05, + "loss": 1.2712, + "step": 429 + }, + { + "epoch": 0.22631578947368422, + "grad_norm": 1.6236546039581299, + "learning_rate": 2.9968691488536842e-05, + "loss": 1.3309, + "step": 430 + }, + { + "epoch": 0.2268421052631579, + "grad_norm": 3.908189535140991, + "learning_rate": 2.9968530782461537e-05, + "loss": 2.0394, + "step": 431 + }, + { + "epoch": 0.22736842105263158, + "grad_norm": 4.952622413635254, + "learning_rate": 2.9968369665423677e-05, + "loss": 1.2584, + "step": 432 + }, + { + "epoch": 0.22789473684210526, + "grad_norm": 3.6018247604370117, + "learning_rate": 2.9968208137427685e-05, + "loss": 0.6666, + "step": 433 + }, + { + "epoch": 0.22842105263157894, + "grad_norm": 1.319495677947998, + "learning_rate": 2.9968046198477994e-05, + "loss": 1.0497, + "step": 434 + }, + { + "epoch": 0.22894736842105262, + "grad_norm": 9.26498031616211, + "learning_rate": 2.9967883848579052e-05, + "loss": 1.8237, + "step": 435 + }, + { + "epoch": 0.2294736842105263, + "grad_norm": 2.161882162094116, + "learning_rate": 2.996772108773532e-05, + "loss": 1.5562, + "step": 436 + }, + { + "epoch": 0.23, + "grad_norm": 1.2435736656188965, + "learning_rate": 2.9967557915951258e-05, + "loss": 1.292, + "step": 437 + }, + { + "epoch": 0.2305263157894737, + "grad_norm": 1.310845971107483, + "learning_rate": 2.996739433323135e-05, + "loss": 0.8349, + "step": 438 + }, + { + "epoch": 0.23105263157894737, + "grad_norm": 10.944952011108398, + "learning_rate": 2.9967230339580095e-05, + "loss": 0.5981, + "step": 439 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 3.8131868839263916, + "learning_rate": 2.996706593500198e-05, + "loss": 1.3155, + "step": 440 + }, + { + "epoch": 0.23210526315789473, + "grad_norm": 3.2014780044555664, + "learning_rate": 2.9966901119501535e-05, + "loss": 1.2769, + "step": 441 + }, + { + "epoch": 0.2326315789473684, + "grad_norm": 0.4154053330421448, + "learning_rate": 2.9966735893083274e-05, + "loss": 0.0089, + "step": 442 + }, + { + "epoch": 0.2331578947368421, + "grad_norm": 1.2587388753890991, + "learning_rate": 2.996657025575174e-05, + "loss": 1.34, + "step": 443 + }, + { + "epoch": 0.2336842105263158, + "grad_norm": 1.6259454488754272, + "learning_rate": 2.996640420751147e-05, + "loss": 1.2116, + "step": 444 + }, + { + "epoch": 0.23421052631578948, + "grad_norm": 24.173168182373047, + "learning_rate": 2.9966237748367032e-05, + "loss": 1.2547, + "step": 445 + }, + { + "epoch": 0.23473684210526316, + "grad_norm": 1.4230668544769287, + "learning_rate": 2.9966070878322994e-05, + "loss": 0.9427, + "step": 446 + }, + { + "epoch": 0.23526315789473684, + "grad_norm": 1.4833992719650269, + "learning_rate": 2.996590359738394e-05, + "loss": 0.9436, + "step": 447 + }, + { + "epoch": 0.23578947368421052, + "grad_norm": 2.0306854248046875, + "learning_rate": 2.996573590555446e-05, + "loss": 1.333, + "step": 448 + }, + { + "epoch": 0.2363157894736842, + "grad_norm": 3.2706172466278076, + "learning_rate": 2.996556780283916e-05, + "loss": 0.3585, + "step": 449 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 2.0781776905059814, + "learning_rate": 2.996539928924265e-05, + "loss": 1.1149, + "step": 450 + }, + { + "epoch": 0.2373684210526316, + "grad_norm": 3.0296876430511475, + "learning_rate": 2.996523036476956e-05, + "loss": 0.4218, + "step": 451 + }, + { + "epoch": 0.23789473684210527, + "grad_norm": 4.4293293952941895, + "learning_rate": 2.9965061029424524e-05, + "loss": 1.6268, + "step": 452 + }, + { + "epoch": 0.23842105263157895, + "grad_norm": 1.4754269123077393, + "learning_rate": 2.9964891283212202e-05, + "loss": 1.3374, + "step": 453 + }, + { + "epoch": 0.23894736842105263, + "grad_norm": 8.703702926635742, + "learning_rate": 2.9964721126137245e-05, + "loss": 1.5505, + "step": 454 + }, + { + "epoch": 0.2394736842105263, + "grad_norm": 1.779236078262329, + "learning_rate": 2.9964550558204332e-05, + "loss": 1.0386, + "step": 455 + }, + { + "epoch": 0.24, + "grad_norm": 2.5311484336853027, + "learning_rate": 2.9964379579418136e-05, + "loss": 1.4634, + "step": 456 + }, + { + "epoch": 0.24052631578947367, + "grad_norm": 8.054793357849121, + "learning_rate": 2.9964208189783355e-05, + "loss": 0.4973, + "step": 457 + }, + { + "epoch": 0.24105263157894738, + "grad_norm": 4.909681797027588, + "learning_rate": 2.9964036389304696e-05, + "loss": 1.1552, + "step": 458 + }, + { + "epoch": 0.24157894736842106, + "grad_norm": 6.568388938903809, + "learning_rate": 2.996386417798688e-05, + "loss": 1.339, + "step": 459 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 1.7436282634735107, + "learning_rate": 2.9963691555834626e-05, + "loss": 1.6004, + "step": 460 + }, + { + "epoch": 0.24263157894736842, + "grad_norm": 2.2429983615875244, + "learning_rate": 2.996351852285268e-05, + "loss": 1.1068, + "step": 461 + }, + { + "epoch": 0.2431578947368421, + "grad_norm": 2.5454885959625244, + "learning_rate": 2.996334507904579e-05, + "loss": 0.2007, + "step": 462 + }, + { + "epoch": 0.24368421052631578, + "grad_norm": 1.5255674123764038, + "learning_rate": 2.9963171224418722e-05, + "loss": 1.0913, + "step": 463 + }, + { + "epoch": 0.24421052631578946, + "grad_norm": 1.427351713180542, + "learning_rate": 2.9962996958976242e-05, + "loss": 0.7462, + "step": 464 + }, + { + "epoch": 0.24473684210526317, + "grad_norm": 1.3530781269073486, + "learning_rate": 2.9962822282723136e-05, + "loss": 1.3773, + "step": 465 + }, + { + "epoch": 0.24526315789473685, + "grad_norm": 1.161293625831604, + "learning_rate": 2.996264719566421e-05, + "loss": 0.7285, + "step": 466 + }, + { + "epoch": 0.24578947368421053, + "grad_norm": 3.624866008758545, + "learning_rate": 2.996247169780426e-05, + "loss": 0.9183, + "step": 467 + }, + { + "epoch": 0.2463157894736842, + "grad_norm": 1.182325005531311, + "learning_rate": 2.9962295789148104e-05, + "loss": 1.1596, + "step": 468 + }, + { + "epoch": 0.2468421052631579, + "grad_norm": 2.112578868865967, + "learning_rate": 2.9962119469700577e-05, + "loss": 1.302, + "step": 469 + }, + { + "epoch": 0.24736842105263157, + "grad_norm": 2.4179584980010986, + "learning_rate": 2.9961942739466516e-05, + "loss": 1.1984, + "step": 470 + }, + { + "epoch": 0.24789473684210525, + "grad_norm": 5.681208610534668, + "learning_rate": 2.9961765598450782e-05, + "loss": 1.3961, + "step": 471 + }, + { + "epoch": 0.24842105263157896, + "grad_norm": 1.9745169878005981, + "learning_rate": 2.9961588046658222e-05, + "loss": 0.8747, + "step": 472 + }, + { + "epoch": 0.24894736842105264, + "grad_norm": 1.0899487733840942, + "learning_rate": 2.9961410084093727e-05, + "loss": 0.8849, + "step": 473 + }, + { + "epoch": 0.24947368421052632, + "grad_norm": 27.703815460205078, + "learning_rate": 2.9961231710762173e-05, + "loss": 1.362, + "step": 474 + }, + { + "epoch": 0.25, + "grad_norm": 77.52987670898438, + "learning_rate": 2.996105292666846e-05, + "loss": 1.9388, + "step": 475 + }, + { + "epoch": 0.25, + "eval_loss": 1.0506709814071655, + "eval_runtime": 12.9408, + "eval_samples_per_second": 7.727, + "eval_steps_per_second": 7.727, + "step": 475 + }, + { + "epoch": 0.2505263157894737, + "grad_norm": 1.5593270063400269, + "learning_rate": 2.99608737318175e-05, + "loss": 1.1288, + "step": 476 + }, + { + "epoch": 0.25105263157894736, + "grad_norm": 3.246509552001953, + "learning_rate": 2.9960694126214204e-05, + "loss": 1.1222, + "step": 477 + }, + { + "epoch": 0.25157894736842107, + "grad_norm": 1.3423675298690796, + "learning_rate": 2.996051410986351e-05, + "loss": 0.8958, + "step": 478 + }, + { + "epoch": 0.2521052631578947, + "grad_norm": 2.2557055950164795, + "learning_rate": 2.9960333682770367e-05, + "loss": 1.0483, + "step": 479 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 1.5628443956375122, + "learning_rate": 2.9960152844939716e-05, + "loss": 1.1525, + "step": 480 + }, + { + "epoch": 0.2531578947368421, + "grad_norm": 4.363856315612793, + "learning_rate": 2.9959971596376527e-05, + "loss": 2.8042, + "step": 481 + }, + { + "epoch": 0.2536842105263158, + "grad_norm": 5.757142543792725, + "learning_rate": 2.9959789937085774e-05, + "loss": 0.9964, + "step": 482 + }, + { + "epoch": 0.2542105263157895, + "grad_norm": 2.2105233669281006, + "learning_rate": 2.995960786707245e-05, + "loss": 0.844, + "step": 483 + }, + { + "epoch": 0.25473684210526315, + "grad_norm": 1.6293426752090454, + "learning_rate": 2.9959425386341544e-05, + "loss": 1.1284, + "step": 484 + }, + { + "epoch": 0.25526315789473686, + "grad_norm": 5.280759334564209, + "learning_rate": 2.9959242494898076e-05, + "loss": 1.1822, + "step": 485 + }, + { + "epoch": 0.2557894736842105, + "grad_norm": 3.082880735397339, + "learning_rate": 2.9959059192747064e-05, + "loss": 0.4506, + "step": 486 + }, + { + "epoch": 0.2563157894736842, + "grad_norm": 1.4250710010528564, + "learning_rate": 2.995887547989354e-05, + "loss": 0.8048, + "step": 487 + }, + { + "epoch": 0.25684210526315787, + "grad_norm": 2.168121814727783, + "learning_rate": 2.9958691356342546e-05, + "loss": 1.1605, + "step": 488 + }, + { + "epoch": 0.2573684210526316, + "grad_norm": 1.3707858324050903, + "learning_rate": 2.995850682209914e-05, + "loss": 0.9299, + "step": 489 + }, + { + "epoch": 0.2578947368421053, + "grad_norm": 4.050701141357422, + "learning_rate": 2.9958321877168392e-05, + "loss": 0.5374, + "step": 490 + }, + { + "epoch": 0.25842105263157894, + "grad_norm": 7.512131214141846, + "learning_rate": 2.9958136521555373e-05, + "loss": 1.5434, + "step": 491 + }, + { + "epoch": 0.25894736842105265, + "grad_norm": 2.668602466583252, + "learning_rate": 2.9957950755265167e-05, + "loss": 1.0796, + "step": 492 + }, + { + "epoch": 0.2594736842105263, + "grad_norm": 1.4869720935821533, + "learning_rate": 2.995776457830289e-05, + "loss": 1.2192, + "step": 493 + }, + { + "epoch": 0.26, + "grad_norm": 2.3927292823791504, + "learning_rate": 2.995757799067364e-05, + "loss": 1.1358, + "step": 494 + }, + { + "epoch": 0.26052631578947366, + "grad_norm": 1.5988667011260986, + "learning_rate": 2.995739099238255e-05, + "loss": 0.9472, + "step": 495 + }, + { + "epoch": 0.26105263157894737, + "grad_norm": 1.5140737295150757, + "learning_rate": 2.9957203583434742e-05, + "loss": 0.6382, + "step": 496 + }, + { + "epoch": 0.2615789473684211, + "grad_norm": 1.3366851806640625, + "learning_rate": 2.9957015763835368e-05, + "loss": 0.909, + "step": 497 + }, + { + "epoch": 0.26210526315789473, + "grad_norm": 1.2181779146194458, + "learning_rate": 2.9956827533589592e-05, + "loss": 0.8209, + "step": 498 + }, + { + "epoch": 0.26263157894736844, + "grad_norm": 5.791037559509277, + "learning_rate": 2.995663889270257e-05, + "loss": 0.7449, + "step": 499 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 8.648612976074219, + "learning_rate": 2.9956449841179486e-05, + "loss": 1.3843, + "step": 500 + }, + { + "epoch": 0.2636842105263158, + "grad_norm": 4.5848307609558105, + "learning_rate": 2.995626037902553e-05, + "loss": 0.9603, + "step": 501 + }, + { + "epoch": 0.26421052631578945, + "grad_norm": 1.4494788646697998, + "learning_rate": 2.99560705062459e-05, + "loss": 0.7254, + "step": 502 + }, + { + "epoch": 0.26473684210526316, + "grad_norm": 16.663463592529297, + "learning_rate": 2.9955880222845818e-05, + "loss": 0.3266, + "step": 503 + }, + { + "epoch": 0.26526315789473687, + "grad_norm": 1.475623607635498, + "learning_rate": 2.99556895288305e-05, + "loss": 1.3868, + "step": 504 + }, + { + "epoch": 0.2657894736842105, + "grad_norm": 1.3773105144500732, + "learning_rate": 2.995549842420519e-05, + "loss": 0.1299, + "step": 505 + }, + { + "epoch": 0.26631578947368423, + "grad_norm": 1.1917214393615723, + "learning_rate": 2.9955306908975126e-05, + "loss": 1.0726, + "step": 506 + }, + { + "epoch": 0.2668421052631579, + "grad_norm": 5.162087917327881, + "learning_rate": 2.9955114983145567e-05, + "loss": 1.251, + "step": 507 + }, + { + "epoch": 0.2673684210526316, + "grad_norm": 1.6344598531723022, + "learning_rate": 2.9954922646721785e-05, + "loss": 1.1682, + "step": 508 + }, + { + "epoch": 0.26789473684210524, + "grad_norm": 4.443221569061279, + "learning_rate": 2.9954729899709065e-05, + "loss": 1.285, + "step": 509 + }, + { + "epoch": 0.26842105263157895, + "grad_norm": 1.5260694026947021, + "learning_rate": 2.9954536742112688e-05, + "loss": 1.305, + "step": 510 + }, + { + "epoch": 0.26894736842105266, + "grad_norm": 1.5212806463241577, + "learning_rate": 2.9954343173937968e-05, + "loss": 0.8619, + "step": 511 + }, + { + "epoch": 0.2694736842105263, + "grad_norm": 46.06574249267578, + "learning_rate": 2.9954149195190214e-05, + "loss": 1.2454, + "step": 512 + }, + { + "epoch": 0.27, + "grad_norm": 1.1406009197235107, + "learning_rate": 2.995395480587475e-05, + "loss": 0.8901, + "step": 513 + }, + { + "epoch": 0.27052631578947367, + "grad_norm": 1.038522720336914, + "learning_rate": 2.995376000599692e-05, + "loss": 0.5812, + "step": 514 + }, + { + "epoch": 0.2710526315789474, + "grad_norm": 4.814447402954102, + "learning_rate": 2.9953564795562064e-05, + "loss": 0.5695, + "step": 515 + }, + { + "epoch": 0.27157894736842103, + "grad_norm": 3.0107245445251465, + "learning_rate": 2.995336917457555e-05, + "loss": 1.6017, + "step": 516 + }, + { + "epoch": 0.27210526315789474, + "grad_norm": 0.9930879473686218, + "learning_rate": 2.995317314304274e-05, + "loss": 0.9568, + "step": 517 + }, + { + "epoch": 0.27263157894736845, + "grad_norm": 24.54469108581543, + "learning_rate": 2.995297670096902e-05, + "loss": 0.8168, + "step": 518 + }, + { + "epoch": 0.2731578947368421, + "grad_norm": 2.448007106781006, + "learning_rate": 2.995277984835979e-05, + "loss": 1.5648, + "step": 519 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 3.251126527786255, + "learning_rate": 2.995258258522044e-05, + "loss": 1.4491, + "step": 520 + }, + { + "epoch": 0.27421052631578946, + "grad_norm": 1.0893594026565552, + "learning_rate": 2.9952384911556394e-05, + "loss": 1.3153, + "step": 521 + }, + { + "epoch": 0.27473684210526317, + "grad_norm": 4.283177375793457, + "learning_rate": 2.9952186827373085e-05, + "loss": 0.2124, + "step": 522 + }, + { + "epoch": 0.2752631578947368, + "grad_norm": 3.8956921100616455, + "learning_rate": 2.995198833267594e-05, + "loss": 1.0426, + "step": 523 + }, + { + "epoch": 0.27578947368421053, + "grad_norm": 4.345573425292969, + "learning_rate": 2.995178942747042e-05, + "loss": 0.2201, + "step": 524 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 1.2920668125152588, + "learning_rate": 2.9951590111761975e-05, + "loss": 1.4519, + "step": 525 + }, + { + "epoch": 0.2768421052631579, + "grad_norm": 1.0732390880584717, + "learning_rate": 2.9951390385556084e-05, + "loss": 0.8767, + "step": 526 + }, + { + "epoch": 0.2773684210526316, + "grad_norm": 1.2250876426696777, + "learning_rate": 2.995119024885823e-05, + "loss": 0.6662, + "step": 527 + }, + { + "epoch": 0.27789473684210525, + "grad_norm": 4.883981704711914, + "learning_rate": 2.9950989701673906e-05, + "loss": 1.5669, + "step": 528 + }, + { + "epoch": 0.27842105263157896, + "grad_norm": 1.9038349390029907, + "learning_rate": 2.995078874400862e-05, + "loss": 1.1141, + "step": 529 + }, + { + "epoch": 0.2789473684210526, + "grad_norm": 8.930418968200684, + "learning_rate": 2.9950587375867887e-05, + "loss": 1.2227, + "step": 530 + }, + { + "epoch": 0.2794736842105263, + "grad_norm": 2.2693824768066406, + "learning_rate": 2.995038559725724e-05, + "loss": 1.0804, + "step": 531 + }, + { + "epoch": 0.28, + "grad_norm": 2.965463161468506, + "learning_rate": 2.995018340818221e-05, + "loss": 1.2129, + "step": 532 + }, + { + "epoch": 0.2805263157894737, + "grad_norm": 2.060148000717163, + "learning_rate": 2.9949980808648357e-05, + "loss": 0.6663, + "step": 533 + }, + { + "epoch": 0.2810526315789474, + "grad_norm": 1.0076817274093628, + "learning_rate": 2.9949777798661236e-05, + "loss": 0.7417, + "step": 534 + }, + { + "epoch": 0.28157894736842104, + "grad_norm": 2.449284791946411, + "learning_rate": 2.9949574378226426e-05, + "loss": 1.5616, + "step": 535 + }, + { + "epoch": 0.28210526315789475, + "grad_norm": 1.3279973268508911, + "learning_rate": 2.994937054734952e-05, + "loss": 0.9661, + "step": 536 + }, + { + "epoch": 0.2826315789473684, + "grad_norm": 3.356504201889038, + "learning_rate": 2.9949166306036095e-05, + "loss": 1.584, + "step": 537 + }, + { + "epoch": 0.2831578947368421, + "grad_norm": 1.2326231002807617, + "learning_rate": 2.994896165429177e-05, + "loss": 0.9024, + "step": 538 + }, + { + "epoch": 0.2836842105263158, + "grad_norm": 7.02909517288208, + "learning_rate": 2.994875659212216e-05, + "loss": 1.0278, + "step": 539 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 1.6224465370178223, + "learning_rate": 2.9948551119532902e-05, + "loss": 1.278, + "step": 540 + }, + { + "epoch": 0.2847368421052632, + "grad_norm": 1.3300915956497192, + "learning_rate": 2.994834523652963e-05, + "loss": 1.1461, + "step": 541 + }, + { + "epoch": 0.28526315789473683, + "grad_norm": 1.0725505352020264, + "learning_rate": 2.9948138943117996e-05, + "loss": 0.9856, + "step": 542 + }, + { + "epoch": 0.28578947368421054, + "grad_norm": 1.2995253801345825, + "learning_rate": 2.9947932239303673e-05, + "loss": 0.922, + "step": 543 + }, + { + "epoch": 0.2863157894736842, + "grad_norm": 3.25260066986084, + "learning_rate": 2.9947725125092326e-05, + "loss": 1.5961, + "step": 544 + }, + { + "epoch": 0.2868421052631579, + "grad_norm": 1.5616159439086914, + "learning_rate": 2.9947517600489643e-05, + "loss": 1.4245, + "step": 545 + }, + { + "epoch": 0.2873684210526316, + "grad_norm": 14.093785285949707, + "learning_rate": 2.9947309665501327e-05, + "loss": 0.9921, + "step": 546 + }, + { + "epoch": 0.28789473684210526, + "grad_norm": 1.165595293045044, + "learning_rate": 2.9947101320133083e-05, + "loss": 1.0, + "step": 547 + }, + { + "epoch": 0.28842105263157897, + "grad_norm": 2.5077242851257324, + "learning_rate": 2.994689256439063e-05, + "loss": 1.5877, + "step": 548 + }, + { + "epoch": 0.2889473684210526, + "grad_norm": 1.8317209482192993, + "learning_rate": 2.9946683398279706e-05, + "loss": 1.0294, + "step": 549 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 1.0942083597183228, + "learning_rate": 2.9946473821806044e-05, + "loss": 0.4813, + "step": 550 + }, + { + "epoch": 0.29, + "grad_norm": 1.6907622814178467, + "learning_rate": 2.9946263834975403e-05, + "loss": 0.9892, + "step": 551 + }, + { + "epoch": 0.2905263157894737, + "grad_norm": 9.270956993103027, + "learning_rate": 2.994605343779355e-05, + "loss": 0.6519, + "step": 552 + }, + { + "epoch": 0.2910526315789474, + "grad_norm": 3.074319362640381, + "learning_rate": 2.9945842630266258e-05, + "loss": 0.6775, + "step": 553 + }, + { + "epoch": 0.29157894736842105, + "grad_norm": 1.4713774919509888, + "learning_rate": 2.994563141239932e-05, + "loss": 1.1378, + "step": 554 + }, + { + "epoch": 0.29210526315789476, + "grad_norm": 3.8262014389038086, + "learning_rate": 2.9945419784198524e-05, + "loss": 1.2291, + "step": 555 + }, + { + "epoch": 0.2926315789473684, + "grad_norm": 3.6956279277801514, + "learning_rate": 2.9945207745669695e-05, + "loss": 0.2079, + "step": 556 + }, + { + "epoch": 0.2931578947368421, + "grad_norm": 1.555780053138733, + "learning_rate": 2.9944995296818642e-05, + "loss": 0.9855, + "step": 557 + }, + { + "epoch": 0.29368421052631577, + "grad_norm": 7.439387321472168, + "learning_rate": 2.9944782437651203e-05, + "loss": 0.1662, + "step": 558 + }, + { + "epoch": 0.2942105263157895, + "grad_norm": 2.846749782562256, + "learning_rate": 2.994456916817322e-05, + "loss": 0.4499, + "step": 559 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 2.7918450832366943, + "learning_rate": 2.9944355488390555e-05, + "loss": 0.0985, + "step": 560 + }, + { + "epoch": 0.29526315789473684, + "grad_norm": 3.7580766677856445, + "learning_rate": 2.9944141398309067e-05, + "loss": 0.7624, + "step": 561 + }, + { + "epoch": 0.29578947368421055, + "grad_norm": 1.398541808128357, + "learning_rate": 2.994392689793464e-05, + "loss": 0.7911, + "step": 562 + }, + { + "epoch": 0.2963157894736842, + "grad_norm": 7.155591011047363, + "learning_rate": 2.9943711987273156e-05, + "loss": 0.2768, + "step": 563 + }, + { + "epoch": 0.2968421052631579, + "grad_norm": 15.522272109985352, + "learning_rate": 2.994349666633052e-05, + "loss": 1.0977, + "step": 564 + }, + { + "epoch": 0.29736842105263156, + "grad_norm": 2.732008457183838, + "learning_rate": 2.9943280935112644e-05, + "loss": 0.749, + "step": 565 + }, + { + "epoch": 0.29789473684210527, + "grad_norm": 1.2546172142028809, + "learning_rate": 2.994306479362545e-05, + "loss": 0.823, + "step": 566 + }, + { + "epoch": 0.2984210526315789, + "grad_norm": 1.2818124294281006, + "learning_rate": 2.994284824187487e-05, + "loss": 1.1513, + "step": 567 + }, + { + "epoch": 0.29894736842105263, + "grad_norm": 1.5917184352874756, + "learning_rate": 2.9942631279866853e-05, + "loss": 0.8293, + "step": 568 + }, + { + "epoch": 0.29947368421052634, + "grad_norm": 1.2113322019577026, + "learning_rate": 2.9942413907607355e-05, + "loss": 0.3476, + "step": 569 + }, + { + "epoch": 0.3, + "grad_norm": 2.0724539756774902, + "learning_rate": 2.994219612510234e-05, + "loss": 0.6935, + "step": 570 + }, + { + "epoch": 0.3005263157894737, + "grad_norm": 2.444352149963379, + "learning_rate": 2.9941977932357794e-05, + "loss": 0.1079, + "step": 571 + }, + { + "epoch": 0.30105263157894735, + "grad_norm": 4.327816486358643, + "learning_rate": 2.99417593293797e-05, + "loss": 0.5792, + "step": 572 + }, + { + "epoch": 0.30157894736842106, + "grad_norm": 1.2691162824630737, + "learning_rate": 2.9941540316174066e-05, + "loss": 0.9346, + "step": 573 + }, + { + "epoch": 0.3021052631578947, + "grad_norm": 2.0975606441497803, + "learning_rate": 2.99413208927469e-05, + "loss": 1.1176, + "step": 574 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 1.5340697765350342, + "learning_rate": 2.994110105910423e-05, + "loss": 0.862, + "step": 575 + }, + { + "epoch": 0.3031578947368421, + "grad_norm": 16.70974349975586, + "learning_rate": 2.9940880815252097e-05, + "loss": 0.6121, + "step": 576 + }, + { + "epoch": 0.3036842105263158, + "grad_norm": 3.6483004093170166, + "learning_rate": 2.9940660161196536e-05, + "loss": 1.0742, + "step": 577 + }, + { + "epoch": 0.3042105263157895, + "grad_norm": 1.5293868780136108, + "learning_rate": 2.994043909694361e-05, + "loss": 0.7131, + "step": 578 + }, + { + "epoch": 0.30473684210526314, + "grad_norm": 3.1697440147399902, + "learning_rate": 2.994021762249939e-05, + "loss": 0.7688, + "step": 579 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 11.020563125610352, + "learning_rate": 2.993999573786995e-05, + "loss": 2.4059, + "step": 580 + }, + { + "epoch": 0.3057894736842105, + "grad_norm": 1.883289098739624, + "learning_rate": 2.9939773443061393e-05, + "loss": 1.0566, + "step": 581 + }, + { + "epoch": 0.3063157894736842, + "grad_norm": 1.52479887008667, + "learning_rate": 2.9939550738079814e-05, + "loss": 1.0846, + "step": 582 + }, + { + "epoch": 0.3068421052631579, + "grad_norm": 4.475525856018066, + "learning_rate": 2.9939327622931333e-05, + "loss": 0.6157, + "step": 583 + }, + { + "epoch": 0.30736842105263157, + "grad_norm": 1.8167206048965454, + "learning_rate": 2.993910409762207e-05, + "loss": 1.3638, + "step": 584 + }, + { + "epoch": 0.3078947368421053, + "grad_norm": 1.2058393955230713, + "learning_rate": 2.9938880162158164e-05, + "loss": 1.0466, + "step": 585 + }, + { + "epoch": 0.30842105263157893, + "grad_norm": 8.277087211608887, + "learning_rate": 2.9938655816545764e-05, + "loss": 1.4132, + "step": 586 + }, + { + "epoch": 0.30894736842105264, + "grad_norm": 2.545767307281494, + "learning_rate": 2.993843106079103e-05, + "loss": 0.7397, + "step": 587 + }, + { + "epoch": 0.3094736842105263, + "grad_norm": 1.162050485610962, + "learning_rate": 2.993820589490013e-05, + "loss": 1.1346, + "step": 588 + }, + { + "epoch": 0.31, + "grad_norm": 1.3114622831344604, + "learning_rate": 2.9937980318879247e-05, + "loss": 0.9332, + "step": 589 + }, + { + "epoch": 0.3105263157894737, + "grad_norm": 2.397101640701294, + "learning_rate": 2.9937754332734576e-05, + "loss": 0.7573, + "step": 590 + }, + { + "epoch": 0.31105263157894736, + "grad_norm": 1.4673283100128174, + "learning_rate": 2.993752793647232e-05, + "loss": 0.8484, + "step": 591 + }, + { + "epoch": 0.31157894736842107, + "grad_norm": 9.11661148071289, + "learning_rate": 2.9937301130098696e-05, + "loss": 0.4733, + "step": 592 + }, + { + "epoch": 0.3121052631578947, + "grad_norm": 1.699437141418457, + "learning_rate": 2.9937073913619926e-05, + "loss": 0.5332, + "step": 593 + }, + { + "epoch": 0.3126315789473684, + "grad_norm": 27.69738006591797, + "learning_rate": 2.9936846287042254e-05, + "loss": 2.1628, + "step": 594 + }, + { + "epoch": 0.3131578947368421, + "grad_norm": 10.93797779083252, + "learning_rate": 2.993661825037193e-05, + "loss": 1.469, + "step": 595 + }, + { + "epoch": 0.3136842105263158, + "grad_norm": 4.551391124725342, + "learning_rate": 2.993638980361521e-05, + "loss": 0.6163, + "step": 596 + }, + { + "epoch": 0.3142105263157895, + "grad_norm": 2.5031051635742188, + "learning_rate": 2.9936160946778372e-05, + "loss": 1.3955, + "step": 597 + }, + { + "epoch": 0.31473684210526315, + "grad_norm": 1.7328745126724243, + "learning_rate": 2.993593167986769e-05, + "loss": 0.7825, + "step": 598 + }, + { + "epoch": 0.31526315789473686, + "grad_norm": 4.045513153076172, + "learning_rate": 2.993570200288947e-05, + "loss": 0.4318, + "step": 599 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 2.73246169090271, + "learning_rate": 2.993547191585001e-05, + "loss": 1.4336, + "step": 600 + }, + { + "epoch": 0.3163157894736842, + "grad_norm": 1.558140754699707, + "learning_rate": 2.9935241418755626e-05, + "loss": 1.0587, + "step": 601 + }, + { + "epoch": 0.31684210526315787, + "grad_norm": 11.362480163574219, + "learning_rate": 2.9935010511612655e-05, + "loss": 0.4542, + "step": 602 + }, + { + "epoch": 0.3173684210526316, + "grad_norm": 1.3051596879959106, + "learning_rate": 2.9934779194427427e-05, + "loss": 0.7598, + "step": 603 + }, + { + "epoch": 0.3178947368421053, + "grad_norm": 3.5925891399383545, + "learning_rate": 2.99345474672063e-05, + "loss": 1.1406, + "step": 604 + }, + { + "epoch": 0.31842105263157894, + "grad_norm": 12.116620063781738, + "learning_rate": 2.993431532995563e-05, + "loss": 1.519, + "step": 605 + }, + { + "epoch": 0.31894736842105265, + "grad_norm": 1.2009161710739136, + "learning_rate": 2.9934082782681797e-05, + "loss": 0.9572, + "step": 606 + }, + { + "epoch": 0.3194736842105263, + "grad_norm": 2.059299945831299, + "learning_rate": 2.993384982539118e-05, + "loss": 0.6667, + "step": 607 + }, + { + "epoch": 0.32, + "grad_norm": 2.226381778717041, + "learning_rate": 2.9933616458090174e-05, + "loss": 1.2235, + "step": 608 + }, + { + "epoch": 0.32052631578947366, + "grad_norm": 1.5386918783187866, + "learning_rate": 2.9933382680785196e-05, + "loss": 1.2193, + "step": 609 + }, + { + "epoch": 0.32105263157894737, + "grad_norm": 4.095519065856934, + "learning_rate": 2.9933148493482653e-05, + "loss": 0.2043, + "step": 610 + }, + { + "epoch": 0.3215789473684211, + "grad_norm": 16.421064376831055, + "learning_rate": 2.9932913896188978e-05, + "loss": 1.4538, + "step": 611 + }, + { + "epoch": 0.32210526315789473, + "grad_norm": 1.1642028093338013, + "learning_rate": 2.9932678888910614e-05, + "loss": 0.7069, + "step": 612 + }, + { + "epoch": 0.32263157894736844, + "grad_norm": 1.0497969388961792, + "learning_rate": 2.9932443471654013e-05, + "loss": 1.027, + "step": 613 + }, + { + "epoch": 0.3231578947368421, + "grad_norm": 1.70055091381073, + "learning_rate": 2.9932207644425635e-05, + "loss": 1.0163, + "step": 614 + }, + { + "epoch": 0.3236842105263158, + "grad_norm": 0.4913199841976166, + "learning_rate": 2.993197140723196e-05, + "loss": 0.0092, + "step": 615 + }, + { + "epoch": 0.32421052631578945, + "grad_norm": 9.340396881103516, + "learning_rate": 2.993173476007947e-05, + "loss": 0.7314, + "step": 616 + }, + { + "epoch": 0.32473684210526316, + "grad_norm": 3.6009857654571533, + "learning_rate": 2.993149770297466e-05, + "loss": 0.9824, + "step": 617 + }, + { + "epoch": 0.32526315789473687, + "grad_norm": 1.5114206075668335, + "learning_rate": 2.9931260235924046e-05, + "loss": 0.6525, + "step": 618 + }, + { + "epoch": 0.3257894736842105, + "grad_norm": 4.900846004486084, + "learning_rate": 2.9931022358934144e-05, + "loss": 1.0022, + "step": 619 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 3.2764687538146973, + "learning_rate": 2.993078407201148e-05, + "loss": 1.571, + "step": 620 + }, + { + "epoch": 0.3268421052631579, + "grad_norm": 1.1837842464447021, + "learning_rate": 2.9930545375162602e-05, + "loss": 1.2526, + "step": 621 + }, + { + "epoch": 0.3273684210526316, + "grad_norm": 1.7309764623641968, + "learning_rate": 2.993030626839406e-05, + "loss": 0.9898, + "step": 622 + }, + { + "epoch": 0.32789473684210524, + "grad_norm": 1.766406536102295, + "learning_rate": 2.9930066751712427e-05, + "loss": 0.6513, + "step": 623 + }, + { + "epoch": 0.32842105263157895, + "grad_norm": 1.1759055852890015, + "learning_rate": 2.9929826825124268e-05, + "loss": 1.0862, + "step": 624 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 46.77162551879883, + "learning_rate": 2.9929586488636174e-05, + "loss": 3.9448, + "step": 625 + }, + { + "epoch": 0.3294736842105263, + "grad_norm": 1.6841140985488892, + "learning_rate": 2.992934574225475e-05, + "loss": 1.6885, + "step": 626 + }, + { + "epoch": 0.33, + "grad_norm": 1.449564814567566, + "learning_rate": 2.9929104585986594e-05, + "loss": 1.354, + "step": 627 + }, + { + "epoch": 0.33052631578947367, + "grad_norm": 2.3545868396759033, + "learning_rate": 2.992886301983833e-05, + "loss": 0.7053, + "step": 628 + }, + { + "epoch": 0.3310526315789474, + "grad_norm": 1.053680181503296, + "learning_rate": 2.9928621043816602e-05, + "loss": 1.3087, + "step": 629 + }, + { + "epoch": 0.33157894736842103, + "grad_norm": 7.419710636138916, + "learning_rate": 2.9928378657928037e-05, + "loss": 0.3903, + "step": 630 + }, + { + "epoch": 0.33210526315789474, + "grad_norm": 1.032652497291565, + "learning_rate": 2.9928135862179304e-05, + "loss": 1.0834, + "step": 631 + }, + { + "epoch": 0.33263157894736844, + "grad_norm": 11.945768356323242, + "learning_rate": 2.9927892656577057e-05, + "loss": 0.5555, + "step": 632 + }, + { + "epoch": 0.3331578947368421, + "grad_norm": 3.9728310108184814, + "learning_rate": 2.9927649041127978e-05, + "loss": 0.2488, + "step": 633 + }, + { + "epoch": 0.3336842105263158, + "grad_norm": 1.3373970985412598, + "learning_rate": 2.992740501583876e-05, + "loss": 1.0203, + "step": 634 + }, + { + "epoch": 0.33421052631578946, + "grad_norm": 4.078335762023926, + "learning_rate": 2.9927160580716096e-05, + "loss": 1.0495, + "step": 635 + }, + { + "epoch": 0.33473684210526317, + "grad_norm": 1.9304755926132202, + "learning_rate": 2.99269157357667e-05, + "loss": 1.4033, + "step": 636 + }, + { + "epoch": 0.3352631578947368, + "grad_norm": 2.1133267879486084, + "learning_rate": 2.99266704809973e-05, + "loss": 1.1063, + "step": 637 + }, + { + "epoch": 0.3357894736842105, + "grad_norm": 6.6004180908203125, + "learning_rate": 2.9926424816414615e-05, + "loss": 1.1123, + "step": 638 + }, + { + "epoch": 0.33631578947368423, + "grad_norm": 1.2974992990493774, + "learning_rate": 2.9926178742025403e-05, + "loss": 2.3209, + "step": 639 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 1.2676901817321777, + "learning_rate": 2.992593225783641e-05, + "loss": 0.9767, + "step": 640 + }, + { + "epoch": 0.3373684210526316, + "grad_norm": 1.001715064048767, + "learning_rate": 2.9925685363854413e-05, + "loss": 0.8202, + "step": 641 + }, + { + "epoch": 0.33789473684210525, + "grad_norm": 3.934710741043091, + "learning_rate": 2.9925438060086187e-05, + "loss": 1.4305, + "step": 642 + }, + { + "epoch": 0.33842105263157896, + "grad_norm": 1.8003560304641724, + "learning_rate": 2.992519034653852e-05, + "loss": 0.8166, + "step": 643 + }, + { + "epoch": 0.3389473684210526, + "grad_norm": 1.138811469078064, + "learning_rate": 2.992494222321821e-05, + "loss": 0.8121, + "step": 644 + }, + { + "epoch": 0.3394736842105263, + "grad_norm": 1.1089503765106201, + "learning_rate": 2.992469369013208e-05, + "loss": 1.1773, + "step": 645 + }, + { + "epoch": 0.34, + "grad_norm": 9.685154914855957, + "learning_rate": 2.992444474728694e-05, + "loss": 1.0016, + "step": 646 + }, + { + "epoch": 0.3405263157894737, + "grad_norm": 1.1541014909744263, + "learning_rate": 2.9924195394689635e-05, + "loss": 1.0952, + "step": 647 + }, + { + "epoch": 0.3410526315789474, + "grad_norm": 1.8897780179977417, + "learning_rate": 2.9923945632347002e-05, + "loss": 1.2069, + "step": 648 + }, + { + "epoch": 0.34157894736842104, + "grad_norm": 3.347874879837036, + "learning_rate": 2.9923695460265912e-05, + "loss": 1.0908, + "step": 649 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 12.232775688171387, + "learning_rate": 2.992344487845322e-05, + "loss": 1.9784, + "step": 650 + }, + { + "epoch": 0.3426315789473684, + "grad_norm": 2.2362685203552246, + "learning_rate": 2.992319388691581e-05, + "loss": 0.3282, + "step": 651 + }, + { + "epoch": 0.3431578947368421, + "grad_norm": 1.7842745780944824, + "learning_rate": 2.9922942485660577e-05, + "loss": 1.34, + "step": 652 + }, + { + "epoch": 0.3436842105263158, + "grad_norm": 4.246903419494629, + "learning_rate": 2.9922690674694418e-05, + "loss": 0.8923, + "step": 653 + }, + { + "epoch": 0.34421052631578947, + "grad_norm": 5.96973991394043, + "learning_rate": 2.9922438454024246e-05, + "loss": 0.6434, + "step": 654 + }, + { + "epoch": 0.3447368421052632, + "grad_norm": 3.631617784500122, + "learning_rate": 2.992218582365699e-05, + "loss": 0.9737, + "step": 655 + }, + { + "epoch": 0.3452631578947368, + "grad_norm": 11.352360725402832, + "learning_rate": 2.9921932783599585e-05, + "loss": 1.8412, + "step": 656 + }, + { + "epoch": 0.34578947368421054, + "grad_norm": 1.3203455209732056, + "learning_rate": 2.9921679333858976e-05, + "loss": 1.0928, + "step": 657 + }, + { + "epoch": 0.3463157894736842, + "grad_norm": 1.1191567182540894, + "learning_rate": 2.9921425474442127e-05, + "loss": 1.4272, + "step": 658 + }, + { + "epoch": 0.3468421052631579, + "grad_norm": 2.8861405849456787, + "learning_rate": 2.9921171205356e-05, + "loss": 0.9221, + "step": 659 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 2.145612955093384, + "learning_rate": 2.992091652660758e-05, + "loss": 0.3259, + "step": 660 + }, + { + "epoch": 0.34789473684210526, + "grad_norm": 3.5325045585632324, + "learning_rate": 2.9920661438203862e-05, + "loss": 2.93, + "step": 661 + }, + { + "epoch": 0.34842105263157896, + "grad_norm": 1.2416492700576782, + "learning_rate": 2.9920405940151842e-05, + "loss": 1.2741, + "step": 662 + }, + { + "epoch": 0.3489473684210526, + "grad_norm": 2.492912530899048, + "learning_rate": 2.9920150032458538e-05, + "loss": 0.9804, + "step": 663 + }, + { + "epoch": 0.3494736842105263, + "grad_norm": 10.124689102172852, + "learning_rate": 2.9919893715130983e-05, + "loss": 1.3878, + "step": 664 + }, + { + "epoch": 0.35, + "grad_norm": 1.77699613571167, + "learning_rate": 2.9919636988176208e-05, + "loss": 1.2149, + "step": 665 + }, + { + "epoch": 0.3505263157894737, + "grad_norm": 1.3935060501098633, + "learning_rate": 2.9919379851601256e-05, + "loss": 1.2551, + "step": 666 + }, + { + "epoch": 0.3510526315789474, + "grad_norm": 5.249575138092041, + "learning_rate": 2.9919122305413196e-05, + "loss": 0.7921, + "step": 667 + }, + { + "epoch": 0.35157894736842105, + "grad_norm": 3.862658739089966, + "learning_rate": 2.9918864349619094e-05, + "loss": 0.269, + "step": 668 + }, + { + "epoch": 0.35210526315789475, + "grad_norm": 4.229588508605957, + "learning_rate": 2.991860598422604e-05, + "loss": 0.9079, + "step": 669 + }, + { + "epoch": 0.3526315789473684, + "grad_norm": 8.937419891357422, + "learning_rate": 2.9918347209241116e-05, + "loss": 0.694, + "step": 670 + }, + { + "epoch": 0.3531578947368421, + "grad_norm": 9.598597526550293, + "learning_rate": 2.9918088024671428e-05, + "loss": 0.4956, + "step": 671 + }, + { + "epoch": 0.35368421052631577, + "grad_norm": 11.96364974975586, + "learning_rate": 2.99178284305241e-05, + "loss": 1.865, + "step": 672 + }, + { + "epoch": 0.3542105263157895, + "grad_norm": 8.622352600097656, + "learning_rate": 2.9917568426806253e-05, + "loss": 1.3497, + "step": 673 + }, + { + "epoch": 0.3547368421052632, + "grad_norm": 1.6212915182113647, + "learning_rate": 2.991730801352503e-05, + "loss": 1.0978, + "step": 674 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 1.9762486219406128, + "learning_rate": 2.9917047190687578e-05, + "loss": 1.0297, + "step": 675 + }, + { + "epoch": 0.35578947368421054, + "grad_norm": 1.137174129486084, + "learning_rate": 2.991678595830106e-05, + "loss": 1.3445, + "step": 676 + }, + { + "epoch": 0.3563157894736842, + "grad_norm": 1.2727190256118774, + "learning_rate": 2.991652431637264e-05, + "loss": 0.6727, + "step": 677 + }, + { + "epoch": 0.3568421052631579, + "grad_norm": 3.9113903045654297, + "learning_rate": 2.991626226490951e-05, + "loss": 1.0311, + "step": 678 + }, + { + "epoch": 0.35736842105263156, + "grad_norm": 3.078036069869995, + "learning_rate": 2.9915999803918862e-05, + "loss": 1.232, + "step": 679 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 2.5189783573150635, + "learning_rate": 2.99157369334079e-05, + "loss": 1.1612, + "step": 680 + }, + { + "epoch": 0.358421052631579, + "grad_norm": 3.2792701721191406, + "learning_rate": 2.991547365338385e-05, + "loss": 0.1674, + "step": 681 + }, + { + "epoch": 0.3589473684210526, + "grad_norm": 2.9844839572906494, + "learning_rate": 2.9915209963853928e-05, + "loss": 0.2604, + "step": 682 + }, + { + "epoch": 0.35947368421052633, + "grad_norm": 1.3294965028762817, + "learning_rate": 2.991494586482538e-05, + "loss": 1.1524, + "step": 683 + }, + { + "epoch": 0.36, + "grad_norm": 7.3986382484436035, + "learning_rate": 2.9914681356305458e-05, + "loss": 1.324, + "step": 684 + }, + { + "epoch": 0.3605263157894737, + "grad_norm": 2.5278208255767822, + "learning_rate": 2.991441643830142e-05, + "loss": 0.7384, + "step": 685 + }, + { + "epoch": 0.36105263157894735, + "grad_norm": 1.2104181051254272, + "learning_rate": 2.991415111082054e-05, + "loss": 1.3023, + "step": 686 + }, + { + "epoch": 0.36157894736842106, + "grad_norm": 1.2583531141281128, + "learning_rate": 2.9913885373870108e-05, + "loss": 1.1101, + "step": 687 + }, + { + "epoch": 0.36210526315789476, + "grad_norm": 1.007744312286377, + "learning_rate": 2.9913619227457413e-05, + "loss": 1.1889, + "step": 688 + }, + { + "epoch": 0.3626315789473684, + "grad_norm": 6.841166019439697, + "learning_rate": 2.991335267158977e-05, + "loss": 1.5798, + "step": 689 + }, + { + "epoch": 0.3631578947368421, + "grad_norm": 4.756072044372559, + "learning_rate": 2.9913085706274485e-05, + "loss": 0.4925, + "step": 690 + }, + { + "epoch": 0.3636842105263158, + "grad_norm": 2.5629639625549316, + "learning_rate": 2.99128183315189e-05, + "loss": 0.872, + "step": 691 + }, + { + "epoch": 0.3642105263157895, + "grad_norm": 5.531483173370361, + "learning_rate": 2.9912550547330348e-05, + "loss": 0.8287, + "step": 692 + }, + { + "epoch": 0.36473684210526314, + "grad_norm": 1.3215042352676392, + "learning_rate": 2.9912282353716184e-05, + "loss": 1.0156, + "step": 693 + }, + { + "epoch": 0.36526315789473685, + "grad_norm": 2.0720999240875244, + "learning_rate": 2.9912013750683773e-05, + "loss": 1.2208, + "step": 694 + }, + { + "epoch": 0.36578947368421055, + "grad_norm": 1.3862260580062866, + "learning_rate": 2.9911744738240487e-05, + "loss": 1.0124, + "step": 695 + }, + { + "epoch": 0.3663157894736842, + "grad_norm": 9.334665298461914, + "learning_rate": 2.991147531639371e-05, + "loss": 0.9209, + "step": 696 + }, + { + "epoch": 0.3668421052631579, + "grad_norm": 6.73415994644165, + "learning_rate": 2.9911205485150846e-05, + "loss": 0.4709, + "step": 697 + }, + { + "epoch": 0.36736842105263157, + "grad_norm": 3.3749637603759766, + "learning_rate": 2.9910935244519294e-05, + "loss": 1.026, + "step": 698 + }, + { + "epoch": 0.3678947368421053, + "grad_norm": 7.481809139251709, + "learning_rate": 2.991066459450648e-05, + "loss": 0.6787, + "step": 699 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 1.9085243940353394, + "learning_rate": 2.991039353511983e-05, + "loss": 1.2883, + "step": 700 + }, + { + "epoch": 0.36894736842105263, + "grad_norm": 1.1209938526153564, + "learning_rate": 2.991012206636679e-05, + "loss": 1.3699, + "step": 701 + }, + { + "epoch": 0.36947368421052634, + "grad_norm": 5.468508243560791, + "learning_rate": 2.9909850188254814e-05, + "loss": 0.5472, + "step": 702 + }, + { + "epoch": 0.37, + "grad_norm": 1.069633960723877, + "learning_rate": 2.9909577900791367e-05, + "loss": 0.8953, + "step": 703 + }, + { + "epoch": 0.3705263157894737, + "grad_norm": 1.5387980937957764, + "learning_rate": 2.9909305203983916e-05, + "loss": 1.1598, + "step": 704 + }, + { + "epoch": 0.37105263157894736, + "grad_norm": 2.226820230484009, + "learning_rate": 2.9909032097839958e-05, + "loss": 1.1898, + "step": 705 + }, + { + "epoch": 0.37157894736842106, + "grad_norm": 1.4225033521652222, + "learning_rate": 2.9908758582366985e-05, + "loss": 1.3026, + "step": 706 + }, + { + "epoch": 0.3721052631578947, + "grad_norm": 5.471367835998535, + "learning_rate": 2.9908484657572507e-05, + "loss": 0.5267, + "step": 707 + }, + { + "epoch": 0.3726315789473684, + "grad_norm": 2.0702991485595703, + "learning_rate": 2.9908210323464047e-05, + "loss": 1.0605, + "step": 708 + }, + { + "epoch": 0.37315789473684213, + "grad_norm": 8.954567909240723, + "learning_rate": 2.9907935580049136e-05, + "loss": 0.5651, + "step": 709 + }, + { + "epoch": 0.3736842105263158, + "grad_norm": 1.6635040044784546, + "learning_rate": 2.9907660427335324e-05, + "loss": 0.6742, + "step": 710 + }, + { + "epoch": 0.3742105263157895, + "grad_norm": 2.7905099391937256, + "learning_rate": 2.990738486533015e-05, + "loss": 0.1368, + "step": 711 + }, + { + "epoch": 0.37473684210526315, + "grad_norm": 2.07755970954895, + "learning_rate": 2.990710889404119e-05, + "loss": 0.3577, + "step": 712 + }, + { + "epoch": 0.37526315789473685, + "grad_norm": 12.278775215148926, + "learning_rate": 2.9906832513476022e-05, + "loss": 1.7846, + "step": 713 + }, + { + "epoch": 0.3757894736842105, + "grad_norm": 2.079127073287964, + "learning_rate": 2.990655572364223e-05, + "loss": 1.0179, + "step": 714 + }, + { + "epoch": 0.3763157894736842, + "grad_norm": 3.017181396484375, + "learning_rate": 2.990627852454741e-05, + "loss": 1.0674, + "step": 715 + }, + { + "epoch": 0.37684210526315787, + "grad_norm": 4.084104537963867, + "learning_rate": 2.9906000916199182e-05, + "loss": 1.392, + "step": 716 + }, + { + "epoch": 0.3773684210526316, + "grad_norm": 2.539130210876465, + "learning_rate": 2.9905722898605162e-05, + "loss": 1.4383, + "step": 717 + }, + { + "epoch": 0.3778947368421053, + "grad_norm": 2.1830291748046875, + "learning_rate": 2.9905444471772978e-05, + "loss": 2.1873, + "step": 718 + }, + { + "epoch": 0.37842105263157894, + "grad_norm": 2.0604703426361084, + "learning_rate": 2.9905165635710286e-05, + "loss": 1.3176, + "step": 719 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 1.2084317207336426, + "learning_rate": 2.990488639042473e-05, + "loss": 0.9763, + "step": 720 + }, + { + "epoch": 0.3794736842105263, + "grad_norm": 4.228016376495361, + "learning_rate": 2.9904606735923988e-05, + "loss": 1.2977, + "step": 721 + }, + { + "epoch": 0.38, + "grad_norm": 7.167438983917236, + "learning_rate": 2.990432667221573e-05, + "loss": 2.1188, + "step": 722 + }, + { + "epoch": 0.38052631578947366, + "grad_norm": 2.2469797134399414, + "learning_rate": 2.9904046199307645e-05, + "loss": 1.2899, + "step": 723 + }, + { + "epoch": 0.38105263157894737, + "grad_norm": 1.2914834022521973, + "learning_rate": 2.9903765317207436e-05, + "loss": 1.1014, + "step": 724 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 1.4903666973114014, + "learning_rate": 2.9903484025922815e-05, + "loss": 1.505, + "step": 725 + }, + { + "epoch": 0.3821052631578947, + "grad_norm": 7.811427593231201, + "learning_rate": 2.9903202325461504e-05, + "loss": 1.1364, + "step": 726 + }, + { + "epoch": 0.38263157894736843, + "grad_norm": 1.9753170013427734, + "learning_rate": 2.9902920215831238e-05, + "loss": 1.3579, + "step": 727 + }, + { + "epoch": 0.3831578947368421, + "grad_norm": 4.232260227203369, + "learning_rate": 2.990263769703976e-05, + "loss": 0.3801, + "step": 728 + }, + { + "epoch": 0.3836842105263158, + "grad_norm": 4.112211227416992, + "learning_rate": 2.9902354769094828e-05, + "loss": 1.2913, + "step": 729 + }, + { + "epoch": 0.38421052631578945, + "grad_norm": 1.6348377466201782, + "learning_rate": 2.990207143200421e-05, + "loss": 0.8208, + "step": 730 + }, + { + "epoch": 0.38473684210526315, + "grad_norm": 2.1267893314361572, + "learning_rate": 2.9901787685775682e-05, + "loss": 1.4784, + "step": 731 + }, + { + "epoch": 0.38526315789473686, + "grad_norm": 1.5363768339157104, + "learning_rate": 2.990150353041704e-05, + "loss": 1.1479, + "step": 732 + }, + { + "epoch": 0.3857894736842105, + "grad_norm": 1.329637885093689, + "learning_rate": 2.9901218965936085e-05, + "loss": 1.1127, + "step": 733 + }, + { + "epoch": 0.3863157894736842, + "grad_norm": 5.673161029815674, + "learning_rate": 2.9900933992340627e-05, + "loss": 1.6329, + "step": 734 + }, + { + "epoch": 0.3868421052631579, + "grad_norm": 2.2011849880218506, + "learning_rate": 2.9900648609638487e-05, + "loss": 0.7907, + "step": 735 + }, + { + "epoch": 0.3873684210526316, + "grad_norm": 8.786739349365234, + "learning_rate": 2.9900362817837506e-05, + "loss": 1.9727, + "step": 736 + }, + { + "epoch": 0.38789473684210524, + "grad_norm": 1.4037230014801025, + "learning_rate": 2.9900076616945527e-05, + "loss": 0.0393, + "step": 737 + }, + { + "epoch": 0.38842105263157894, + "grad_norm": 1.1987260580062866, + "learning_rate": 2.989979000697041e-05, + "loss": 0.5779, + "step": 738 + }, + { + "epoch": 0.38894736842105265, + "grad_norm": 1.5265027284622192, + "learning_rate": 2.989950298792002e-05, + "loss": 1.1865, + "step": 739 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 2.8836402893066406, + "learning_rate": 2.9899215559802243e-05, + "loss": 1.974, + "step": 740 + }, + { + "epoch": 0.39, + "grad_norm": 1.7887353897094727, + "learning_rate": 2.9898927722624966e-05, + "loss": 0.1404, + "step": 741 + }, + { + "epoch": 0.39052631578947367, + "grad_norm": 6.494203567504883, + "learning_rate": 2.9898639476396095e-05, + "loss": 0.5194, + "step": 742 + }, + { + "epoch": 0.3910526315789474, + "grad_norm": 2.408672332763672, + "learning_rate": 2.9898350821123536e-05, + "loss": 0.6292, + "step": 743 + }, + { + "epoch": 0.391578947368421, + "grad_norm": 1.4377915859222412, + "learning_rate": 2.989806175681523e-05, + "loss": 1.4094, + "step": 744 + }, + { + "epoch": 0.39210526315789473, + "grad_norm": 1.4970815181732178, + "learning_rate": 2.9897772283479092e-05, + "loss": 0.8209, + "step": 745 + }, + { + "epoch": 0.39263157894736844, + "grad_norm": 2.360936403274536, + "learning_rate": 2.9897482401123088e-05, + "loss": 1.5443, + "step": 746 + }, + { + "epoch": 0.3931578947368421, + "grad_norm": 5.188052177429199, + "learning_rate": 2.9897192109755162e-05, + "loss": 1.5809, + "step": 747 + }, + { + "epoch": 0.3936842105263158, + "grad_norm": 1.4602450132369995, + "learning_rate": 2.9896901409383296e-05, + "loss": 0.0463, + "step": 748 + }, + { + "epoch": 0.39421052631578946, + "grad_norm": 4.554551601409912, + "learning_rate": 2.9896610300015463e-05, + "loss": 0.9516, + "step": 749 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.1301685571670532, + "learning_rate": 2.9896318781659662e-05, + "loss": 1.0014, + "step": 750 + }, + { + "epoch": 0.3952631578947368, + "grad_norm": 1.8906985521316528, + "learning_rate": 2.9896026854323896e-05, + "loss": 1.1485, + "step": 751 + }, + { + "epoch": 0.3957894736842105, + "grad_norm": 9.192191123962402, + "learning_rate": 2.9895734518016174e-05, + "loss": 0.8871, + "step": 752 + }, + { + "epoch": 0.39631578947368423, + "grad_norm": 1.4287827014923096, + "learning_rate": 2.9895441772744526e-05, + "loss": 1.3338, + "step": 753 + }, + { + "epoch": 0.3968421052631579, + "grad_norm": 1.5386351346969604, + "learning_rate": 2.989514861851699e-05, + "loss": 0.8937, + "step": 754 + }, + { + "epoch": 0.3973684210526316, + "grad_norm": 4.3271379470825195, + "learning_rate": 2.989485505534161e-05, + "loss": 0.9811, + "step": 755 + }, + { + "epoch": 0.39789473684210525, + "grad_norm": 13.09150505065918, + "learning_rate": 2.9894561083226452e-05, + "loss": 1.4184, + "step": 756 + }, + { + "epoch": 0.39842105263157895, + "grad_norm": 2.528856039047241, + "learning_rate": 2.9894266702179586e-05, + "loss": 1.3771, + "step": 757 + }, + { + "epoch": 0.3989473684210526, + "grad_norm": 1.3942149877548218, + "learning_rate": 2.989397191220909e-05, + "loss": 1.0354, + "step": 758 + }, + { + "epoch": 0.3994736842105263, + "grad_norm": 1.3833954334259033, + "learning_rate": 2.989367671332306e-05, + "loss": 1.2427, + "step": 759 + }, + { + "epoch": 0.4, + "grad_norm": 2.9078752994537354, + "learning_rate": 2.98933811055296e-05, + "loss": 0.2044, + "step": 760 + }, + { + "epoch": 0.4005263157894737, + "grad_norm": 4.573373794555664, + "learning_rate": 2.9893085088836828e-05, + "loss": 1.4173, + "step": 761 + }, + { + "epoch": 0.4010526315789474, + "grad_norm": 1.6260179281234741, + "learning_rate": 2.989278866325287e-05, + "loss": 0.8061, + "step": 762 + }, + { + "epoch": 0.40157894736842104, + "grad_norm": 53.73946762084961, + "learning_rate": 2.9892491828785866e-05, + "loss": 5.7539, + "step": 763 + }, + { + "epoch": 0.40210526315789474, + "grad_norm": 1.7073310613632202, + "learning_rate": 2.9892194585443964e-05, + "loss": 1.3272, + "step": 764 + }, + { + "epoch": 0.4026315789473684, + "grad_norm": 1.7463268041610718, + "learning_rate": 2.9891896933235324e-05, + "loss": 0.6709, + "step": 765 + }, + { + "epoch": 0.4031578947368421, + "grad_norm": 1.671372652053833, + "learning_rate": 2.9891598872168116e-05, + "loss": 1.0062, + "step": 766 + }, + { + "epoch": 0.4036842105263158, + "grad_norm": 3.4793472290039062, + "learning_rate": 2.989130040225053e-05, + "loss": 1.4948, + "step": 767 + }, + { + "epoch": 0.40421052631578946, + "grad_norm": 2.485945701599121, + "learning_rate": 2.9891001523490754e-05, + "loss": 1.3262, + "step": 768 + }, + { + "epoch": 0.4047368421052632, + "grad_norm": 1.4725351333618164, + "learning_rate": 2.9890702235897e-05, + "loss": 1.1616, + "step": 769 + }, + { + "epoch": 0.4052631578947368, + "grad_norm": 2.2476260662078857, + "learning_rate": 2.9890402539477476e-05, + "loss": 1.4187, + "step": 770 + }, + { + "epoch": 0.40578947368421053, + "grad_norm": 0.10535408556461334, + "learning_rate": 2.9890102434240415e-05, + "loss": 0.002, + "step": 771 + }, + { + "epoch": 0.4063157894736842, + "grad_norm": 1.7888716459274292, + "learning_rate": 2.9889801920194062e-05, + "loss": 1.9264, + "step": 772 + }, + { + "epoch": 0.4068421052631579, + "grad_norm": 3.374720573425293, + "learning_rate": 2.988950099734666e-05, + "loss": 0.3194, + "step": 773 + }, + { + "epoch": 0.4073684210526316, + "grad_norm": 1.5872454643249512, + "learning_rate": 2.9889199665706475e-05, + "loss": 0.8289, + "step": 774 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 4.640042304992676, + "learning_rate": 2.988889792528178e-05, + "loss": 0.8441, + "step": 775 + }, + { + "epoch": 0.40842105263157896, + "grad_norm": 5.191117763519287, + "learning_rate": 2.9888595776080856e-05, + "loss": 2.0781, + "step": 776 + }, + { + "epoch": 0.4089473684210526, + "grad_norm": 1.595027208328247, + "learning_rate": 2.9888293218111998e-05, + "loss": 1.5206, + "step": 777 + }, + { + "epoch": 0.4094736842105263, + "grad_norm": 2.689854145050049, + "learning_rate": 2.988799025138352e-05, + "loss": 0.7231, + "step": 778 + }, + { + "epoch": 0.41, + "grad_norm": 16.25735092163086, + "learning_rate": 2.988768687590373e-05, + "loss": 1.6409, + "step": 779 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 1.3042892217636108, + "learning_rate": 2.9887383091680964e-05, + "loss": 1.0157, + "step": 780 + }, + { + "epoch": 0.4110526315789474, + "grad_norm": 5.346875190734863, + "learning_rate": 2.9887078898723564e-05, + "loss": 0.657, + "step": 781 + }, + { + "epoch": 0.41157894736842104, + "grad_norm": 1.5279061794281006, + "learning_rate": 2.9886774297039878e-05, + "loss": 1.4934, + "step": 782 + }, + { + "epoch": 0.41210526315789475, + "grad_norm": 12.0784330368042, + "learning_rate": 2.9886469286638265e-05, + "loss": 3.1849, + "step": 783 + }, + { + "epoch": 0.4126315789473684, + "grad_norm": 1.3375122547149658, + "learning_rate": 2.9886163867527107e-05, + "loss": 1.1898, + "step": 784 + }, + { + "epoch": 0.4131578947368421, + "grad_norm": 4.016144275665283, + "learning_rate": 2.9885858039714786e-05, + "loss": 0.863, + "step": 785 + }, + { + "epoch": 0.41368421052631577, + "grad_norm": 2.205333948135376, + "learning_rate": 2.98855518032097e-05, + "loss": 0.8202, + "step": 786 + }, + { + "epoch": 0.4142105263157895, + "grad_norm": 2.130064010620117, + "learning_rate": 2.988524515802025e-05, + "loss": 1.7152, + "step": 787 + }, + { + "epoch": 0.4147368421052632, + "grad_norm": 2.0593249797821045, + "learning_rate": 2.9884938104154864e-05, + "loss": 1.2797, + "step": 788 + }, + { + "epoch": 0.41526315789473683, + "grad_norm": 4.233097553253174, + "learning_rate": 2.9884630641621963e-05, + "loss": 1.3987, + "step": 789 + }, + { + "epoch": 0.41578947368421054, + "grad_norm": 6.048382759094238, + "learning_rate": 2.9884322770429996e-05, + "loss": 1.7027, + "step": 790 + }, + { + "epoch": 0.4163157894736842, + "grad_norm": 4.876536846160889, + "learning_rate": 2.9884014490587418e-05, + "loss": 1.1787, + "step": 791 + }, + { + "epoch": 0.4168421052631579, + "grad_norm": 3.066049337387085, + "learning_rate": 2.9883705802102684e-05, + "loss": 2.2892, + "step": 792 + }, + { + "epoch": 0.41736842105263156, + "grad_norm": 3.7404627799987793, + "learning_rate": 2.9883396704984273e-05, + "loss": 0.4776, + "step": 793 + }, + { + "epoch": 0.41789473684210526, + "grad_norm": 1.4917809963226318, + "learning_rate": 2.9883087199240672e-05, + "loss": 1.1282, + "step": 794 + }, + { + "epoch": 0.41842105263157897, + "grad_norm": 1.7566046714782715, + "learning_rate": 2.988277728488038e-05, + "loss": 1.3632, + "step": 795 + }, + { + "epoch": 0.4189473684210526, + "grad_norm": 3.263943910598755, + "learning_rate": 2.98824669619119e-05, + "loss": 1.245, + "step": 796 + }, + { + "epoch": 0.41947368421052633, + "grad_norm": 1.3323115110397339, + "learning_rate": 2.9882156230343755e-05, + "loss": 0.8262, + "step": 797 + }, + { + "epoch": 0.42, + "grad_norm": 4.155140399932861, + "learning_rate": 2.988184509018448e-05, + "loss": 1.6147, + "step": 798 + }, + { + "epoch": 0.4205263157894737, + "grad_norm": 1.3995517492294312, + "learning_rate": 2.9881533541442615e-05, + "loss": 1.3523, + "step": 799 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 1.254440188407898, + "learning_rate": 2.9881221584126716e-05, + "loss": 0.7949, + "step": 800 + }, + { + "epoch": 0.42157894736842105, + "grad_norm": 1.74281644821167, + "learning_rate": 2.9880909218245335e-05, + "loss": 1.1625, + "step": 801 + }, + { + "epoch": 0.42210526315789476, + "grad_norm": 0.7531611323356628, + "learning_rate": 2.9880596443807065e-05, + "loss": 0.0144, + "step": 802 + }, + { + "epoch": 0.4226315789473684, + "grad_norm": 2.432140827178955, + "learning_rate": 2.9880283260820485e-05, + "loss": 1.2728, + "step": 803 + }, + { + "epoch": 0.4231578947368421, + "grad_norm": 12.019746780395508, + "learning_rate": 2.9879969669294193e-05, + "loss": 0.3237, + "step": 804 + }, + { + "epoch": 0.4236842105263158, + "grad_norm": 9.18172836303711, + "learning_rate": 2.98796556692368e-05, + "loss": 1.1377, + "step": 805 + }, + { + "epoch": 0.4242105263157895, + "grad_norm": 6.744290351867676, + "learning_rate": 2.9879341260656926e-05, + "loss": 0.8986, + "step": 806 + }, + { + "epoch": 0.42473684210526313, + "grad_norm": 6.863618850708008, + "learning_rate": 2.9879026443563207e-05, + "loss": 0.7325, + "step": 807 + }, + { + "epoch": 0.42526315789473684, + "grad_norm": 2.109163761138916, + "learning_rate": 2.9878711217964284e-05, + "loss": 0.7444, + "step": 808 + }, + { + "epoch": 0.42578947368421055, + "grad_norm": 1.5105681419372559, + "learning_rate": 2.9878395583868807e-05, + "loss": 0.0472, + "step": 809 + }, + { + "epoch": 0.4263157894736842, + "grad_norm": 2.5132837295532227, + "learning_rate": 2.9878079541285445e-05, + "loss": 1.1468, + "step": 810 + }, + { + "epoch": 0.4268421052631579, + "grad_norm": 1.0575823783874512, + "learning_rate": 2.987776309022288e-05, + "loss": 0.7931, + "step": 811 + }, + { + "epoch": 0.42736842105263156, + "grad_norm": 2.099186658859253, + "learning_rate": 2.9877446230689795e-05, + "loss": 0.1261, + "step": 812 + }, + { + "epoch": 0.42789473684210527, + "grad_norm": 2.151189088821411, + "learning_rate": 2.9877128962694892e-05, + "loss": 1.3023, + "step": 813 + }, + { + "epoch": 0.4284210526315789, + "grad_norm": 4.0430707931518555, + "learning_rate": 2.987681128624688e-05, + "loss": 0.4215, + "step": 814 + }, + { + "epoch": 0.42894736842105263, + "grad_norm": 2.170595169067383, + "learning_rate": 2.9876493201354475e-05, + "loss": 1.6202, + "step": 815 + }, + { + "epoch": 0.42947368421052634, + "grad_norm": 7.825321674346924, + "learning_rate": 2.987617470802642e-05, + "loss": 0.9704, + "step": 816 + }, + { + "epoch": 0.43, + "grad_norm": 1.3321274518966675, + "learning_rate": 2.9875855806271455e-05, + "loss": 0.985, + "step": 817 + }, + { + "epoch": 0.4305263157894737, + "grad_norm": 3.4284095764160156, + "learning_rate": 2.9875536496098335e-05, + "loss": 1.6423, + "step": 818 + }, + { + "epoch": 0.43105263157894735, + "grad_norm": 1.4579600095748901, + "learning_rate": 2.9875216777515827e-05, + "loss": 1.0348, + "step": 819 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 1.253885269165039, + "learning_rate": 2.987489665053271e-05, + "loss": 0.6495, + "step": 820 + }, + { + "epoch": 0.4321052631578947, + "grad_norm": 4.7803144454956055, + "learning_rate": 2.9874576115157773e-05, + "loss": 0.2175, + "step": 821 + }, + { + "epoch": 0.4326315789473684, + "grad_norm": 2.1819753646850586, + "learning_rate": 2.987425517139981e-05, + "loss": 0.9445, + "step": 822 + }, + { + "epoch": 0.43315789473684213, + "grad_norm": 1.4593207836151123, + "learning_rate": 2.9873933819267647e-05, + "loss": 1.3716, + "step": 823 + }, + { + "epoch": 0.4336842105263158, + "grad_norm": 2.473689556121826, + "learning_rate": 2.9873612058770094e-05, + "loss": 0.5247, + "step": 824 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.997178792953491, + "learning_rate": 2.9873289889915986e-05, + "loss": 0.9902, + "step": 825 + }, + { + "epoch": 0.43473684210526314, + "grad_norm": 6.612910270690918, + "learning_rate": 2.9872967312714176e-05, + "loss": 1.2973, + "step": 826 + }, + { + "epoch": 0.43526315789473685, + "grad_norm": 1.0687013864517212, + "learning_rate": 2.9872644327173513e-05, + "loss": 0.7058, + "step": 827 + }, + { + "epoch": 0.4357894736842105, + "grad_norm": 1.554075837135315, + "learning_rate": 2.9872320933302867e-05, + "loss": 1.4401, + "step": 828 + }, + { + "epoch": 0.4363157894736842, + "grad_norm": 4.839378833770752, + "learning_rate": 2.9871997131111122e-05, + "loss": 1.1889, + "step": 829 + }, + { + "epoch": 0.4368421052631579, + "grad_norm": 3.0694851875305176, + "learning_rate": 2.9871672920607158e-05, + "loss": 1.0211, + "step": 830 + }, + { + "epoch": 0.4373684210526316, + "grad_norm": 1.378533124923706, + "learning_rate": 2.9871348301799883e-05, + "loss": 0.6821, + "step": 831 + }, + { + "epoch": 0.4378947368421053, + "grad_norm": 1.238147497177124, + "learning_rate": 2.9871023274698204e-05, + "loss": 0.0303, + "step": 832 + }, + { + "epoch": 0.43842105263157893, + "grad_norm": 1.4806623458862305, + "learning_rate": 2.9870697839311053e-05, + "loss": 1.2068, + "step": 833 + }, + { + "epoch": 0.43894736842105264, + "grad_norm": 1.3188241720199585, + "learning_rate": 2.9870371995647353e-05, + "loss": 0.6705, + "step": 834 + }, + { + "epoch": 0.4394736842105263, + "grad_norm": 4.55914831161499, + "learning_rate": 2.9870045743716063e-05, + "loss": 0.0898, + "step": 835 + }, + { + "epoch": 0.44, + "grad_norm": 0.970885694026947, + "learning_rate": 2.9869719083526137e-05, + "loss": 1.0416, + "step": 836 + }, + { + "epoch": 0.4405263157894737, + "grad_norm": 1.664801836013794, + "learning_rate": 2.9869392015086538e-05, + "loss": 1.3485, + "step": 837 + }, + { + "epoch": 0.44105263157894736, + "grad_norm": 2.7517263889312744, + "learning_rate": 2.9869064538406247e-05, + "loss": 1.0703, + "step": 838 + }, + { + "epoch": 0.44157894736842107, + "grad_norm": 0.9083302021026611, + "learning_rate": 2.986873665349426e-05, + "loss": 0.0277, + "step": 839 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 2.142995834350586, + "learning_rate": 2.986840836035957e-05, + "loss": 1.0772, + "step": 840 + }, + { + "epoch": 0.44263157894736843, + "grad_norm": 0.20032253861427307, + "learning_rate": 2.98680796590112e-05, + "loss": 0.0051, + "step": 841 + }, + { + "epoch": 0.4431578947368421, + "grad_norm": 0.8623015880584717, + "learning_rate": 2.9867750549458173e-05, + "loss": 0.6315, + "step": 842 + }, + { + "epoch": 0.4436842105263158, + "grad_norm": 1.2665982246398926, + "learning_rate": 2.9867421031709517e-05, + "loss": 1.4777, + "step": 843 + }, + { + "epoch": 0.4442105263157895, + "grad_norm": 0.974398672580719, + "learning_rate": 2.986709110577429e-05, + "loss": 0.9672, + "step": 844 + }, + { + "epoch": 0.44473684210526315, + "grad_norm": 0.9700206518173218, + "learning_rate": 2.9866760771661544e-05, + "loss": 1.0432, + "step": 845 + }, + { + "epoch": 0.44526315789473686, + "grad_norm": 2.319387912750244, + "learning_rate": 2.9866430029380342e-05, + "loss": 1.3111, + "step": 846 + }, + { + "epoch": 0.4457894736842105, + "grad_norm": 0.9847918748855591, + "learning_rate": 2.9866098878939777e-05, + "loss": 1.1562, + "step": 847 + }, + { + "epoch": 0.4463157894736842, + "grad_norm": 5.33390474319458, + "learning_rate": 2.9865767320348932e-05, + "loss": 1.2682, + "step": 848 + }, + { + "epoch": 0.4468421052631579, + "grad_norm": 9.808623313903809, + "learning_rate": 2.9865435353616915e-05, + "loss": 1.3406, + "step": 849 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 6.855868816375732, + "learning_rate": 2.9865102978752837e-05, + "loss": 2.3309, + "step": 850 + }, + { + "epoch": 0.4478947368421053, + "grad_norm": 2.986323595046997, + "learning_rate": 2.9864770195765828e-05, + "loss": 1.596, + "step": 851 + }, + { + "epoch": 0.44842105263157894, + "grad_norm": 2.0322799682617188, + "learning_rate": 2.9864437004665016e-05, + "loss": 1.0168, + "step": 852 + }, + { + "epoch": 0.44894736842105265, + "grad_norm": 14.111528396606445, + "learning_rate": 2.9864103405459556e-05, + "loss": 1.2123, + "step": 853 + }, + { + "epoch": 0.4494736842105263, + "grad_norm": 1.4620112180709839, + "learning_rate": 2.9863769398158607e-05, + "loss": 1.1575, + "step": 854 + }, + { + "epoch": 0.45, + "grad_norm": 2.048269271850586, + "learning_rate": 2.9863434982771338e-05, + "loss": 1.4347, + "step": 855 + }, + { + "epoch": 0.45052631578947366, + "grad_norm": 1.1362407207489014, + "learning_rate": 2.9863100159306923e-05, + "loss": 0.9232, + "step": 856 + }, + { + "epoch": 0.45105263157894737, + "grad_norm": 14.55726432800293, + "learning_rate": 2.9862764927774567e-05, + "loss": 1.6201, + "step": 857 + }, + { + "epoch": 0.4515789473684211, + "grad_norm": 4.592569828033447, + "learning_rate": 2.9862429288183468e-05, + "loss": 1.5495, + "step": 858 + }, + { + "epoch": 0.45210526315789473, + "grad_norm": 0.8918259143829346, + "learning_rate": 2.986209324054284e-05, + "loss": 0.9341, + "step": 859 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 3.0468339920043945, + "learning_rate": 2.986175678486191e-05, + "loss": 0.9043, + "step": 860 + }, + { + "epoch": 0.4531578947368421, + "grad_norm": 1.4248181581497192, + "learning_rate": 2.9861419921149916e-05, + "loss": 0.983, + "step": 861 + }, + { + "epoch": 0.4536842105263158, + "grad_norm": 2.1727728843688965, + "learning_rate": 2.9861082649416107e-05, + "loss": 0.3648, + "step": 862 + }, + { + "epoch": 0.45421052631578945, + "grad_norm": 1.469250202178955, + "learning_rate": 2.9860744969669742e-05, + "loss": 0.9197, + "step": 863 + }, + { + "epoch": 0.45473684210526316, + "grad_norm": 3.119738817214966, + "learning_rate": 2.986040688192009e-05, + "loss": 1.2948, + "step": 864 + }, + { + "epoch": 0.45526315789473687, + "grad_norm": 7.314899921417236, + "learning_rate": 2.9860068386176437e-05, + "loss": 1.1322, + "step": 865 + }, + { + "epoch": 0.4557894736842105, + "grad_norm": 1.2942012548446655, + "learning_rate": 2.9859729482448073e-05, + "loss": 1.0436, + "step": 866 + }, + { + "epoch": 0.45631578947368423, + "grad_norm": 16.526906967163086, + "learning_rate": 2.985939017074431e-05, + "loss": 0.7217, + "step": 867 + }, + { + "epoch": 0.4568421052631579, + "grad_norm": 0.9518576860427856, + "learning_rate": 2.9859050451074453e-05, + "loss": 0.0462, + "step": 868 + }, + { + "epoch": 0.4573684210526316, + "grad_norm": 1.8784120082855225, + "learning_rate": 2.985871032344784e-05, + "loss": 1.1546, + "step": 869 + }, + { + "epoch": 0.45789473684210524, + "grad_norm": 2.373981475830078, + "learning_rate": 2.9858369787873795e-05, + "loss": 1.1577, + "step": 870 + }, + { + "epoch": 0.45842105263157895, + "grad_norm": 4.2424421310424805, + "learning_rate": 2.985802884436168e-05, + "loss": 0.8525, + "step": 871 + }, + { + "epoch": 0.4589473684210526, + "grad_norm": 1.318802833557129, + "learning_rate": 2.9857687492920854e-05, + "loss": 0.7319, + "step": 872 + }, + { + "epoch": 0.4594736842105263, + "grad_norm": 1.247564435005188, + "learning_rate": 2.9857345733560683e-05, + "loss": 1.0628, + "step": 873 + }, + { + "epoch": 0.46, + "grad_norm": 1.433790683746338, + "learning_rate": 2.985700356629056e-05, + "loss": 1.1417, + "step": 874 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 3.1146788597106934, + "learning_rate": 2.9856660991119867e-05, + "loss": 1.4214, + "step": 875 + }, + { + "epoch": 0.4610526315789474, + "grad_norm": 9.475061416625977, + "learning_rate": 2.9856318008058018e-05, + "loss": 1.015, + "step": 876 + }, + { + "epoch": 0.46157894736842103, + "grad_norm": 1.2461477518081665, + "learning_rate": 2.9855974617114425e-05, + "loss": 1.0464, + "step": 877 + }, + { + "epoch": 0.46210526315789474, + "grad_norm": 1.4924275875091553, + "learning_rate": 2.9855630818298518e-05, + "loss": 1.7069, + "step": 878 + }, + { + "epoch": 0.4626315789473684, + "grad_norm": 1.3920341730117798, + "learning_rate": 2.9855286611619733e-05, + "loss": 1.5036, + "step": 879 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 2.099785089492798, + "learning_rate": 2.985494199708753e-05, + "loss": 0.057, + "step": 880 + }, + { + "epoch": 0.4636842105263158, + "grad_norm": 2.4705018997192383, + "learning_rate": 2.985459697471136e-05, + "loss": 1.091, + "step": 881 + }, + { + "epoch": 0.46421052631578946, + "grad_norm": 7.607650279998779, + "learning_rate": 2.98542515445007e-05, + "loss": 0.3252, + "step": 882 + }, + { + "epoch": 0.46473684210526317, + "grad_norm": 1.6341148614883423, + "learning_rate": 2.985390570646503e-05, + "loss": 1.5967, + "step": 883 + }, + { + "epoch": 0.4652631578947368, + "grad_norm": 1.6126140356063843, + "learning_rate": 2.9853559460613846e-05, + "loss": 1.2722, + "step": 884 + }, + { + "epoch": 0.46578947368421053, + "grad_norm": 13.708699226379395, + "learning_rate": 2.985321280695666e-05, + "loss": 1.7056, + "step": 885 + }, + { + "epoch": 0.4663157894736842, + "grad_norm": 1.0587328672409058, + "learning_rate": 2.9852865745502988e-05, + "loss": 0.9254, + "step": 886 + }, + { + "epoch": 0.4668421052631579, + "grad_norm": 1.383562445640564, + "learning_rate": 2.9852518276262352e-05, + "loss": 0.821, + "step": 887 + }, + { + "epoch": 0.4673684210526316, + "grad_norm": 5.15762996673584, + "learning_rate": 2.9852170399244297e-05, + "loss": 1.1727, + "step": 888 + }, + { + "epoch": 0.46789473684210525, + "grad_norm": 25.248504638671875, + "learning_rate": 2.9851822114458374e-05, + "loss": 2.5076, + "step": 889 + }, + { + "epoch": 0.46842105263157896, + "grad_norm": 4.159931182861328, + "learning_rate": 2.985147342191414e-05, + "loss": 0.4708, + "step": 890 + }, + { + "epoch": 0.4689473684210526, + "grad_norm": 1.1170576810836792, + "learning_rate": 2.9851124321621177e-05, + "loss": 0.8062, + "step": 891 + }, + { + "epoch": 0.4694736842105263, + "grad_norm": 1.807730793952942, + "learning_rate": 2.9850774813589065e-05, + "loss": 0.5932, + "step": 892 + }, + { + "epoch": 0.47, + "grad_norm": 1.2220526933670044, + "learning_rate": 2.98504248978274e-05, + "loss": 1.0653, + "step": 893 + }, + { + "epoch": 0.4705263157894737, + "grad_norm": 7.340139389038086, + "learning_rate": 2.9850074574345787e-05, + "loss": 2.1767, + "step": 894 + }, + { + "epoch": 0.4710526315789474, + "grad_norm": 2.8421573638916016, + "learning_rate": 2.9849723843153847e-05, + "loss": 0.619, + "step": 895 + }, + { + "epoch": 0.47157894736842104, + "grad_norm": 1.6204451322555542, + "learning_rate": 2.9849372704261203e-05, + "loss": 0.469, + "step": 896 + }, + { + "epoch": 0.47210526315789475, + "grad_norm": 1.9400737285614014, + "learning_rate": 2.9849021157677506e-05, + "loss": 1.0654, + "step": 897 + }, + { + "epoch": 0.4726315789473684, + "grad_norm": 1.4052708148956299, + "learning_rate": 2.9848669203412404e-05, + "loss": 0.8773, + "step": 898 + }, + { + "epoch": 0.4731578947368421, + "grad_norm": 6.795404434204102, + "learning_rate": 2.984831684147556e-05, + "loss": 2.0904, + "step": 899 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 2.273846387863159, + "learning_rate": 2.9847964071876642e-05, + "loss": 1.3245, + "step": 900 + }, + { + "epoch": 0.47421052631578947, + "grad_norm": 1.403640866279602, + "learning_rate": 2.9847610894625343e-05, + "loss": 0.8352, + "step": 901 + }, + { + "epoch": 0.4747368421052632, + "grad_norm": 1.2695461511611938, + "learning_rate": 2.9847257309731357e-05, + "loss": 1.0238, + "step": 902 + }, + { + "epoch": 0.47526315789473683, + "grad_norm": 2.6055543422698975, + "learning_rate": 2.9846903317204388e-05, + "loss": 0.2152, + "step": 903 + }, + { + "epoch": 0.47578947368421054, + "grad_norm": 2.166196584701538, + "learning_rate": 2.984654891705416e-05, + "loss": 1.2715, + "step": 904 + }, + { + "epoch": 0.4763157894736842, + "grad_norm": 7.2398247718811035, + "learning_rate": 2.9846194109290404e-05, + "loss": 2.0351, + "step": 905 + }, + { + "epoch": 0.4768421052631579, + "grad_norm": 4.5485453605651855, + "learning_rate": 2.9845838893922854e-05, + "loss": 0.6056, + "step": 906 + }, + { + "epoch": 0.47736842105263155, + "grad_norm": 2.5836188793182373, + "learning_rate": 2.9845483270961267e-05, + "loss": 2.0319, + "step": 907 + }, + { + "epoch": 0.47789473684210526, + "grad_norm": 1.6816260814666748, + "learning_rate": 2.984512724041541e-05, + "loss": 0.8913, + "step": 908 + }, + { + "epoch": 0.47842105263157897, + "grad_norm": 1.6629785299301147, + "learning_rate": 2.9844770802295056e-05, + "loss": 1.5281, + "step": 909 + }, + { + "epoch": 0.4789473684210526, + "grad_norm": 1.2143621444702148, + "learning_rate": 2.9844413956609985e-05, + "loss": 1.0262, + "step": 910 + }, + { + "epoch": 0.47947368421052633, + "grad_norm": 4.233440399169922, + "learning_rate": 2.984405670337e-05, + "loss": 1.4231, + "step": 911 + }, + { + "epoch": 0.48, + "grad_norm": 1.3030482530593872, + "learning_rate": 2.9843699042584908e-05, + "loss": 0.6924, + "step": 912 + }, + { + "epoch": 0.4805263157894737, + "grad_norm": 5.886407375335693, + "learning_rate": 2.9843340974264532e-05, + "loss": 2.0718, + "step": 913 + }, + { + "epoch": 0.48105263157894734, + "grad_norm": 1.7257221937179565, + "learning_rate": 2.98429824984187e-05, + "loss": 2.0068, + "step": 914 + }, + { + "epoch": 0.48157894736842105, + "grad_norm": 2.2462844848632812, + "learning_rate": 2.9842623615057248e-05, + "loss": 1.7576, + "step": 915 + }, + { + "epoch": 0.48210526315789476, + "grad_norm": 6.659467697143555, + "learning_rate": 2.984226432419004e-05, + "loss": 0.8356, + "step": 916 + }, + { + "epoch": 0.4826315789473684, + "grad_norm": 1.0781395435333252, + "learning_rate": 2.9841904625826933e-05, + "loss": 1.0799, + "step": 917 + }, + { + "epoch": 0.4831578947368421, + "grad_norm": 1.9946705102920532, + "learning_rate": 2.9841544519977805e-05, + "loss": 1.7794, + "step": 918 + }, + { + "epoch": 0.48368421052631577, + "grad_norm": 1.2607777118682861, + "learning_rate": 2.9841184006652544e-05, + "loss": 1.1695, + "step": 919 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 1.6544026136398315, + "learning_rate": 2.9840823085861047e-05, + "loss": 1.0608, + "step": 920 + }, + { + "epoch": 0.48473684210526313, + "grad_norm": 3.6581201553344727, + "learning_rate": 2.9840461757613217e-05, + "loss": 0.1158, + "step": 921 + }, + { + "epoch": 0.48526315789473684, + "grad_norm": 1.2672442197799683, + "learning_rate": 2.9840100021918986e-05, + "loss": 1.1773, + "step": 922 + }, + { + "epoch": 0.48578947368421055, + "grad_norm": 11.554649353027344, + "learning_rate": 2.9839737878788276e-05, + "loss": 0.1758, + "step": 923 + }, + { + "epoch": 0.4863157894736842, + "grad_norm": 2.514875888824463, + "learning_rate": 2.9839375328231033e-05, + "loss": 0.6761, + "step": 924 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 1.2187618017196655, + "learning_rate": 2.9839012370257213e-05, + "loss": 0.8572, + "step": 925 + }, + { + "epoch": 0.48736842105263156, + "grad_norm": 11.437232971191406, + "learning_rate": 2.983864900487678e-05, + "loss": 1.4024, + "step": 926 + }, + { + "epoch": 0.48789473684210527, + "grad_norm": 1.1252726316452026, + "learning_rate": 2.9838285232099703e-05, + "loss": 0.8422, + "step": 927 + }, + { + "epoch": 0.4884210526315789, + "grad_norm": 7.090402603149414, + "learning_rate": 2.9837921051935983e-05, + "loss": 1.7886, + "step": 928 + }, + { + "epoch": 0.48894736842105263, + "grad_norm": 0.9771952629089355, + "learning_rate": 2.983755646439561e-05, + "loss": 0.6664, + "step": 929 + }, + { + "epoch": 0.48947368421052634, + "grad_norm": 1.0985069274902344, + "learning_rate": 2.983719146948859e-05, + "loss": 0.8674, + "step": 930 + }, + { + "epoch": 0.49, + "grad_norm": 4.855616092681885, + "learning_rate": 2.9836826067224953e-05, + "loss": 1.08, + "step": 931 + }, + { + "epoch": 0.4905263157894737, + "grad_norm": 4.352982044219971, + "learning_rate": 2.9836460257614726e-05, + "loss": 1.0946, + "step": 932 + }, + { + "epoch": 0.49105263157894735, + "grad_norm": 9.891953468322754, + "learning_rate": 2.9836094040667953e-05, + "loss": 0.5332, + "step": 933 + }, + { + "epoch": 0.49157894736842106, + "grad_norm": 4.198849678039551, + "learning_rate": 2.9835727416394692e-05, + "loss": 0.5912, + "step": 934 + }, + { + "epoch": 0.4921052631578947, + "grad_norm": 3.755746841430664, + "learning_rate": 2.9835360384805004e-05, + "loss": 1.9771, + "step": 935 + }, + { + "epoch": 0.4926315789473684, + "grad_norm": 1.8602410554885864, + "learning_rate": 2.9834992945908966e-05, + "loss": 0.9501, + "step": 936 + }, + { + "epoch": 0.49315789473684213, + "grad_norm": 7.063650608062744, + "learning_rate": 2.9834625099716668e-05, + "loss": 1.1328, + "step": 937 + }, + { + "epoch": 0.4936842105263158, + "grad_norm": 48.79571533203125, + "learning_rate": 2.983425684623821e-05, + "loss": 3.1328, + "step": 938 + }, + { + "epoch": 0.4942105263157895, + "grad_norm": 8.612360954284668, + "learning_rate": 2.98338881854837e-05, + "loss": 0.6469, + "step": 939 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 5.5089898109436035, + "learning_rate": 2.9833519117463263e-05, + "loss": 2.3209, + "step": 940 + }, + { + "epoch": 0.49526315789473685, + "grad_norm": 1.2845944166183472, + "learning_rate": 2.9833149642187026e-05, + "loss": 1.2498, + "step": 941 + }, + { + "epoch": 0.4957894736842105, + "grad_norm": 2.241567373275757, + "learning_rate": 2.9832779759665144e-05, + "loss": 0.1963, + "step": 942 + }, + { + "epoch": 0.4963157894736842, + "grad_norm": 1.6336573362350464, + "learning_rate": 2.983240946990776e-05, + "loss": 0.6764, + "step": 943 + }, + { + "epoch": 0.4968421052631579, + "grad_norm": 2.2285687923431396, + "learning_rate": 2.9832038772925044e-05, + "loss": 1.1649, + "step": 944 + }, + { + "epoch": 0.49736842105263157, + "grad_norm": 1.5614920854568481, + "learning_rate": 2.983166766872718e-05, + "loss": 1.1911, + "step": 945 + }, + { + "epoch": 0.4978947368421053, + "grad_norm": 1.2409616708755493, + "learning_rate": 2.9831296157324348e-05, + "loss": 0.9644, + "step": 946 + }, + { + "epoch": 0.49842105263157893, + "grad_norm": 1.8802260160446167, + "learning_rate": 2.983092423872675e-05, + "loss": 0.6794, + "step": 947 + }, + { + "epoch": 0.49894736842105264, + "grad_norm": 1.1808290481567383, + "learning_rate": 2.98305519129446e-05, + "loss": 1.2996, + "step": 948 + }, + { + "epoch": 0.4994736842105263, + "grad_norm": 1.2290716171264648, + "learning_rate": 2.983017917998812e-05, + "loss": 1.0257, + "step": 949 + }, + { + "epoch": 0.5, + "grad_norm": 1.3215667009353638, + "learning_rate": 2.9829806039867537e-05, + "loss": 1.1965, + "step": 950 + }, + { + "epoch": 0.5, + "eval_loss": 1.0288052558898926, + "eval_runtime": 12.7382, + "eval_samples_per_second": 7.85, + "eval_steps_per_second": 7.85, + "step": 950 + }, + { + "epoch": 0.5005263157894737, + "grad_norm": 1.5079114437103271, + "learning_rate": 2.9829432492593105e-05, + "loss": 1.0393, + "step": 951 + }, + { + "epoch": 0.5010526315789474, + "grad_norm": 6.629293441772461, + "learning_rate": 2.9829058538175076e-05, + "loss": 1.3261, + "step": 952 + }, + { + "epoch": 0.501578947368421, + "grad_norm": 6.566483974456787, + "learning_rate": 2.9828684176623714e-05, + "loss": 1.561, + "step": 953 + }, + { + "epoch": 0.5021052631578947, + "grad_norm": 1.9273375272750854, + "learning_rate": 2.98283094079493e-05, + "loss": 1.0629, + "step": 954 + }, + { + "epoch": 0.5026315789473684, + "grad_norm": 2.6924264430999756, + "learning_rate": 2.9827934232162128e-05, + "loss": 0.967, + "step": 955 + }, + { + "epoch": 0.5031578947368421, + "grad_norm": 1.5618788003921509, + "learning_rate": 2.982755864927249e-05, + "loss": 1.2447, + "step": 956 + }, + { + "epoch": 0.5036842105263157, + "grad_norm": 2.599567174911499, + "learning_rate": 2.98271826592907e-05, + "loss": 0.9259, + "step": 957 + }, + { + "epoch": 0.5042105263157894, + "grad_norm": 5.357890605926514, + "learning_rate": 2.9826806262227082e-05, + "loss": 0.7411, + "step": 958 + }, + { + "epoch": 0.5047368421052632, + "grad_norm": 2.8332183361053467, + "learning_rate": 2.9826429458091968e-05, + "loss": 0.1058, + "step": 959 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 1.5336825847625732, + "learning_rate": 2.982605224689571e-05, + "loss": 0.2564, + "step": 960 + }, + { + "epoch": 0.5057894736842106, + "grad_norm": 3.167874336242676, + "learning_rate": 2.9825674628648657e-05, + "loss": 0.9333, + "step": 961 + }, + { + "epoch": 0.5063157894736842, + "grad_norm": 1.4255472421646118, + "learning_rate": 2.982529660336118e-05, + "loss": 0.0605, + "step": 962 + }, + { + "epoch": 0.5068421052631579, + "grad_norm": 5.443603038787842, + "learning_rate": 2.9824918171043656e-05, + "loss": 0.9617, + "step": 963 + }, + { + "epoch": 0.5073684210526316, + "grad_norm": 1.5958141088485718, + "learning_rate": 2.9824539331706476e-05, + "loss": 1.8712, + "step": 964 + }, + { + "epoch": 0.5078947368421053, + "grad_norm": 1.544083595275879, + "learning_rate": 2.982416008536004e-05, + "loss": 1.9258, + "step": 965 + }, + { + "epoch": 0.508421052631579, + "grad_norm": 9.469457626342773, + "learning_rate": 2.982378043201476e-05, + "loss": 1.3554, + "step": 966 + }, + { + "epoch": 0.5089473684210526, + "grad_norm": 1.1175442934036255, + "learning_rate": 2.9823400371681062e-05, + "loss": 1.5603, + "step": 967 + }, + { + "epoch": 0.5094736842105263, + "grad_norm": 5.4420342445373535, + "learning_rate": 2.9823019904369377e-05, + "loss": 0.5942, + "step": 968 + }, + { + "epoch": 0.51, + "grad_norm": 1.5822449922561646, + "learning_rate": 2.9822639030090156e-05, + "loss": 1.0452, + "step": 969 + }, + { + "epoch": 0.5105263157894737, + "grad_norm": 4.8439483642578125, + "learning_rate": 2.9822257748853846e-05, + "loss": 0.2948, + "step": 970 + }, + { + "epoch": 0.5110526315789473, + "grad_norm": 5.272324085235596, + "learning_rate": 2.982187606067093e-05, + "loss": 1.0793, + "step": 971 + }, + { + "epoch": 0.511578947368421, + "grad_norm": 35.564964294433594, + "learning_rate": 2.9821493965551877e-05, + "loss": 4.9608, + "step": 972 + }, + { + "epoch": 0.5121052631578947, + "grad_norm": 8.41565227508545, + "learning_rate": 2.9821111463507177e-05, + "loss": 0.8292, + "step": 973 + }, + { + "epoch": 0.5126315789473684, + "grad_norm": 1.3578308820724487, + "learning_rate": 2.9820728554547338e-05, + "loss": 0.6588, + "step": 974 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 21.759471893310547, + "learning_rate": 2.9820345238682862e-05, + "loss": 5.2813, + "step": 975 + }, + { + "epoch": 0.5136842105263157, + "grad_norm": 1.089369535446167, + "learning_rate": 2.9819961515924288e-05, + "loss": 0.7576, + "step": 976 + }, + { + "epoch": 0.5142105263157895, + "grad_norm": 1.6790835857391357, + "learning_rate": 2.981957738628214e-05, + "loss": 1.0218, + "step": 977 + }, + { + "epoch": 0.5147368421052632, + "grad_norm": 0.8437327742576599, + "learning_rate": 2.9819192849766965e-05, + "loss": 0.459, + "step": 978 + }, + { + "epoch": 0.5152631578947369, + "grad_norm": 1.327656865119934, + "learning_rate": 2.981880790638933e-05, + "loss": 1.0783, + "step": 979 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 1.32616126537323, + "learning_rate": 2.981842255615979e-05, + "loss": 1.0975, + "step": 980 + }, + { + "epoch": 0.5163157894736842, + "grad_norm": 1.0787245035171509, + "learning_rate": 2.981803679908893e-05, + "loss": 1.1274, + "step": 981 + }, + { + "epoch": 0.5168421052631579, + "grad_norm": 5.349323272705078, + "learning_rate": 2.9817650635187348e-05, + "loss": 0.6531, + "step": 982 + }, + { + "epoch": 0.5173684210526316, + "grad_norm": 6.324141979217529, + "learning_rate": 2.981726406446564e-05, + "loss": 1.4738, + "step": 983 + }, + { + "epoch": 0.5178947368421053, + "grad_norm": 1.0158735513687134, + "learning_rate": 2.9816877086934416e-05, + "loss": 0.7424, + "step": 984 + }, + { + "epoch": 0.5184210526315789, + "grad_norm": 1.3030285835266113, + "learning_rate": 2.9816489702604307e-05, + "loss": 1.2574, + "step": 985 + }, + { + "epoch": 0.5189473684210526, + "grad_norm": 1.0021758079528809, + "learning_rate": 2.9816101911485944e-05, + "loss": 0.9003, + "step": 986 + }, + { + "epoch": 0.5194736842105263, + "grad_norm": 2.6594042778015137, + "learning_rate": 2.981571371358998e-05, + "loss": 0.4392, + "step": 987 + }, + { + "epoch": 0.52, + "grad_norm": 4.258882999420166, + "learning_rate": 2.981532510892707e-05, + "loss": 0.3734, + "step": 988 + }, + { + "epoch": 0.5205263157894737, + "grad_norm": 4.136307239532471, + "learning_rate": 2.9814936097507878e-05, + "loss": 1.9032, + "step": 989 + }, + { + "epoch": 0.5210526315789473, + "grad_norm": 4.121081352233887, + "learning_rate": 2.981454667934309e-05, + "loss": 1.5468, + "step": 990 + }, + { + "epoch": 0.521578947368421, + "grad_norm": 6.502449035644531, + "learning_rate": 2.9814156854443394e-05, + "loss": 1.3444, + "step": 991 + }, + { + "epoch": 0.5221052631578947, + "grad_norm": 9.599536895751953, + "learning_rate": 2.9813766622819494e-05, + "loss": 0.8619, + "step": 992 + }, + { + "epoch": 0.5226315789473684, + "grad_norm": 5.463332653045654, + "learning_rate": 2.9813375984482108e-05, + "loss": 0.8028, + "step": 993 + }, + { + "epoch": 0.5231578947368422, + "grad_norm": 2.675305128097534, + "learning_rate": 2.9812984939441955e-05, + "loss": 0.1877, + "step": 994 + }, + { + "epoch": 0.5236842105263158, + "grad_norm": 1.6895674467086792, + "learning_rate": 2.9812593487709778e-05, + "loss": 1.4556, + "step": 995 + }, + { + "epoch": 0.5242105263157895, + "grad_norm": 1.0445210933685303, + "learning_rate": 2.9812201629296313e-05, + "loss": 0.9134, + "step": 996 + }, + { + "epoch": 0.5247368421052632, + "grad_norm": 4.284270286560059, + "learning_rate": 2.9811809364212332e-05, + "loss": 0.7209, + "step": 997 + }, + { + "epoch": 0.5252631578947369, + "grad_norm": 3.461217164993286, + "learning_rate": 2.98114166924686e-05, + "loss": 0.4954, + "step": 998 + }, + { + "epoch": 0.5257894736842105, + "grad_norm": 4.461343288421631, + "learning_rate": 2.981102361407589e-05, + "loss": 1.5761, + "step": 999 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.5852874517440796, + "learning_rate": 2.9810630129045003e-05, + "loss": 1.1905, + "step": 1000 + }, + { + "epoch": 0.5268421052631579, + "grad_norm": 2.101400375366211, + "learning_rate": 2.9810236237386736e-05, + "loss": 1.3343, + "step": 1001 + }, + { + "epoch": 0.5273684210526316, + "grad_norm": 10.774019241333008, + "learning_rate": 2.980984193911191e-05, + "loss": 1.6976, + "step": 1002 + }, + { + "epoch": 0.5278947368421053, + "grad_norm": 1.1593283414840698, + "learning_rate": 2.9809447234231347e-05, + "loss": 0.9587, + "step": 1003 + }, + { + "epoch": 0.5284210526315789, + "grad_norm": 5.7743754386901855, + "learning_rate": 2.9809052122755885e-05, + "loss": 0.9899, + "step": 1004 + }, + { + "epoch": 0.5289473684210526, + "grad_norm": 1.3892626762390137, + "learning_rate": 2.9808656604696368e-05, + "loss": 1.1057, + "step": 1005 + }, + { + "epoch": 0.5294736842105263, + "grad_norm": 3.0294156074523926, + "learning_rate": 2.980826068006366e-05, + "loss": 0.6647, + "step": 1006 + }, + { + "epoch": 0.53, + "grad_norm": 4.9338483810424805, + "learning_rate": 2.9807864348868627e-05, + "loss": 1.9628, + "step": 1007 + }, + { + "epoch": 0.5305263157894737, + "grad_norm": 4.924473762512207, + "learning_rate": 2.980746761112215e-05, + "loss": 0.7927, + "step": 1008 + }, + { + "epoch": 0.5310526315789473, + "grad_norm": 10.412948608398438, + "learning_rate": 2.980707046683513e-05, + "loss": 1.1254, + "step": 1009 + }, + { + "epoch": 0.531578947368421, + "grad_norm": 9.721678733825684, + "learning_rate": 2.980667291601846e-05, + "loss": 1.9085, + "step": 1010 + }, + { + "epoch": 0.5321052631578947, + "grad_norm": 3.1283535957336426, + "learning_rate": 2.980627495868306e-05, + "loss": 1.4154, + "step": 1011 + }, + { + "epoch": 0.5326315789473685, + "grad_norm": 3.373187303543091, + "learning_rate": 2.980587659483985e-05, + "loss": 1.3, + "step": 1012 + }, + { + "epoch": 0.533157894736842, + "grad_norm": 8.386382102966309, + "learning_rate": 2.9805477824499782e-05, + "loss": 0.2979, + "step": 1013 + }, + { + "epoch": 0.5336842105263158, + "grad_norm": 2.241272211074829, + "learning_rate": 2.980507864767379e-05, + "loss": 0.2733, + "step": 1014 + }, + { + "epoch": 0.5342105263157895, + "grad_norm": 1.518618106842041, + "learning_rate": 2.9804679064372836e-05, + "loss": 1.8392, + "step": 1015 + }, + { + "epoch": 0.5347368421052632, + "grad_norm": 1.0135964155197144, + "learning_rate": 2.9804279074607893e-05, + "loss": 1.0983, + "step": 1016 + }, + { + "epoch": 0.5352631578947369, + "grad_norm": 1.5758641958236694, + "learning_rate": 2.9803878678389942e-05, + "loss": 1.0834, + "step": 1017 + }, + { + "epoch": 0.5357894736842105, + "grad_norm": 3.014568328857422, + "learning_rate": 2.9803477875729977e-05, + "loss": 0.4795, + "step": 1018 + }, + { + "epoch": 0.5363157894736842, + "grad_norm": 1.2805556058883667, + "learning_rate": 2.9803076666639e-05, + "loss": 1.2692, + "step": 1019 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 5.261938571929932, + "learning_rate": 2.9802675051128027e-05, + "loss": 0.8414, + "step": 1020 + }, + { + "epoch": 0.5373684210526316, + "grad_norm": 3.743300676345825, + "learning_rate": 2.9802273029208085e-05, + "loss": 1.3266, + "step": 1021 + }, + { + "epoch": 0.5378947368421053, + "grad_norm": 4.708719253540039, + "learning_rate": 2.9801870600890216e-05, + "loss": 0.132, + "step": 1022 + }, + { + "epoch": 0.5384210526315789, + "grad_norm": 6.205244064331055, + "learning_rate": 2.9801467766185457e-05, + "loss": 0.2055, + "step": 1023 + }, + { + "epoch": 0.5389473684210526, + "grad_norm": 11.566554069519043, + "learning_rate": 2.980106452510488e-05, + "loss": 0.1921, + "step": 1024 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 2.550504446029663, + "learning_rate": 2.9800660877659546e-05, + "loss": 0.6581, + "step": 1025 + }, + { + "epoch": 0.54, + "grad_norm": 6.586308002471924, + "learning_rate": 2.9800256823860548e-05, + "loss": 0.2069, + "step": 1026 + }, + { + "epoch": 0.5405263157894736, + "grad_norm": 2.6851885318756104, + "learning_rate": 2.9799852363718968e-05, + "loss": 1.5168, + "step": 1027 + }, + { + "epoch": 0.5410526315789473, + "grad_norm": 1.1070444583892822, + "learning_rate": 2.979944749724592e-05, + "loss": 0.8637, + "step": 1028 + }, + { + "epoch": 0.541578947368421, + "grad_norm": 0.5393061637878418, + "learning_rate": 2.979904222445252e-05, + "loss": 0.0123, + "step": 1029 + }, + { + "epoch": 0.5421052631578948, + "grad_norm": 4.805749416351318, + "learning_rate": 2.979863654534988e-05, + "loss": 0.6532, + "step": 1030 + }, + { + "epoch": 0.5426315789473685, + "grad_norm": 1.2212395668029785, + "learning_rate": 2.9798230459949154e-05, + "loss": 1.0349, + "step": 1031 + }, + { + "epoch": 0.5431578947368421, + "grad_norm": 1.0666829347610474, + "learning_rate": 2.9797823968261483e-05, + "loss": 0.816, + "step": 1032 + }, + { + "epoch": 0.5436842105263158, + "grad_norm": 1.6240037679672241, + "learning_rate": 2.9797417070298033e-05, + "loss": 1.1863, + "step": 1033 + }, + { + "epoch": 0.5442105263157895, + "grad_norm": 80.0791244506836, + "learning_rate": 2.979700976606997e-05, + "loss": 3.8047, + "step": 1034 + }, + { + "epoch": 0.5447368421052632, + "grad_norm": 7.121600151062012, + "learning_rate": 2.979660205558848e-05, + "loss": 1.8238, + "step": 1035 + }, + { + "epoch": 0.5452631578947369, + "grad_norm": 5.182654857635498, + "learning_rate": 2.9796193938864758e-05, + "loss": 1.7294, + "step": 1036 + }, + { + "epoch": 0.5457894736842105, + "grad_norm": 1.5659658908843994, + "learning_rate": 2.9795785415910002e-05, + "loss": 1.3367, + "step": 1037 + }, + { + "epoch": 0.5463157894736842, + "grad_norm": 2.371896266937256, + "learning_rate": 2.979537648673544e-05, + "loss": 0.6634, + "step": 1038 + }, + { + "epoch": 0.5468421052631579, + "grad_norm": 6.189696311950684, + "learning_rate": 2.979496715135228e-05, + "loss": 1.1687, + "step": 1039 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 3.078950881958008, + "learning_rate": 2.9794557409771775e-05, + "loss": 0.2819, + "step": 1040 + }, + { + "epoch": 0.5478947368421052, + "grad_norm": 1.354801893234253, + "learning_rate": 2.9794147262005182e-05, + "loss": 1.0736, + "step": 1041 + }, + { + "epoch": 0.5484210526315789, + "grad_norm": 7.661222457885742, + "learning_rate": 2.9793736708063735e-05, + "loss": 0.757, + "step": 1042 + }, + { + "epoch": 0.5489473684210526, + "grad_norm": 1.1365411281585693, + "learning_rate": 2.9793325747958736e-05, + "loss": 0.9885, + "step": 1043 + }, + { + "epoch": 0.5494736842105263, + "grad_norm": 2.5599515438079834, + "learning_rate": 2.9792914381701448e-05, + "loss": 1.5999, + "step": 1044 + }, + { + "epoch": 0.55, + "grad_norm": 4.250511169433594, + "learning_rate": 2.979250260930317e-05, + "loss": 0.8261, + "step": 1045 + }, + { + "epoch": 0.5505263157894736, + "grad_norm": 5.77577543258667, + "learning_rate": 2.9792090430775212e-05, + "loss": 1.4922, + "step": 1046 + }, + { + "epoch": 0.5510526315789473, + "grad_norm": 10.645989418029785, + "learning_rate": 2.979167784612888e-05, + "loss": 1.5039, + "step": 1047 + }, + { + "epoch": 0.5515789473684211, + "grad_norm": 22.560731887817383, + "learning_rate": 2.9791264855375516e-05, + "loss": 1.1108, + "step": 1048 + }, + { + "epoch": 0.5521052631578948, + "grad_norm": 0.9451678395271301, + "learning_rate": 2.9790851458526445e-05, + "loss": 1.0083, + "step": 1049 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 1.7276222705841064, + "learning_rate": 2.9790437655593025e-05, + "loss": 1.2542, + "step": 1050 + }, + { + "epoch": 0.5531578947368421, + "grad_norm": 12.997016906738281, + "learning_rate": 2.9790023446586616e-05, + "loss": 1.5738, + "step": 1051 + }, + { + "epoch": 0.5536842105263158, + "grad_norm": 5.94761848449707, + "learning_rate": 2.9789608831518585e-05, + "loss": 0.5001, + "step": 1052 + }, + { + "epoch": 0.5542105263157895, + "grad_norm": 3.714102029800415, + "learning_rate": 2.9789193810400324e-05, + "loss": 0.4654, + "step": 1053 + }, + { + "epoch": 0.5547368421052632, + "grad_norm": 1.2676738500595093, + "learning_rate": 2.978877838324322e-05, + "loss": 0.9719, + "step": 1054 + }, + { + "epoch": 0.5552631578947368, + "grad_norm": 4.261148452758789, + "learning_rate": 2.9788362550058683e-05, + "loss": 0.481, + "step": 1055 + }, + { + "epoch": 0.5557894736842105, + "grad_norm": 2.568772077560425, + "learning_rate": 2.9787946310858126e-05, + "loss": 0.2567, + "step": 1056 + }, + { + "epoch": 0.5563157894736842, + "grad_norm": 17.453506469726562, + "learning_rate": 2.9787529665652983e-05, + "loss": 1.1513, + "step": 1057 + }, + { + "epoch": 0.5568421052631579, + "grad_norm": 4.880800724029541, + "learning_rate": 2.9787112614454682e-05, + "loss": 0.7086, + "step": 1058 + }, + { + "epoch": 0.5573684210526316, + "grad_norm": 2.3661293983459473, + "learning_rate": 2.9786695157274686e-05, + "loss": 0.3181, + "step": 1059 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 1.5389021635055542, + "learning_rate": 2.9786277294124446e-05, + "loss": 1.1855, + "step": 1060 + }, + { + "epoch": 0.5584210526315789, + "grad_norm": 1.2657150030136108, + "learning_rate": 2.978585902501544e-05, + "loss": 1.0923, + "step": 1061 + }, + { + "epoch": 0.5589473684210526, + "grad_norm": 1.9125850200653076, + "learning_rate": 2.9785440349959154e-05, + "loss": 1.5275, + "step": 1062 + }, + { + "epoch": 0.5594736842105263, + "grad_norm": 1.2199132442474365, + "learning_rate": 2.978502126896708e-05, + "loss": 0.9496, + "step": 1063 + }, + { + "epoch": 0.56, + "grad_norm": 0.9119816422462463, + "learning_rate": 2.9784601782050716e-05, + "loss": 1.1481, + "step": 1064 + }, + { + "epoch": 0.5605263157894737, + "grad_norm": 1.264197587966919, + "learning_rate": 2.9784181889221588e-05, + "loss": 1.0066, + "step": 1065 + }, + { + "epoch": 0.5610526315789474, + "grad_norm": 1.3065340518951416, + "learning_rate": 2.978376159049123e-05, + "loss": 1.5019, + "step": 1066 + }, + { + "epoch": 0.5615789473684211, + "grad_norm": 5.354816913604736, + "learning_rate": 2.978334088587117e-05, + "loss": 1.2623, + "step": 1067 + }, + { + "epoch": 0.5621052631578948, + "grad_norm": 1.1491811275482178, + "learning_rate": 2.9782919775372958e-05, + "loss": 0.9462, + "step": 1068 + }, + { + "epoch": 0.5626315789473684, + "grad_norm": 1.9805035591125488, + "learning_rate": 2.9782498259008163e-05, + "loss": 0.8934, + "step": 1069 + }, + { + "epoch": 0.5631578947368421, + "grad_norm": 6.078151702880859, + "learning_rate": 2.9782076336788355e-05, + "loss": 2.088, + "step": 1070 + }, + { + "epoch": 0.5636842105263158, + "grad_norm": 1.2258517742156982, + "learning_rate": 2.9781654008725118e-05, + "loss": 1.2418, + "step": 1071 + }, + { + "epoch": 0.5642105263157895, + "grad_norm": 4.132420539855957, + "learning_rate": 2.9781231274830045e-05, + "loss": 1.0597, + "step": 1072 + }, + { + "epoch": 0.5647368421052632, + "grad_norm": 2.6240031719207764, + "learning_rate": 2.9780808135114745e-05, + "loss": 1.8114, + "step": 1073 + }, + { + "epoch": 0.5652631578947368, + "grad_norm": 5.4064154624938965, + "learning_rate": 2.9780384589590832e-05, + "loss": 1.7053, + "step": 1074 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 1.50705885887146, + "learning_rate": 2.9779960638269944e-05, + "loss": 1.2326, + "step": 1075 + }, + { + "epoch": 0.5663157894736842, + "grad_norm": 4.527035713195801, + "learning_rate": 2.9779536281163707e-05, + "loss": 1.0151, + "step": 1076 + }, + { + "epoch": 0.5668421052631579, + "grad_norm": 2.740678071975708, + "learning_rate": 2.9779111518283778e-05, + "loss": 1.3425, + "step": 1077 + }, + { + "epoch": 0.5673684210526316, + "grad_norm": 1.8033603429794312, + "learning_rate": 2.977868634964182e-05, + "loss": 1.4094, + "step": 1078 + }, + { + "epoch": 0.5678947368421052, + "grad_norm": 2.819211483001709, + "learning_rate": 2.9778260775249507e-05, + "loss": 0.5859, + "step": 1079 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 2.2031707763671875, + "learning_rate": 2.9777834795118516e-05, + "loss": 1.6467, + "step": 1080 + }, + { + "epoch": 0.5689473684210526, + "grad_norm": 1.0411590337753296, + "learning_rate": 2.9777408409260556e-05, + "loss": 1.1856, + "step": 1081 + }, + { + "epoch": 0.5694736842105264, + "grad_norm": 1.240462303161621, + "learning_rate": 2.977698161768732e-05, + "loss": 0.9601, + "step": 1082 + }, + { + "epoch": 0.57, + "grad_norm": 1.4794399738311768, + "learning_rate": 2.977655442041053e-05, + "loss": 0.6428, + "step": 1083 + }, + { + "epoch": 0.5705263157894737, + "grad_norm": 3.423283576965332, + "learning_rate": 2.9776126817441918e-05, + "loss": 0.5233, + "step": 1084 + }, + { + "epoch": 0.5710526315789474, + "grad_norm": 1.0337262153625488, + "learning_rate": 2.9775698808793216e-05, + "loss": 1.2001, + "step": 1085 + }, + { + "epoch": 0.5715789473684211, + "grad_norm": 2.4375438690185547, + "learning_rate": 2.9775270394476187e-05, + "loss": 0.8193, + "step": 1086 + }, + { + "epoch": 0.5721052631578948, + "grad_norm": 1.048431158065796, + "learning_rate": 2.9774841574502584e-05, + "loss": 1.0908, + "step": 1087 + }, + { + "epoch": 0.5726315789473684, + "grad_norm": 1.6780253648757935, + "learning_rate": 2.9774412348884184e-05, + "loss": 1.7247, + "step": 1088 + }, + { + "epoch": 0.5731578947368421, + "grad_norm": 3.6449801921844482, + "learning_rate": 2.9773982717632768e-05, + "loss": 1.8992, + "step": 1089 + }, + { + "epoch": 0.5736842105263158, + "grad_norm": 2.392915725708008, + "learning_rate": 2.9773552680760133e-05, + "loss": 0.5871, + "step": 1090 + }, + { + "epoch": 0.5742105263157895, + "grad_norm": 25.228097915649414, + "learning_rate": 2.9773122238278088e-05, + "loss": 1.6074, + "step": 1091 + }, + { + "epoch": 0.5747368421052632, + "grad_norm": 11.282571792602539, + "learning_rate": 2.9772691390198452e-05, + "loss": 1.8768, + "step": 1092 + }, + { + "epoch": 0.5752631578947368, + "grad_norm": 6.598527908325195, + "learning_rate": 2.9772260136533048e-05, + "loss": 1.2115, + "step": 1093 + }, + { + "epoch": 0.5757894736842105, + "grad_norm": 0.990164577960968, + "learning_rate": 2.977182847729372e-05, + "loss": 1.0003, + "step": 1094 + }, + { + "epoch": 0.5763157894736842, + "grad_norm": 2.4671967029571533, + "learning_rate": 2.9771396412492316e-05, + "loss": 0.0873, + "step": 1095 + }, + { + "epoch": 0.5768421052631579, + "grad_norm": 2.4675517082214355, + "learning_rate": 2.977096394214071e-05, + "loss": 1.2669, + "step": 1096 + }, + { + "epoch": 0.5773684210526315, + "grad_norm": 1.001388430595398, + "learning_rate": 2.9770531066250754e-05, + "loss": 1.3442, + "step": 1097 + }, + { + "epoch": 0.5778947368421052, + "grad_norm": 2.501490831375122, + "learning_rate": 2.9770097784834352e-05, + "loss": 0.8013, + "step": 1098 + }, + { + "epoch": 0.578421052631579, + "grad_norm": 1.80197274684906, + "learning_rate": 2.9769664097903394e-05, + "loss": 1.3928, + "step": 1099 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 1.4630690813064575, + "learning_rate": 2.9769230005469785e-05, + "loss": 1.5564, + "step": 1100 + }, + { + "epoch": 0.5794736842105264, + "grad_norm": 2.549288034439087, + "learning_rate": 2.9768795507545444e-05, + "loss": 0.8852, + "step": 1101 + }, + { + "epoch": 0.58, + "grad_norm": 3.0548112392425537, + "learning_rate": 2.97683606041423e-05, + "loss": 1.2653, + "step": 1102 + }, + { + "epoch": 0.5805263157894737, + "grad_norm": 2.3279740810394287, + "learning_rate": 2.9767925295272292e-05, + "loss": 1.3083, + "step": 1103 + }, + { + "epoch": 0.5810526315789474, + "grad_norm": 1.0182585716247559, + "learning_rate": 2.9767489580947375e-05, + "loss": 0.0161, + "step": 1104 + }, + { + "epoch": 0.5815789473684211, + "grad_norm": 2.9629054069519043, + "learning_rate": 2.976705346117951e-05, + "loss": 1.6623, + "step": 1105 + }, + { + "epoch": 0.5821052631578948, + "grad_norm": 5.767422199249268, + "learning_rate": 2.9766616935980668e-05, + "loss": 0.9786, + "step": 1106 + }, + { + "epoch": 0.5826315789473684, + "grad_norm": 4.677647590637207, + "learning_rate": 2.9766180005362835e-05, + "loss": 0.3723, + "step": 1107 + }, + { + "epoch": 0.5831578947368421, + "grad_norm": 1.2974311113357544, + "learning_rate": 2.9765742669338012e-05, + "loss": 0.793, + "step": 1108 + }, + { + "epoch": 0.5836842105263158, + "grad_norm": 1.323523759841919, + "learning_rate": 2.97653049279182e-05, + "loss": 1.2581, + "step": 1109 + }, + { + "epoch": 0.5842105263157895, + "grad_norm": 2.9345805644989014, + "learning_rate": 2.9764866781115417e-05, + "loss": 0.7428, + "step": 1110 + }, + { + "epoch": 0.5847368421052631, + "grad_norm": 8.754313468933105, + "learning_rate": 2.9764428228941702e-05, + "loss": 1.5404, + "step": 1111 + }, + { + "epoch": 0.5852631578947368, + "grad_norm": 5.4265875816345215, + "learning_rate": 2.976398927140908e-05, + "loss": 2.3934, + "step": 1112 + }, + { + "epoch": 0.5857894736842105, + "grad_norm": 3.4678874015808105, + "learning_rate": 2.9763549908529614e-05, + "loss": 1.079, + "step": 1113 + }, + { + "epoch": 0.5863157894736842, + "grad_norm": 9.305590629577637, + "learning_rate": 2.9763110140315365e-05, + "loss": 1.6112, + "step": 1114 + }, + { + "epoch": 0.5868421052631579, + "grad_norm": 2.989252805709839, + "learning_rate": 2.9762669966778406e-05, + "loss": 0.6553, + "step": 1115 + }, + { + "epoch": 0.5873684210526315, + "grad_norm": 1.2564352750778198, + "learning_rate": 2.9762229387930817e-05, + "loss": 1.0478, + "step": 1116 + }, + { + "epoch": 0.5878947368421052, + "grad_norm": 2.9239699840545654, + "learning_rate": 2.9761788403784703e-05, + "loss": 1.4082, + "step": 1117 + }, + { + "epoch": 0.588421052631579, + "grad_norm": 2.38325834274292, + "learning_rate": 2.9761347014352168e-05, + "loss": 1.1565, + "step": 1118 + }, + { + "epoch": 0.5889473684210527, + "grad_norm": 4.686885356903076, + "learning_rate": 2.9760905219645325e-05, + "loss": 1.3522, + "step": 1119 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 1.1427439451217651, + "learning_rate": 2.976046301967631e-05, + "loss": 0.7014, + "step": 1120 + }, + { + "epoch": 0.59, + "grad_norm": 5.191329002380371, + "learning_rate": 2.976002041445726e-05, + "loss": 1.1254, + "step": 1121 + }, + { + "epoch": 0.5905263157894737, + "grad_norm": 1.5960139036178589, + "learning_rate": 2.9759577404000332e-05, + "loss": 1.5535, + "step": 1122 + }, + { + "epoch": 0.5910526315789474, + "grad_norm": 1.104758858680725, + "learning_rate": 2.975913398831768e-05, + "loss": 1.2053, + "step": 1123 + }, + { + "epoch": 0.5915789473684211, + "grad_norm": 8.011280059814453, + "learning_rate": 2.9758690167421486e-05, + "loss": 1.5689, + "step": 1124 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 2.1890242099761963, + "learning_rate": 2.9758245941323932e-05, + "loss": 1.3737, + "step": 1125 + }, + { + "epoch": 0.5926315789473684, + "grad_norm": 5.8035502433776855, + "learning_rate": 2.9757801310037218e-05, + "loss": 1.4497, + "step": 1126 + }, + { + "epoch": 0.5931578947368421, + "grad_norm": 3.329540967941284, + "learning_rate": 2.9757356273573543e-05, + "loss": 0.3897, + "step": 1127 + }, + { + "epoch": 0.5936842105263158, + "grad_norm": 3.944429874420166, + "learning_rate": 2.9756910831945136e-05, + "loss": 1.0921, + "step": 1128 + }, + { + "epoch": 0.5942105263157895, + "grad_norm": 3.396190643310547, + "learning_rate": 2.9756464985164214e-05, + "loss": 0.4456, + "step": 1129 + }, + { + "epoch": 0.5947368421052631, + "grad_norm": 1.586104393005371, + "learning_rate": 2.9756018733243033e-05, + "loss": 1.401, + "step": 1130 + }, + { + "epoch": 0.5952631578947368, + "grad_norm": 1.2480674982070923, + "learning_rate": 2.9755572076193833e-05, + "loss": 1.3656, + "step": 1131 + }, + { + "epoch": 0.5957894736842105, + "grad_norm": 1.0409289598464966, + "learning_rate": 2.9755125014028876e-05, + "loss": 1.1957, + "step": 1132 + }, + { + "epoch": 0.5963157894736842, + "grad_norm": 1.0716015100479126, + "learning_rate": 2.9754677546760444e-05, + "loss": 0.9709, + "step": 1133 + }, + { + "epoch": 0.5968421052631578, + "grad_norm": 1.9948137998580933, + "learning_rate": 2.9754229674400822e-05, + "loss": 1.1416, + "step": 1134 + }, + { + "epoch": 0.5973684210526315, + "grad_norm": 1.064031720161438, + "learning_rate": 2.9753781396962303e-05, + "loss": 0.9325, + "step": 1135 + }, + { + "epoch": 0.5978947368421053, + "grad_norm": 2.7508461475372314, + "learning_rate": 2.9753332714457193e-05, + "loss": 0.1521, + "step": 1136 + }, + { + "epoch": 0.598421052631579, + "grad_norm": 2.104701519012451, + "learning_rate": 2.9752883626897813e-05, + "loss": 1.3733, + "step": 1137 + }, + { + "epoch": 0.5989473684210527, + "grad_norm": 1.180884838104248, + "learning_rate": 2.9752434134296494e-05, + "loss": 1.0614, + "step": 1138 + }, + { + "epoch": 0.5994736842105263, + "grad_norm": 1.3136661052703857, + "learning_rate": 2.9751984236665578e-05, + "loss": 0.761, + "step": 1139 + }, + { + "epoch": 0.6, + "grad_norm": 5.025672435760498, + "learning_rate": 2.975153393401741e-05, + "loss": 0.8013, + "step": 1140 + }, + { + "epoch": 0.6005263157894737, + "grad_norm": 0.5844347476959229, + "learning_rate": 2.975108322636436e-05, + "loss": 0.0122, + "step": 1141 + }, + { + "epoch": 0.6010526315789474, + "grad_norm": 1.406011700630188, + "learning_rate": 2.9750632113718795e-05, + "loss": 1.2908, + "step": 1142 + }, + { + "epoch": 0.6015789473684211, + "grad_norm": 51.026851654052734, + "learning_rate": 2.9750180596093108e-05, + "loss": 2.5605, + "step": 1143 + }, + { + "epoch": 0.6021052631578947, + "grad_norm": 1.4514580965042114, + "learning_rate": 2.9749728673499695e-05, + "loss": 1.0439, + "step": 1144 + }, + { + "epoch": 0.6026315789473684, + "grad_norm": 1.7348302602767944, + "learning_rate": 2.9749276345950957e-05, + "loss": 1.8712, + "step": 1145 + }, + { + "epoch": 0.6031578947368421, + "grad_norm": 2.1238996982574463, + "learning_rate": 2.974882361345932e-05, + "loss": 1.0086, + "step": 1146 + }, + { + "epoch": 0.6036842105263158, + "grad_norm": 3.6767418384552, + "learning_rate": 2.974837047603721e-05, + "loss": 0.2294, + "step": 1147 + }, + { + "epoch": 0.6042105263157894, + "grad_norm": 7.874570369720459, + "learning_rate": 2.9747916933697066e-05, + "loss": 2.2632, + "step": 1148 + }, + { + "epoch": 0.6047368421052631, + "grad_norm": 8.193496704101562, + "learning_rate": 2.9747462986451347e-05, + "loss": 1.793, + "step": 1149 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 1.6370347738265991, + "learning_rate": 2.9747008634312508e-05, + "loss": 1.4331, + "step": 1150 + }, + { + "epoch": 0.6057894736842105, + "grad_norm": 2.3346762657165527, + "learning_rate": 2.9746553877293025e-05, + "loss": 1.4356, + "step": 1151 + }, + { + "epoch": 0.6063157894736843, + "grad_norm": 3.7243125438690186, + "learning_rate": 2.9746098715405394e-05, + "loss": 1.8991, + "step": 1152 + }, + { + "epoch": 0.6068421052631578, + "grad_norm": 4.2178168296813965, + "learning_rate": 2.9745643148662095e-05, + "loss": 1.3015, + "step": 1153 + }, + { + "epoch": 0.6073684210526316, + "grad_norm": 2.0063540935516357, + "learning_rate": 2.9745187177075645e-05, + "loss": 1.0399, + "step": 1154 + }, + { + "epoch": 0.6078947368421053, + "grad_norm": 3.9274532794952393, + "learning_rate": 2.9744730800658563e-05, + "loss": 0.9394, + "step": 1155 + }, + { + "epoch": 0.608421052631579, + "grad_norm": 0.9116756319999695, + "learning_rate": 2.9744274019423375e-05, + "loss": 0.7879, + "step": 1156 + }, + { + "epoch": 0.6089473684210527, + "grad_norm": 1.725326418876648, + "learning_rate": 2.9743816833382627e-05, + "loss": 1.2359, + "step": 1157 + }, + { + "epoch": 0.6094736842105263, + "grad_norm": 2.4110519886016846, + "learning_rate": 2.9743359242548865e-05, + "loss": 1.0438, + "step": 1158 + }, + { + "epoch": 0.61, + "grad_norm": 3.515199661254883, + "learning_rate": 2.9742901246934657e-05, + "loss": 1.0046, + "step": 1159 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 3.636535167694092, + "learning_rate": 2.9742442846552578e-05, + "loss": 1.0394, + "step": 1160 + }, + { + "epoch": 0.6110526315789474, + "grad_norm": 8.082115173339844, + "learning_rate": 2.974198404141521e-05, + "loss": 0.8219, + "step": 1161 + }, + { + "epoch": 0.611578947368421, + "grad_norm": 1.6529840230941772, + "learning_rate": 2.9741524831535147e-05, + "loss": 0.0603, + "step": 1162 + }, + { + "epoch": 0.6121052631578947, + "grad_norm": 1.2058558464050293, + "learning_rate": 2.9741065216925002e-05, + "loss": 0.8499, + "step": 1163 + }, + { + "epoch": 0.6126315789473684, + "grad_norm": 1.5454597473144531, + "learning_rate": 2.97406051975974e-05, + "loss": 0.7998, + "step": 1164 + }, + { + "epoch": 0.6131578947368421, + "grad_norm": 1.1322457790374756, + "learning_rate": 2.9740144773564952e-05, + "loss": 1.1103, + "step": 1165 + }, + { + "epoch": 0.6136842105263158, + "grad_norm": 1.2677356004714966, + "learning_rate": 2.9739683944840315e-05, + "loss": 0.7271, + "step": 1166 + }, + { + "epoch": 0.6142105263157894, + "grad_norm": 29.934062957763672, + "learning_rate": 2.9739222711436137e-05, + "loss": 2.5771, + "step": 1167 + }, + { + "epoch": 0.6147368421052631, + "grad_norm": 1.108261227607727, + "learning_rate": 2.973876107336508e-05, + "loss": 1.017, + "step": 1168 + }, + { + "epoch": 0.6152631578947368, + "grad_norm": 4.538265705108643, + "learning_rate": 2.9738299030639818e-05, + "loss": 1.3243, + "step": 1169 + }, + { + "epoch": 0.6157894736842106, + "grad_norm": 1.448798418045044, + "learning_rate": 2.9737836583273037e-05, + "loss": 1.1748, + "step": 1170 + }, + { + "epoch": 0.6163157894736843, + "grad_norm": 1.834809422492981, + "learning_rate": 2.9737373731277432e-05, + "loss": 1.1335, + "step": 1171 + }, + { + "epoch": 0.6168421052631579, + "grad_norm": 1.8159127235412598, + "learning_rate": 2.9736910474665714e-05, + "loss": 1.3667, + "step": 1172 + }, + { + "epoch": 0.6173684210526316, + "grad_norm": 1.2512363195419312, + "learning_rate": 2.9736446813450603e-05, + "loss": 1.1666, + "step": 1173 + }, + { + "epoch": 0.6178947368421053, + "grad_norm": 13.280810356140137, + "learning_rate": 2.9735982747644817e-05, + "loss": 1.3808, + "step": 1174 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 10.25016975402832, + "learning_rate": 2.9735518277261113e-05, + "loss": 0.7421, + "step": 1175 + }, + { + "epoch": 0.6189473684210526, + "grad_norm": 2.516862630844116, + "learning_rate": 2.9735053402312234e-05, + "loss": 1.9148, + "step": 1176 + }, + { + "epoch": 0.6194736842105263, + "grad_norm": 1.1309010982513428, + "learning_rate": 2.973458812281095e-05, + "loss": 1.2431, + "step": 1177 + }, + { + "epoch": 0.62, + "grad_norm": 2.400505542755127, + "learning_rate": 2.9734122438770023e-05, + "loss": 1.3809, + "step": 1178 + }, + { + "epoch": 0.6205263157894737, + "grad_norm": 1.2216683626174927, + "learning_rate": 2.9733656350202248e-05, + "loss": 0.7622, + "step": 1179 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 1.942233920097351, + "learning_rate": 2.973318985712042e-05, + "loss": 1.2894, + "step": 1180 + }, + { + "epoch": 0.621578947368421, + "grad_norm": 3.7196786403656006, + "learning_rate": 2.9732722959537345e-05, + "loss": 0.9568, + "step": 1181 + }, + { + "epoch": 0.6221052631578947, + "grad_norm": 1.3381714820861816, + "learning_rate": 2.973225565746585e-05, + "loss": 1.1391, + "step": 1182 + }, + { + "epoch": 0.6226315789473684, + "grad_norm": 3.379469871520996, + "learning_rate": 2.973178795091875e-05, + "loss": 1.4507, + "step": 1183 + }, + { + "epoch": 0.6231578947368421, + "grad_norm": 4.584420680999756, + "learning_rate": 2.9731319839908895e-05, + "loss": 0.5189, + "step": 1184 + }, + { + "epoch": 0.6236842105263158, + "grad_norm": 1.164870023727417, + "learning_rate": 2.9730851324449133e-05, + "loss": 1.2698, + "step": 1185 + }, + { + "epoch": 0.6242105263157894, + "grad_norm": 2.1792705059051514, + "learning_rate": 2.9730382404552334e-05, + "loss": 0.9649, + "step": 1186 + }, + { + "epoch": 0.6247368421052631, + "grad_norm": 10.801166534423828, + "learning_rate": 2.9729913080231364e-05, + "loss": 1.006, + "step": 1187 + }, + { + "epoch": 0.6252631578947369, + "grad_norm": 6.744489669799805, + "learning_rate": 2.9729443351499118e-05, + "loss": 1.5334, + "step": 1188 + }, + { + "epoch": 0.6257894736842106, + "grad_norm": 0.9813660979270935, + "learning_rate": 2.9728973218368477e-05, + "loss": 0.8807, + "step": 1189 + }, + { + "epoch": 0.6263157894736842, + "grad_norm": 1.4452024698257446, + "learning_rate": 2.9728502680852368e-05, + "loss": 1.2235, + "step": 1190 + }, + { + "epoch": 0.6268421052631579, + "grad_norm": 1.5050314664840698, + "learning_rate": 2.9728031738963695e-05, + "loss": 0.9459, + "step": 1191 + }, + { + "epoch": 0.6273684210526316, + "grad_norm": 1.1202926635742188, + "learning_rate": 2.9727560392715395e-05, + "loss": 0.7596, + "step": 1192 + }, + { + "epoch": 0.6278947368421053, + "grad_norm": 0.803450345993042, + "learning_rate": 2.9727088642120406e-05, + "loss": 0.3567, + "step": 1193 + }, + { + "epoch": 0.628421052631579, + "grad_norm": 1.9323444366455078, + "learning_rate": 2.9726616487191675e-05, + "loss": 1.1573, + "step": 1194 + }, + { + "epoch": 0.6289473684210526, + "grad_norm": 1.0270202159881592, + "learning_rate": 2.9726143927942176e-05, + "loss": 1.09, + "step": 1195 + }, + { + "epoch": 0.6294736842105263, + "grad_norm": 12.028121948242188, + "learning_rate": 2.9725670964384876e-05, + "loss": 0.8462, + "step": 1196 + }, + { + "epoch": 0.63, + "grad_norm": 7.259978771209717, + "learning_rate": 2.9725197596532768e-05, + "loss": 1.0018, + "step": 1197 + }, + { + "epoch": 0.6305263157894737, + "grad_norm": 1.1468654870986938, + "learning_rate": 2.9724723824398838e-05, + "loss": 1.1135, + "step": 1198 + }, + { + "epoch": 0.6310526315789474, + "grad_norm": 1.0924807786941528, + "learning_rate": 2.9724249647996095e-05, + "loss": 1.2861, + "step": 1199 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.762115478515625, + "learning_rate": 2.972377506733756e-05, + "loss": 1.2612, + "step": 1200 + }, + { + "epoch": 0.6321052631578947, + "grad_norm": 1.1973563432693481, + "learning_rate": 2.9723300082436266e-05, + "loss": 1.042, + "step": 1201 + }, + { + "epoch": 0.6326315789473684, + "grad_norm": 1.2056132555007935, + "learning_rate": 2.972282469330525e-05, + "loss": 0.8789, + "step": 1202 + }, + { + "epoch": 0.6331578947368421, + "grad_norm": 0.9773816466331482, + "learning_rate": 2.9722348899957564e-05, + "loss": 0.9174, + "step": 1203 + }, + { + "epoch": 0.6336842105263157, + "grad_norm": 2.236300468444824, + "learning_rate": 2.972187270240627e-05, + "loss": 1.404, + "step": 1204 + }, + { + "epoch": 0.6342105263157894, + "grad_norm": 1.2097399234771729, + "learning_rate": 2.9721396100664447e-05, + "loss": 1.2159, + "step": 1205 + }, + { + "epoch": 0.6347368421052632, + "grad_norm": 1.800818681716919, + "learning_rate": 2.9720919094745176e-05, + "loss": 0.8479, + "step": 1206 + }, + { + "epoch": 0.6352631578947369, + "grad_norm": 2.954768180847168, + "learning_rate": 2.9720441684661552e-05, + "loss": 1.3383, + "step": 1207 + }, + { + "epoch": 0.6357894736842106, + "grad_norm": 2.48077130317688, + "learning_rate": 2.9719963870426685e-05, + "loss": 1.2594, + "step": 1208 + }, + { + "epoch": 0.6363157894736842, + "grad_norm": 4.164140701293945, + "learning_rate": 2.9719485652053696e-05, + "loss": 1.3007, + "step": 1209 + }, + { + "epoch": 0.6368421052631579, + "grad_norm": 4.807854175567627, + "learning_rate": 2.971900702955571e-05, + "loss": 0.2724, + "step": 1210 + }, + { + "epoch": 0.6373684210526316, + "grad_norm": 2.658689498901367, + "learning_rate": 2.971852800294587e-05, + "loss": 1.4455, + "step": 1211 + }, + { + "epoch": 0.6378947368421053, + "grad_norm": 5.018266677856445, + "learning_rate": 2.9718048572237323e-05, + "loss": 1.5204, + "step": 1212 + }, + { + "epoch": 0.638421052631579, + "grad_norm": 7.115269184112549, + "learning_rate": 2.9717568737443236e-05, + "loss": 1.763, + "step": 1213 + }, + { + "epoch": 0.6389473684210526, + "grad_norm": 1.1763184070587158, + "learning_rate": 2.9717088498576787e-05, + "loss": 0.7555, + "step": 1214 + }, + { + "epoch": 0.6394736842105263, + "grad_norm": 0.8004067540168762, + "learning_rate": 2.9716607855651154e-05, + "loss": 0.5879, + "step": 1215 + }, + { + "epoch": 0.64, + "grad_norm": 0.9092245697975159, + "learning_rate": 2.971612680867953e-05, + "loss": 0.9113, + "step": 1216 + }, + { + "epoch": 0.6405263157894737, + "grad_norm": 1.03469717502594, + "learning_rate": 2.9715645357675133e-05, + "loss": 0.8598, + "step": 1217 + }, + { + "epoch": 0.6410526315789473, + "grad_norm": 13.371722221374512, + "learning_rate": 2.9715163502651176e-05, + "loss": 0.6834, + "step": 1218 + }, + { + "epoch": 0.641578947368421, + "grad_norm": 1.745549201965332, + "learning_rate": 2.9714681243620885e-05, + "loss": 1.4798, + "step": 1219 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 2.9852797985076904, + "learning_rate": 2.971419858059751e-05, + "loss": 1.6255, + "step": 1220 + }, + { + "epoch": 0.6426315789473684, + "grad_norm": 1.1600061655044556, + "learning_rate": 2.971371551359429e-05, + "loss": 0.9992, + "step": 1221 + }, + { + "epoch": 0.6431578947368422, + "grad_norm": 1.1885265111923218, + "learning_rate": 2.97132320426245e-05, + "loss": 0.9063, + "step": 1222 + }, + { + "epoch": 0.6436842105263157, + "grad_norm": 1.2887687683105469, + "learning_rate": 2.9712748167701406e-05, + "loss": 1.0904, + "step": 1223 + }, + { + "epoch": 0.6442105263157895, + "grad_norm": 3.543919324874878, + "learning_rate": 2.9712263888838295e-05, + "loss": 0.6287, + "step": 1224 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 1.4697569608688354, + "learning_rate": 2.9711779206048457e-05, + "loss": 1.6434, + "step": 1225 + }, + { + "epoch": 0.6452631578947369, + "grad_norm": 1.5276075601577759, + "learning_rate": 2.9711294119345212e-05, + "loss": 0.9159, + "step": 1226 + }, + { + "epoch": 0.6457894736842106, + "grad_norm": 1.8441684246063232, + "learning_rate": 2.9710808628741866e-05, + "loss": 1.2164, + "step": 1227 + }, + { + "epoch": 0.6463157894736842, + "grad_norm": 5.690790176391602, + "learning_rate": 2.9710322734251753e-05, + "loss": 0.818, + "step": 1228 + }, + { + "epoch": 0.6468421052631579, + "grad_norm": 2.064948081970215, + "learning_rate": 2.9709836435888213e-05, + "loss": 1.5381, + "step": 1229 + }, + { + "epoch": 0.6473684210526316, + "grad_norm": 1.3582369089126587, + "learning_rate": 2.9709349733664602e-05, + "loss": 0.4742, + "step": 1230 + }, + { + "epoch": 0.6478947368421053, + "grad_norm": 1.6712125539779663, + "learning_rate": 2.9708862627594275e-05, + "loss": 0.9667, + "step": 1231 + }, + { + "epoch": 0.6484210526315789, + "grad_norm": 5.520822048187256, + "learning_rate": 2.970837511769061e-05, + "loss": 0.7173, + "step": 1232 + }, + { + "epoch": 0.6489473684210526, + "grad_norm": 3.7994682788848877, + "learning_rate": 2.9707887203966986e-05, + "loss": 0.8053, + "step": 1233 + }, + { + "epoch": 0.6494736842105263, + "grad_norm": 1.3790806531906128, + "learning_rate": 2.970739888643681e-05, + "loss": 1.2264, + "step": 1234 + }, + { + "epoch": 0.65, + "grad_norm": 2.789325475692749, + "learning_rate": 2.9706910165113477e-05, + "loss": 0.6273, + "step": 1235 + }, + { + "epoch": 0.6505263157894737, + "grad_norm": 2.6303861141204834, + "learning_rate": 2.970642104001041e-05, + "loss": 0.0705, + "step": 1236 + }, + { + "epoch": 0.6510526315789473, + "grad_norm": 6.236796855926514, + "learning_rate": 2.9705931511141037e-05, + "loss": 0.147, + "step": 1237 + }, + { + "epoch": 0.651578947368421, + "grad_norm": 7.691266059875488, + "learning_rate": 2.9705441578518798e-05, + "loss": 0.6646, + "step": 1238 + }, + { + "epoch": 0.6521052631578947, + "grad_norm": 1.5175530910491943, + "learning_rate": 2.9704951242157145e-05, + "loss": 1.33, + "step": 1239 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 1.3474842309951782, + "learning_rate": 2.9704460502069544e-05, + "loss": 0.8195, + "step": 1240 + }, + { + "epoch": 0.6531578947368422, + "grad_norm": 4.029512882232666, + "learning_rate": 2.9703969358269462e-05, + "loss": 0.3568, + "step": 1241 + }, + { + "epoch": 0.6536842105263158, + "grad_norm": 1.3466565608978271, + "learning_rate": 2.9703477810770384e-05, + "loss": 1.259, + "step": 1242 + }, + { + "epoch": 0.6542105263157895, + "grad_norm": 2.5386829376220703, + "learning_rate": 2.970298585958581e-05, + "loss": 1.0745, + "step": 1243 + }, + { + "epoch": 0.6547368421052632, + "grad_norm": 2.428055763244629, + "learning_rate": 2.9702493504729244e-05, + "loss": 0.8593, + "step": 1244 + }, + { + "epoch": 0.6552631578947369, + "grad_norm": 1.3094781637191772, + "learning_rate": 2.97020007462142e-05, + "loss": 0.8212, + "step": 1245 + }, + { + "epoch": 0.6557894736842105, + "grad_norm": 1.4779995679855347, + "learning_rate": 2.970150758405421e-05, + "loss": 1.0186, + "step": 1246 + }, + { + "epoch": 0.6563157894736842, + "grad_norm": 1.1934393644332886, + "learning_rate": 2.9701014018262817e-05, + "loss": 1.1701, + "step": 1247 + }, + { + "epoch": 0.6568421052631579, + "grad_norm": 0.7196000218391418, + "learning_rate": 2.9700520048853566e-05, + "loss": 0.0174, + "step": 1248 + }, + { + "epoch": 0.6573684210526316, + "grad_norm": 0.9140834808349609, + "learning_rate": 2.9700025675840028e-05, + "loss": 0.9861, + "step": 1249 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 4.055169582366943, + "learning_rate": 2.969953089923577e-05, + "loss": 0.4162, + "step": 1250 + }, + { + "epoch": 0.6584210526315789, + "grad_norm": 0.5134720802307129, + "learning_rate": 2.9699035719054367e-05, + "loss": 0.0143, + "step": 1251 + }, + { + "epoch": 0.6589473684210526, + "grad_norm": 3.9289088249206543, + "learning_rate": 2.9698540135309434e-05, + "loss": 0.5238, + "step": 1252 + }, + { + "epoch": 0.6594736842105263, + "grad_norm": 6.439306735992432, + "learning_rate": 2.969804414801456e-05, + "loss": 0.549, + "step": 1253 + }, + { + "epoch": 0.66, + "grad_norm": 2.025859832763672, + "learning_rate": 2.969754775718337e-05, + "loss": 0.9216, + "step": 1254 + }, + { + "epoch": 0.6605263157894737, + "grad_norm": 1.8443411588668823, + "learning_rate": 2.969705096282949e-05, + "loss": 0.1711, + "step": 1255 + }, + { + "epoch": 0.6610526315789473, + "grad_norm": 1.5247849225997925, + "learning_rate": 2.9696553764966562e-05, + "loss": 1.1315, + "step": 1256 + }, + { + "epoch": 0.661578947368421, + "grad_norm": 3.6713950634002686, + "learning_rate": 2.9696056163608237e-05, + "loss": 0.467, + "step": 1257 + }, + { + "epoch": 0.6621052631578948, + "grad_norm": 0.790451169013977, + "learning_rate": 2.969555815876818e-05, + "loss": 0.0082, + "step": 1258 + }, + { + "epoch": 0.6626315789473685, + "grad_norm": 1.4589107036590576, + "learning_rate": 2.9695059750460052e-05, + "loss": 0.6006, + "step": 1259 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 1.3655271530151367, + "learning_rate": 2.9694560938697548e-05, + "loss": 1.2306, + "step": 1260 + }, + { + "epoch": 0.6636842105263158, + "grad_norm": 1.3729549646377563, + "learning_rate": 2.9694061723494358e-05, + "loss": 0.7385, + "step": 1261 + }, + { + "epoch": 0.6642105263157895, + "grad_norm": 4.8193864822387695, + "learning_rate": 2.9693562104864182e-05, + "loss": 0.7079, + "step": 1262 + }, + { + "epoch": 0.6647368421052632, + "grad_norm": 1.563333511352539, + "learning_rate": 2.9693062082820752e-05, + "loss": 0.5117, + "step": 1263 + }, + { + "epoch": 0.6652631578947369, + "grad_norm": 1.3271199464797974, + "learning_rate": 2.969256165737779e-05, + "loss": 1.0954, + "step": 1264 + }, + { + "epoch": 0.6657894736842105, + "grad_norm": 4.284012794494629, + "learning_rate": 2.9692060828549025e-05, + "loss": 1.9864, + "step": 1265 + }, + { + "epoch": 0.6663157894736842, + "grad_norm": 3.3707847595214844, + "learning_rate": 2.969155959634822e-05, + "loss": 1.0695, + "step": 1266 + }, + { + "epoch": 0.6668421052631579, + "grad_norm": 2.1350784301757812, + "learning_rate": 2.9691057960789133e-05, + "loss": 1.2387, + "step": 1267 + }, + { + "epoch": 0.6673684210526316, + "grad_norm": 1.350180745124817, + "learning_rate": 2.9690555921885534e-05, + "loss": 1.1326, + "step": 1268 + }, + { + "epoch": 0.6678947368421052, + "grad_norm": 1.3490583896636963, + "learning_rate": 2.9690053479651207e-05, + "loss": 0.7916, + "step": 1269 + }, + { + "epoch": 0.6684210526315789, + "grad_norm": 5.74185848236084, + "learning_rate": 2.968955063409995e-05, + "loss": 0.9312, + "step": 1270 + }, + { + "epoch": 0.6689473684210526, + "grad_norm": 1.1950428485870361, + "learning_rate": 2.968904738524557e-05, + "loss": 0.8421, + "step": 1271 + }, + { + "epoch": 0.6694736842105263, + "grad_norm": 1.2086502313613892, + "learning_rate": 2.9688543733101876e-05, + "loss": 1.205, + "step": 1272 + }, + { + "epoch": 0.67, + "grad_norm": 2.652189254760742, + "learning_rate": 2.96880396776827e-05, + "loss": 1.0971, + "step": 1273 + }, + { + "epoch": 0.6705263157894736, + "grad_norm": 1.5372319221496582, + "learning_rate": 2.968753521900188e-05, + "loss": 1.4361, + "step": 1274 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 3.254009485244751, + "learning_rate": 2.9687030357073265e-05, + "loss": 1.5015, + "step": 1275 + }, + { + "epoch": 0.671578947368421, + "grad_norm": 1.3365062475204468, + "learning_rate": 2.968652509191072e-05, + "loss": 1.2392, + "step": 1276 + }, + { + "epoch": 0.6721052631578948, + "grad_norm": 8.001225471496582, + "learning_rate": 2.9686019423528117e-05, + "loss": 1.1131, + "step": 1277 + }, + { + "epoch": 0.6726315789473685, + "grad_norm": 4.288140296936035, + "learning_rate": 2.9685513351939335e-05, + "loss": 1.2558, + "step": 1278 + }, + { + "epoch": 0.6731578947368421, + "grad_norm": 0.9908925294876099, + "learning_rate": 2.968500687715827e-05, + "loss": 1.0251, + "step": 1279 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 1.2662602663040161, + "learning_rate": 2.968449999919883e-05, + "loss": 1.0004, + "step": 1280 + }, + { + "epoch": 0.6742105263157895, + "grad_norm": 1.4708739519119263, + "learning_rate": 2.9683992718074926e-05, + "loss": 1.3322, + "step": 1281 + }, + { + "epoch": 0.6747368421052632, + "grad_norm": 1.1707433462142944, + "learning_rate": 2.9683485033800494e-05, + "loss": 1.3583, + "step": 1282 + }, + { + "epoch": 0.6752631578947368, + "grad_norm": 0.060910388827323914, + "learning_rate": 2.9682976946389463e-05, + "loss": 0.0018, + "step": 1283 + }, + { + "epoch": 0.6757894736842105, + "grad_norm": 1.6072977781295776, + "learning_rate": 2.9682468455855783e-05, + "loss": 0.0838, + "step": 1284 + }, + { + "epoch": 0.6763157894736842, + "grad_norm": 1.179344892501831, + "learning_rate": 2.9681959562213422e-05, + "loss": 1.3671, + "step": 1285 + }, + { + "epoch": 0.6768421052631579, + "grad_norm": 6.977228164672852, + "learning_rate": 2.9681450265476345e-05, + "loss": 0.5456, + "step": 1286 + }, + { + "epoch": 0.6773684210526316, + "grad_norm": 1.475023865699768, + "learning_rate": 2.968094056565854e-05, + "loss": 1.0063, + "step": 1287 + }, + { + "epoch": 0.6778947368421052, + "grad_norm": 2.039761781692505, + "learning_rate": 2.9680430462774e-05, + "loss": 0.3243, + "step": 1288 + }, + { + "epoch": 0.6784210526315789, + "grad_norm": 1.093194603919983, + "learning_rate": 2.967991995683673e-05, + "loss": 1.2607, + "step": 1289 + }, + { + "epoch": 0.6789473684210526, + "grad_norm": 24.999378204345703, + "learning_rate": 2.9679409047860743e-05, + "loss": 1.5342, + "step": 1290 + }, + { + "epoch": 0.6794736842105263, + "grad_norm": 1.6621330976486206, + "learning_rate": 2.9678897735860066e-05, + "loss": 1.8376, + "step": 1291 + }, + { + "epoch": 0.68, + "grad_norm": 3.57814621925354, + "learning_rate": 2.9678386020848742e-05, + "loss": 1.4534, + "step": 1292 + }, + { + "epoch": 0.6805263157894736, + "grad_norm": 1.6566349267959595, + "learning_rate": 2.9677873902840813e-05, + "loss": 0.049, + "step": 1293 + }, + { + "epoch": 0.6810526315789474, + "grad_norm": 1.090785264968872, + "learning_rate": 2.9677361381850345e-05, + "loss": 1.0962, + "step": 1294 + }, + { + "epoch": 0.6815789473684211, + "grad_norm": 1.6356092691421509, + "learning_rate": 2.9676848457891407e-05, + "loss": 0.6644, + "step": 1295 + }, + { + "epoch": 0.6821052631578948, + "grad_norm": 1.247987151145935, + "learning_rate": 2.9676335130978082e-05, + "loss": 0.8771, + "step": 1296 + }, + { + "epoch": 0.6826315789473684, + "grad_norm": 1.4328747987747192, + "learning_rate": 2.9675821401124465e-05, + "loss": 1.0978, + "step": 1297 + }, + { + "epoch": 0.6831578947368421, + "grad_norm": 6.744469165802002, + "learning_rate": 2.967530726834466e-05, + "loss": 0.8294, + "step": 1298 + }, + { + "epoch": 0.6836842105263158, + "grad_norm": 1.233546495437622, + "learning_rate": 2.9674792732652773e-05, + "loss": 0.9119, + "step": 1299 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 1.5304890871047974, + "learning_rate": 2.967427779406295e-05, + "loss": 1.2472, + "step": 1300 + }, + { + "epoch": 0.6847368421052632, + "grad_norm": 1.6759600639343262, + "learning_rate": 2.9673762452589307e-05, + "loss": 1.0797, + "step": 1301 + }, + { + "epoch": 0.6852631578947368, + "grad_norm": 2.450378894805908, + "learning_rate": 2.967324670824601e-05, + "loss": 0.7401, + "step": 1302 + }, + { + "epoch": 0.6857894736842105, + "grad_norm": 0.631327748298645, + "learning_rate": 2.9672730561047214e-05, + "loss": 0.0149, + "step": 1303 + }, + { + "epoch": 0.6863157894736842, + "grad_norm": 2.2037277221679688, + "learning_rate": 2.9672214011007087e-05, + "loss": 0.9195, + "step": 1304 + }, + { + "epoch": 0.6868421052631579, + "grad_norm": 20.760221481323242, + "learning_rate": 2.967169705813981e-05, + "loss": 2.7392, + "step": 1305 + }, + { + "epoch": 0.6873684210526316, + "grad_norm": 1.0082573890686035, + "learning_rate": 2.9671179702459576e-05, + "loss": 1.0737, + "step": 1306 + }, + { + "epoch": 0.6878947368421052, + "grad_norm": 2.9095327854156494, + "learning_rate": 2.9670661943980595e-05, + "loss": 1.4494, + "step": 1307 + }, + { + "epoch": 0.6884210526315789, + "grad_norm": 0.9664977788925171, + "learning_rate": 2.9670143782717075e-05, + "loss": 1.1525, + "step": 1308 + }, + { + "epoch": 0.6889473684210526, + "grad_norm": 0.6551389098167419, + "learning_rate": 2.966962521868325e-05, + "loss": 0.0284, + "step": 1309 + }, + { + "epoch": 0.6894736842105263, + "grad_norm": 6.767817497253418, + "learning_rate": 2.966910625189335e-05, + "loss": 0.253, + "step": 1310 + }, + { + "epoch": 0.69, + "grad_norm": 4.445407867431641, + "learning_rate": 2.9668586882361625e-05, + "loss": 1.5918, + "step": 1311 + }, + { + "epoch": 0.6905263157894737, + "grad_norm": 3.130711078643799, + "learning_rate": 2.9668067110102338e-05, + "loss": 1.0437, + "step": 1312 + }, + { + "epoch": 0.6910526315789474, + "grad_norm": 1.7217767238616943, + "learning_rate": 2.9667546935129757e-05, + "loss": 0.5301, + "step": 1313 + }, + { + "epoch": 0.6915789473684211, + "grad_norm": 6.565402984619141, + "learning_rate": 2.966702635745816e-05, + "loss": 1.4053, + "step": 1314 + }, + { + "epoch": 0.6921052631578948, + "grad_norm": 1.4495912790298462, + "learning_rate": 2.9666505377101845e-05, + "loss": 0.8034, + "step": 1315 + }, + { + "epoch": 0.6926315789473684, + "grad_norm": 1.2847636938095093, + "learning_rate": 2.9665983994075113e-05, + "loss": 1.0825, + "step": 1316 + }, + { + "epoch": 0.6931578947368421, + "grad_norm": 1.0292222499847412, + "learning_rate": 2.966546220839228e-05, + "loss": 1.1681, + "step": 1317 + }, + { + "epoch": 0.6936842105263158, + "grad_norm": 1.537844181060791, + "learning_rate": 2.966494002006767e-05, + "loss": 1.6044, + "step": 1318 + }, + { + "epoch": 0.6942105263157895, + "grad_norm": 4.573411464691162, + "learning_rate": 2.966441742911562e-05, + "loss": 0.7895, + "step": 1319 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 1.3798898458480835, + "learning_rate": 2.9663894435550477e-05, + "loss": 0.5207, + "step": 1320 + }, + { + "epoch": 0.6952631578947368, + "grad_norm": 1.8828356266021729, + "learning_rate": 2.96633710393866e-05, + "loss": 0.4363, + "step": 1321 + }, + { + "epoch": 0.6957894736842105, + "grad_norm": 4.514081954956055, + "learning_rate": 2.966284724063836e-05, + "loss": 0.9505, + "step": 1322 + }, + { + "epoch": 0.6963157894736842, + "grad_norm": 6.411378860473633, + "learning_rate": 2.966232303932014e-05, + "loss": 0.2739, + "step": 1323 + }, + { + "epoch": 0.6968421052631579, + "grad_norm": 3.237457036972046, + "learning_rate": 2.9661798435446325e-05, + "loss": 0.2196, + "step": 1324 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 2.3053643703460693, + "learning_rate": 2.966127342903133e-05, + "loss": 1.4729, + "step": 1325 + }, + { + "epoch": 0.6978947368421052, + "grad_norm": 1.2993172407150269, + "learning_rate": 2.9660748020089555e-05, + "loss": 0.8165, + "step": 1326 + }, + { + "epoch": 0.6984210526315789, + "grad_norm": 1.4997568130493164, + "learning_rate": 2.9660222208635438e-05, + "loss": 1.0576, + "step": 1327 + }, + { + "epoch": 0.6989473684210527, + "grad_norm": 6.542171001434326, + "learning_rate": 2.9659695994683404e-05, + "loss": 0.7472, + "step": 1328 + }, + { + "epoch": 0.6994736842105264, + "grad_norm": 2.04111647605896, + "learning_rate": 2.965916937824791e-05, + "loss": 1.0342, + "step": 1329 + }, + { + "epoch": 0.7, + "grad_norm": 3.6200339794158936, + "learning_rate": 2.965864235934341e-05, + "loss": 0.9009, + "step": 1330 + }, + { + "epoch": 0.7005263157894737, + "grad_norm": 3.2301101684570312, + "learning_rate": 2.965811493798437e-05, + "loss": 1.1608, + "step": 1331 + }, + { + "epoch": 0.7010526315789474, + "grad_norm": 13.88673210144043, + "learning_rate": 2.9657587114185272e-05, + "loss": 2.5288, + "step": 1332 + }, + { + "epoch": 0.7015789473684211, + "grad_norm": 3.9736039638519287, + "learning_rate": 2.9657058887960613e-05, + "loss": 1.0553, + "step": 1333 + }, + { + "epoch": 0.7021052631578948, + "grad_norm": 24.85307502746582, + "learning_rate": 2.965653025932489e-05, + "loss": 0.916, + "step": 1334 + }, + { + "epoch": 0.7026315789473684, + "grad_norm": 4.859387397766113, + "learning_rate": 2.965600122829262e-05, + "loss": 0.9876, + "step": 1335 + }, + { + "epoch": 0.7031578947368421, + "grad_norm": 2.863323211669922, + "learning_rate": 2.9655471794878325e-05, + "loss": 0.8016, + "step": 1336 + }, + { + "epoch": 0.7036842105263158, + "grad_norm": 2.4096951484680176, + "learning_rate": 2.9654941959096543e-05, + "loss": 0.6945, + "step": 1337 + }, + { + "epoch": 0.7042105263157895, + "grad_norm": 1.333808183670044, + "learning_rate": 2.9654411720961816e-05, + "loss": 1.1654, + "step": 1338 + }, + { + "epoch": 0.7047368421052631, + "grad_norm": 6.172205448150635, + "learning_rate": 2.9653881080488705e-05, + "loss": 0.5594, + "step": 1339 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 5.650049686431885, + "learning_rate": 2.9653350037691777e-05, + "loss": 0.8009, + "step": 1340 + }, + { + "epoch": 0.7057894736842105, + "grad_norm": 1.504169225692749, + "learning_rate": 2.9652818592585616e-05, + "loss": 1.1546, + "step": 1341 + }, + { + "epoch": 0.7063157894736842, + "grad_norm": 2.2842626571655273, + "learning_rate": 2.965228674518481e-05, + "loss": 0.9654, + "step": 1342 + }, + { + "epoch": 0.7068421052631579, + "grad_norm": 1.429457187652588, + "learning_rate": 2.965175449550396e-05, + "loss": 1.3076, + "step": 1343 + }, + { + "epoch": 0.7073684210526315, + "grad_norm": 1.1388136148452759, + "learning_rate": 2.9651221843557682e-05, + "loss": 0.9347, + "step": 1344 + }, + { + "epoch": 0.7078947368421052, + "grad_norm": 2.501084566116333, + "learning_rate": 2.96506887893606e-05, + "loss": 1.11, + "step": 1345 + }, + { + "epoch": 0.708421052631579, + "grad_norm": 27.422794342041016, + "learning_rate": 2.9650155332927343e-05, + "loss": 7.5874, + "step": 1346 + }, + { + "epoch": 0.7089473684210527, + "grad_norm": 5.256887912750244, + "learning_rate": 2.9649621474272564e-05, + "loss": 1.4157, + "step": 1347 + }, + { + "epoch": 0.7094736842105264, + "grad_norm": 1.0665907859802246, + "learning_rate": 2.9649087213410918e-05, + "loss": 0.8372, + "step": 1348 + }, + { + "epoch": 0.71, + "grad_norm": 6.498046398162842, + "learning_rate": 2.964855255035707e-05, + "loss": 1.2181, + "step": 1349 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 2.614759683609009, + "learning_rate": 2.9648017485125708e-05, + "loss": 0.1237, + "step": 1350 + }, + { + "epoch": 0.7110526315789474, + "grad_norm": 1.4356591701507568, + "learning_rate": 2.9647482017731508e-05, + "loss": 1.2685, + "step": 1351 + }, + { + "epoch": 0.7115789473684211, + "grad_norm": 1.245197057723999, + "learning_rate": 2.964694614818918e-05, + "loss": 1.1176, + "step": 1352 + }, + { + "epoch": 0.7121052631578947, + "grad_norm": 1.7546439170837402, + "learning_rate": 2.9646409876513444e-05, + "loss": 1.8027, + "step": 1353 + }, + { + "epoch": 0.7126315789473684, + "grad_norm": 6.851596355438232, + "learning_rate": 2.9645873202719013e-05, + "loss": 0.9774, + "step": 1354 + }, + { + "epoch": 0.7131578947368421, + "grad_norm": 2.6459946632385254, + "learning_rate": 2.9645336126820616e-05, + "loss": 1.5187, + "step": 1355 + }, + { + "epoch": 0.7136842105263158, + "grad_norm": 11.114506721496582, + "learning_rate": 2.9644798648833013e-05, + "loss": 1.5982, + "step": 1356 + }, + { + "epoch": 0.7142105263157895, + "grad_norm": 1.1266565322875977, + "learning_rate": 2.9644260768770953e-05, + "loss": 1.043, + "step": 1357 + }, + { + "epoch": 0.7147368421052631, + "grad_norm": 1.3332453966140747, + "learning_rate": 2.9643722486649203e-05, + "loss": 0.9823, + "step": 1358 + }, + { + "epoch": 0.7152631578947368, + "grad_norm": 7.991135120391846, + "learning_rate": 2.9643183802482542e-05, + "loss": 1.8691, + "step": 1359 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 1.4576728343963623, + "learning_rate": 2.9642644716285765e-05, + "loss": 1.2554, + "step": 1360 + }, + { + "epoch": 0.7163157894736842, + "grad_norm": 24.029621124267578, + "learning_rate": 2.9642105228073662e-05, + "loss": 1.4993, + "step": 1361 + }, + { + "epoch": 0.716842105263158, + "grad_norm": 1.7569326162338257, + "learning_rate": 2.9641565337861055e-05, + "loss": 0.6511, + "step": 1362 + }, + { + "epoch": 0.7173684210526315, + "grad_norm": 0.8566625118255615, + "learning_rate": 2.9641025045662765e-05, + "loss": 0.0214, + "step": 1363 + }, + { + "epoch": 0.7178947368421053, + "grad_norm": 1.8023531436920166, + "learning_rate": 2.9640484351493616e-05, + "loss": 1.3085, + "step": 1364 + }, + { + "epoch": 0.718421052631579, + "grad_norm": 1.1481828689575195, + "learning_rate": 2.9639943255368468e-05, + "loss": 1.3198, + "step": 1365 + }, + { + "epoch": 0.7189473684210527, + "grad_norm": 2.6472365856170654, + "learning_rate": 2.9639401757302162e-05, + "loss": 0.2092, + "step": 1366 + }, + { + "epoch": 0.7194736842105263, + "grad_norm": 1.8456323146820068, + "learning_rate": 2.9638859857309574e-05, + "loss": 1.0272, + "step": 1367 + }, + { + "epoch": 0.72, + "grad_norm": 3.3874828815460205, + "learning_rate": 2.9638317555405582e-05, + "loss": 0.9814, + "step": 1368 + }, + { + "epoch": 0.7205263157894737, + "grad_norm": 1.234138011932373, + "learning_rate": 2.9637774851605073e-05, + "loss": 0.8918, + "step": 1369 + }, + { + "epoch": 0.7210526315789474, + "grad_norm": 2.1231067180633545, + "learning_rate": 2.9637231745922942e-05, + "loss": 0.8428, + "step": 1370 + }, + { + "epoch": 0.7215789473684211, + "grad_norm": 2.287672281265259, + "learning_rate": 2.963668823837411e-05, + "loss": 1.5556, + "step": 1371 + }, + { + "epoch": 0.7221052631578947, + "grad_norm": 1.185556173324585, + "learning_rate": 2.9636144328973495e-05, + "loss": 0.8633, + "step": 1372 + }, + { + "epoch": 0.7226315789473684, + "grad_norm": 5.523141860961914, + "learning_rate": 2.9635600017736024e-05, + "loss": 0.7142, + "step": 1373 + }, + { + "epoch": 0.7231578947368421, + "grad_norm": 12.063275337219238, + "learning_rate": 2.9635055304676647e-05, + "loss": 0.3757, + "step": 1374 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 4.580484867095947, + "learning_rate": 2.963451018981032e-05, + "loss": 1.5171, + "step": 1375 + }, + { + "epoch": 0.7242105263157895, + "grad_norm": 1.9087570905685425, + "learning_rate": 2.9633964673152004e-05, + "loss": 1.3982, + "step": 1376 + }, + { + "epoch": 0.7247368421052631, + "grad_norm": 2.3344743251800537, + "learning_rate": 2.9633418754716682e-05, + "loss": 1.0389, + "step": 1377 + }, + { + "epoch": 0.7252631578947368, + "grad_norm": 4.389303207397461, + "learning_rate": 2.9632872434519342e-05, + "loss": 0.3592, + "step": 1378 + }, + { + "epoch": 0.7257894736842105, + "grad_norm": 4.647209644317627, + "learning_rate": 2.963232571257498e-05, + "loss": 2.2252, + "step": 1379 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 2.98408579826355, + "learning_rate": 2.9631778588898606e-05, + "loss": 1.3005, + "step": 1380 + }, + { + "epoch": 0.7268421052631578, + "grad_norm": 10.173641204833984, + "learning_rate": 2.9631231063505245e-05, + "loss": 0.4029, + "step": 1381 + }, + { + "epoch": 0.7273684210526316, + "grad_norm": 2.366199254989624, + "learning_rate": 2.963068313640992e-05, + "loss": 1.408, + "step": 1382 + }, + { + "epoch": 0.7278947368421053, + "grad_norm": 0.9890458583831787, + "learning_rate": 2.963013480762769e-05, + "loss": 0.941, + "step": 1383 + }, + { + "epoch": 0.728421052631579, + "grad_norm": 2.4400086402893066, + "learning_rate": 2.96295860771736e-05, + "loss": 0.303, + "step": 1384 + }, + { + "epoch": 0.7289473684210527, + "grad_norm": 7.326104640960693, + "learning_rate": 2.9629036945062715e-05, + "loss": 0.1054, + "step": 1385 + }, + { + "epoch": 0.7294736842105263, + "grad_norm": 0.08032506704330444, + "learning_rate": 2.9628487411310113e-05, + "loss": 0.0038, + "step": 1386 + }, + { + "epoch": 0.73, + "grad_norm": 1.0595088005065918, + "learning_rate": 2.962793747593088e-05, + "loss": 1.1041, + "step": 1387 + }, + { + "epoch": 0.7305263157894737, + "grad_norm": 0.06474526226520538, + "learning_rate": 2.9627387138940117e-05, + "loss": 0.0022, + "step": 1388 + }, + { + "epoch": 0.7310526315789474, + "grad_norm": 1.7989740371704102, + "learning_rate": 2.9626836400352932e-05, + "loss": 0.8054, + "step": 1389 + }, + { + "epoch": 0.7315789473684211, + "grad_norm": 0.8924505710601807, + "learning_rate": 2.962628526018445e-05, + "loss": 1.0706, + "step": 1390 + }, + { + "epoch": 0.7321052631578947, + "grad_norm": 1.269217848777771, + "learning_rate": 2.9625733718449792e-05, + "loss": 0.7377, + "step": 1391 + }, + { + "epoch": 0.7326315789473684, + "grad_norm": 2.0392255783081055, + "learning_rate": 2.962518177516411e-05, + "loss": 0.5988, + "step": 1392 + }, + { + "epoch": 0.7331578947368421, + "grad_norm": 9.548563003540039, + "learning_rate": 2.9624629430342557e-05, + "loss": 0.437, + "step": 1393 + }, + { + "epoch": 0.7336842105263158, + "grad_norm": 2.005530595779419, + "learning_rate": 2.9624076684000292e-05, + "loss": 1.4294, + "step": 1394 + }, + { + "epoch": 0.7342105263157894, + "grad_norm": 8.551248550415039, + "learning_rate": 2.96235235361525e-05, + "loss": 1.0252, + "step": 1395 + }, + { + "epoch": 0.7347368421052631, + "grad_norm": 2.2077114582061768, + "learning_rate": 2.962296998681436e-05, + "loss": 0.749, + "step": 1396 + }, + { + "epoch": 0.7352631578947368, + "grad_norm": 1.9976532459259033, + "learning_rate": 2.962241603600107e-05, + "loss": 0.6527, + "step": 1397 + }, + { + "epoch": 0.7357894736842105, + "grad_norm": 1.972212314605713, + "learning_rate": 2.962186168372784e-05, + "loss": 1.1974, + "step": 1398 + }, + { + "epoch": 0.7363157894736843, + "grad_norm": 0.9594762921333313, + "learning_rate": 2.962130693000989e-05, + "loss": 1.5027, + "step": 1399 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.018154501914978, + "learning_rate": 2.9620751774862456e-05, + "loss": 1.1123, + "step": 1400 + }, + { + "epoch": 0.7373684210526316, + "grad_norm": 1.4608852863311768, + "learning_rate": 2.962019621830077e-05, + "loss": 1.2949, + "step": 1401 + }, + { + "epoch": 0.7378947368421053, + "grad_norm": 19.72736167907715, + "learning_rate": 2.9619640260340092e-05, + "loss": 1.1407, + "step": 1402 + }, + { + "epoch": 0.738421052631579, + "grad_norm": 2.0531411170959473, + "learning_rate": 2.9619083900995684e-05, + "loss": 1.2994, + "step": 1403 + }, + { + "epoch": 0.7389473684210527, + "grad_norm": 1.4088972806930542, + "learning_rate": 2.961852714028282e-05, + "loss": 1.0896, + "step": 1404 + }, + { + "epoch": 0.7394736842105263, + "grad_norm": 0.31528720259666443, + "learning_rate": 2.961796997821679e-05, + "loss": 0.0114, + "step": 1405 + }, + { + "epoch": 0.74, + "grad_norm": 6.371922969818115, + "learning_rate": 2.9617412414812883e-05, + "loss": 0.6344, + "step": 1406 + }, + { + "epoch": 0.7405263157894737, + "grad_norm": 2.7792177200317383, + "learning_rate": 2.9616854450086415e-05, + "loss": 0.0651, + "step": 1407 + }, + { + "epoch": 0.7410526315789474, + "grad_norm": 1.3516690731048584, + "learning_rate": 2.9616296084052698e-05, + "loss": 0.9898, + "step": 1408 + }, + { + "epoch": 0.741578947368421, + "grad_norm": 5.8592448234558105, + "learning_rate": 2.961573731672707e-05, + "loss": 1.8225, + "step": 1409 + }, + { + "epoch": 0.7421052631578947, + "grad_norm": 1.515937089920044, + "learning_rate": 2.9615178148124867e-05, + "loss": 0.8826, + "step": 1410 + }, + { + "epoch": 0.7426315789473684, + "grad_norm": 1.1607184410095215, + "learning_rate": 2.9614618578261436e-05, + "loss": 0.9627, + "step": 1411 + }, + { + "epoch": 0.7431578947368421, + "grad_norm": 1.7677443027496338, + "learning_rate": 2.9614058607152153e-05, + "loss": 1.2879, + "step": 1412 + }, + { + "epoch": 0.7436842105263158, + "grad_norm": 1.718802809715271, + "learning_rate": 2.9613498234812378e-05, + "loss": 0.9766, + "step": 1413 + }, + { + "epoch": 0.7442105263157894, + "grad_norm": 1.4020949602127075, + "learning_rate": 2.9612937461257504e-05, + "loss": 1.319, + "step": 1414 + }, + { + "epoch": 0.7447368421052631, + "grad_norm": 1.2930610179901123, + "learning_rate": 2.9612376286502934e-05, + "loss": 1.0887, + "step": 1415 + }, + { + "epoch": 0.7452631578947368, + "grad_norm": 1.6021417379379272, + "learning_rate": 2.961181471056406e-05, + "loss": 0.861, + "step": 1416 + }, + { + "epoch": 0.7457894736842106, + "grad_norm": 1.3957011699676514, + "learning_rate": 2.9611252733456306e-05, + "loss": 0.8134, + "step": 1417 + }, + { + "epoch": 0.7463157894736843, + "grad_norm": 2.1977591514587402, + "learning_rate": 2.9610690355195108e-05, + "loss": 1.5508, + "step": 1418 + }, + { + "epoch": 0.7468421052631579, + "grad_norm": 2.2483537197113037, + "learning_rate": 2.9610127575795894e-05, + "loss": 0.7797, + "step": 1419 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 1.5289298295974731, + "learning_rate": 2.9609564395274125e-05, + "loss": 1.4425, + "step": 1420 + }, + { + "epoch": 0.7478947368421053, + "grad_norm": 1.3084803819656372, + "learning_rate": 2.9609000813645257e-05, + "loss": 1.117, + "step": 1421 + }, + { + "epoch": 0.748421052631579, + "grad_norm": 2.422757863998413, + "learning_rate": 2.960843683092477e-05, + "loss": 1.7103, + "step": 1422 + }, + { + "epoch": 0.7489473684210526, + "grad_norm": 1.4796887636184692, + "learning_rate": 2.9607872447128142e-05, + "loss": 1.0487, + "step": 1423 + }, + { + "epoch": 0.7494736842105263, + "grad_norm": 1.0981144905090332, + "learning_rate": 2.960730766227087e-05, + "loss": 0.5913, + "step": 1424 + }, + { + "epoch": 0.75, + "grad_norm": 1.8127541542053223, + "learning_rate": 2.9606742476368464e-05, + "loss": 0.8062, + "step": 1425 + }, + { + "epoch": 0.75, + "eval_loss": 1.0191138982772827, + "eval_runtime": 12.9816, + "eval_samples_per_second": 7.703, + "eval_steps_per_second": 7.703, + "step": 1425 + }, + { + "epoch": 0.7505263157894737, + "grad_norm": 2.0109336376190186, + "learning_rate": 2.9606176889436435e-05, + "loss": 1.5174, + "step": 1426 + }, + { + "epoch": 0.7510526315789474, + "grad_norm": 1.7353535890579224, + "learning_rate": 2.9605610901490312e-05, + "loss": 1.0043, + "step": 1427 + }, + { + "epoch": 0.751578947368421, + "grad_norm": 2.5789828300476074, + "learning_rate": 2.960504451254564e-05, + "loss": 1.056, + "step": 1428 + }, + { + "epoch": 0.7521052631578947, + "grad_norm": 10.612245559692383, + "learning_rate": 2.960447772261796e-05, + "loss": 1.3747, + "step": 1429 + }, + { + "epoch": 0.7526315789473684, + "grad_norm": 1.5246334075927734, + "learning_rate": 2.960391053172285e-05, + "loss": 1.2934, + "step": 1430 + }, + { + "epoch": 0.7531578947368421, + "grad_norm": 1.2563579082489014, + "learning_rate": 2.9603342939875863e-05, + "loss": 1.1843, + "step": 1431 + }, + { + "epoch": 0.7536842105263157, + "grad_norm": 2.347818613052368, + "learning_rate": 2.960277494709259e-05, + "loss": 0.9651, + "step": 1432 + }, + { + "epoch": 0.7542105263157894, + "grad_norm": 1.2420134544372559, + "learning_rate": 2.960220655338863e-05, + "loss": 0.8367, + "step": 1433 + }, + { + "epoch": 0.7547368421052632, + "grad_norm": 1.5213626623153687, + "learning_rate": 2.9601637758779577e-05, + "loss": 1.2834, + "step": 1434 + }, + { + "epoch": 0.7552631578947369, + "grad_norm": 3.097379684448242, + "learning_rate": 2.9601068563281063e-05, + "loss": 1.1477, + "step": 1435 + }, + { + "epoch": 0.7557894736842106, + "grad_norm": 1.0866178274154663, + "learning_rate": 2.9600498966908702e-05, + "loss": 1.1422, + "step": 1436 + }, + { + "epoch": 0.7563157894736842, + "grad_norm": 1.3330771923065186, + "learning_rate": 2.959992896967814e-05, + "loss": 1.2408, + "step": 1437 + }, + { + "epoch": 0.7568421052631579, + "grad_norm": 2.1840741634368896, + "learning_rate": 2.959935857160502e-05, + "loss": 0.7088, + "step": 1438 + }, + { + "epoch": 0.7573684210526316, + "grad_norm": 1.8869200944900513, + "learning_rate": 2.9598787772705006e-05, + "loss": 1.0891, + "step": 1439 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 1.4808701276779175, + "learning_rate": 2.959821657299377e-05, + "loss": 1.7629, + "step": 1440 + }, + { + "epoch": 0.758421052631579, + "grad_norm": 1.3917648792266846, + "learning_rate": 2.959764497248699e-05, + "loss": 1.3897, + "step": 1441 + }, + { + "epoch": 0.7589473684210526, + "grad_norm": 1.1825001239776611, + "learning_rate": 2.9597072971200366e-05, + "loss": 1.1223, + "step": 1442 + }, + { + "epoch": 0.7594736842105263, + "grad_norm": 1.76896071434021, + "learning_rate": 2.9596500569149603e-05, + "loss": 1.6206, + "step": 1443 + }, + { + "epoch": 0.76, + "grad_norm": 1.2272676229476929, + "learning_rate": 2.9595927766350406e-05, + "loss": 1.1395, + "step": 1444 + }, + { + "epoch": 0.7605263157894737, + "grad_norm": 4.527194499969482, + "learning_rate": 2.959535456281851e-05, + "loss": 0.4346, + "step": 1445 + }, + { + "epoch": 0.7610526315789473, + "grad_norm": 1.3327759504318237, + "learning_rate": 2.959478095856965e-05, + "loss": 0.9762, + "step": 1446 + }, + { + "epoch": 0.761578947368421, + "grad_norm": 2.397428512573242, + "learning_rate": 2.959420695361958e-05, + "loss": 0.7909, + "step": 1447 + }, + { + "epoch": 0.7621052631578947, + "grad_norm": 1.3565646409988403, + "learning_rate": 2.9593632547984047e-05, + "loss": 0.7872, + "step": 1448 + }, + { + "epoch": 0.7626315789473684, + "grad_norm": 0.9641793370246887, + "learning_rate": 2.9593057741678832e-05, + "loss": 0.8944, + "step": 1449 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 2.6557819843292236, + "learning_rate": 2.959248253471971e-05, + "loss": 0.8311, + "step": 1450 + }, + { + "epoch": 0.7636842105263157, + "grad_norm": 4.478829860687256, + "learning_rate": 2.9591906927122477e-05, + "loss": 0.4445, + "step": 1451 + }, + { + "epoch": 0.7642105263157895, + "grad_norm": 2.393716335296631, + "learning_rate": 2.9591330918902935e-05, + "loss": 1.7126, + "step": 1452 + }, + { + "epoch": 0.7647368421052632, + "grad_norm": 1.553025722503662, + "learning_rate": 2.95907545100769e-05, + "loss": 1.1763, + "step": 1453 + }, + { + "epoch": 0.7652631578947369, + "grad_norm": 1.0929442644119263, + "learning_rate": 2.9590177700660193e-05, + "loss": 1.14, + "step": 1454 + }, + { + "epoch": 0.7657894736842106, + "grad_norm": 7.241302013397217, + "learning_rate": 2.9589600490668655e-05, + "loss": 1.111, + "step": 1455 + }, + { + "epoch": 0.7663157894736842, + "grad_norm": 0.972644567489624, + "learning_rate": 2.9589022880118133e-05, + "loss": 1.1578, + "step": 1456 + }, + { + "epoch": 0.7668421052631579, + "grad_norm": 7.614607810974121, + "learning_rate": 2.9588444869024484e-05, + "loss": 1.0112, + "step": 1457 + }, + { + "epoch": 0.7673684210526316, + "grad_norm": 1.8197392225265503, + "learning_rate": 2.9587866457403577e-05, + "loss": 1.2878, + "step": 1458 + }, + { + "epoch": 0.7678947368421053, + "grad_norm": 1.1428899765014648, + "learning_rate": 2.9587287645271293e-05, + "loss": 0.7216, + "step": 1459 + }, + { + "epoch": 0.7684210526315789, + "grad_norm": 1.5418213605880737, + "learning_rate": 2.958670843264353e-05, + "loss": 0.9176, + "step": 1460 + }, + { + "epoch": 0.7689473684210526, + "grad_norm": 6.577853679656982, + "learning_rate": 2.9586128819536172e-05, + "loss": 1.3075, + "step": 1461 + }, + { + "epoch": 0.7694736842105263, + "grad_norm": 1.7671222686767578, + "learning_rate": 2.958554880596515e-05, + "loss": 1.0772, + "step": 1462 + }, + { + "epoch": 0.77, + "grad_norm": 1.2378417253494263, + "learning_rate": 2.9584968391946378e-05, + "loss": 1.048, + "step": 1463 + }, + { + "epoch": 0.7705263157894737, + "grad_norm": 1.9807356595993042, + "learning_rate": 2.9584387577495803e-05, + "loss": 1.6231, + "step": 1464 + }, + { + "epoch": 0.7710526315789473, + "grad_norm": 1.2167612314224243, + "learning_rate": 2.958380636262936e-05, + "loss": 1.5038, + "step": 1465 + }, + { + "epoch": 0.771578947368421, + "grad_norm": 1.2333711385726929, + "learning_rate": 2.9583224747363008e-05, + "loss": 0.6763, + "step": 1466 + }, + { + "epoch": 0.7721052631578947, + "grad_norm": 1.9440076351165771, + "learning_rate": 2.958264273171272e-05, + "loss": 1.8108, + "step": 1467 + }, + { + "epoch": 0.7726315789473684, + "grad_norm": 1.7781847715377808, + "learning_rate": 2.958206031569447e-05, + "loss": 1.5469, + "step": 1468 + }, + { + "epoch": 0.7731578947368422, + "grad_norm": 1.5457514524459839, + "learning_rate": 2.9581477499324254e-05, + "loss": 1.1067, + "step": 1469 + }, + { + "epoch": 0.7736842105263158, + "grad_norm": 6.0052103996276855, + "learning_rate": 2.9580894282618073e-05, + "loss": 0.5737, + "step": 1470 + }, + { + "epoch": 0.7742105263157895, + "grad_norm": 1.827090859413147, + "learning_rate": 2.9580310665591933e-05, + "loss": 1.1473, + "step": 1471 + }, + { + "epoch": 0.7747368421052632, + "grad_norm": 5.207357883453369, + "learning_rate": 2.9579726648261862e-05, + "loss": 2.123, + "step": 1472 + }, + { + "epoch": 0.7752631578947369, + "grad_norm": 4.914318084716797, + "learning_rate": 2.957914223064389e-05, + "loss": 1.4473, + "step": 1473 + }, + { + "epoch": 0.7757894736842105, + "grad_norm": 4.295709609985352, + "learning_rate": 2.9578557412754067e-05, + "loss": 0.1821, + "step": 1474 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 14.05141830444336, + "learning_rate": 2.9577972194608453e-05, + "loss": 1.7767, + "step": 1475 + }, + { + "epoch": 0.7768421052631579, + "grad_norm": 3.5937631130218506, + "learning_rate": 2.9577386576223105e-05, + "loss": 0.8665, + "step": 1476 + }, + { + "epoch": 0.7773684210526316, + "grad_norm": 1.2602375745773315, + "learning_rate": 2.9576800557614103e-05, + "loss": 1.6036, + "step": 1477 + }, + { + "epoch": 0.7778947368421053, + "grad_norm": 6.486451148986816, + "learning_rate": 2.9576214138797544e-05, + "loss": 0.8866, + "step": 1478 + }, + { + "epoch": 0.7784210526315789, + "grad_norm": 2.9001505374908447, + "learning_rate": 2.9575627319789523e-05, + "loss": 1.085, + "step": 1479 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 1.1088868379592896, + "learning_rate": 2.957504010060615e-05, + "loss": 1.0509, + "step": 1480 + }, + { + "epoch": 0.7794736842105263, + "grad_norm": 2.7946763038635254, + "learning_rate": 2.9574452481263553e-05, + "loss": 0.1451, + "step": 1481 + }, + { + "epoch": 0.78, + "grad_norm": 1.2279927730560303, + "learning_rate": 2.9573864461777856e-05, + "loss": 0.9693, + "step": 1482 + }, + { + "epoch": 0.7805263157894737, + "grad_norm": 9.112462997436523, + "learning_rate": 2.957327604216521e-05, + "loss": 0.4725, + "step": 1483 + }, + { + "epoch": 0.7810526315789473, + "grad_norm": 4.834194183349609, + "learning_rate": 2.957268722244177e-05, + "loss": 0.7559, + "step": 1484 + }, + { + "epoch": 0.781578947368421, + "grad_norm": 1.3210901021957397, + "learning_rate": 2.9572098002623697e-05, + "loss": 0.9653, + "step": 1485 + }, + { + "epoch": 0.7821052631578947, + "grad_norm": 6.407845973968506, + "learning_rate": 2.9571508382727173e-05, + "loss": 2.3573, + "step": 1486 + }, + { + "epoch": 0.7826315789473685, + "grad_norm": 1.6584391593933105, + "learning_rate": 2.9570918362768386e-05, + "loss": 1.6321, + "step": 1487 + }, + { + "epoch": 0.783157894736842, + "grad_norm": 1.5938304662704468, + "learning_rate": 2.9570327942763535e-05, + "loss": 1.1299, + "step": 1488 + }, + { + "epoch": 0.7836842105263158, + "grad_norm": 0.9389225840568542, + "learning_rate": 2.9569737122728823e-05, + "loss": 0.9162, + "step": 1489 + }, + { + "epoch": 0.7842105263157895, + "grad_norm": 2.557969331741333, + "learning_rate": 2.956914590268048e-05, + "loss": 1.1852, + "step": 1490 + }, + { + "epoch": 0.7847368421052632, + "grad_norm": 1.1017169952392578, + "learning_rate": 2.9568554282634733e-05, + "loss": 1.3037, + "step": 1491 + }, + { + "epoch": 0.7852631578947369, + "grad_norm": 1.8029855489730835, + "learning_rate": 2.956796226260783e-05, + "loss": 1.9646, + "step": 1492 + }, + { + "epoch": 0.7857894736842105, + "grad_norm": 10.63049030303955, + "learning_rate": 2.9567369842616015e-05, + "loss": 0.8632, + "step": 1493 + }, + { + "epoch": 0.7863157894736842, + "grad_norm": 6.249732494354248, + "learning_rate": 2.9566777022675563e-05, + "loss": 0.5931, + "step": 1494 + }, + { + "epoch": 0.7868421052631579, + "grad_norm": 3.90535306930542, + "learning_rate": 2.956618380280275e-05, + "loss": 1.2675, + "step": 1495 + }, + { + "epoch": 0.7873684210526316, + "grad_norm": 5.921347618103027, + "learning_rate": 2.9565590183013855e-05, + "loss": 0.2349, + "step": 1496 + }, + { + "epoch": 0.7878947368421053, + "grad_norm": 0.9792536497116089, + "learning_rate": 2.9564996163325186e-05, + "loss": 1.1103, + "step": 1497 + }, + { + "epoch": 0.7884210526315789, + "grad_norm": 3.9059622287750244, + "learning_rate": 2.956440174375304e-05, + "loss": 0.0963, + "step": 1498 + }, + { + "epoch": 0.7889473684210526, + "grad_norm": 2.304387092590332, + "learning_rate": 2.9563806924313746e-05, + "loss": 1.1423, + "step": 1499 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 6.4439263343811035, + "learning_rate": 2.956321170502363e-05, + "loss": 1.7772, + "step": 1500 + }, + { + "epoch": 0.79, + "grad_norm": 2.323302745819092, + "learning_rate": 2.956261608589904e-05, + "loss": 0.9714, + "step": 1501 + }, + { + "epoch": 0.7905263157894736, + "grad_norm": 2.1680712699890137, + "learning_rate": 2.956202006695632e-05, + "loss": 1.3609, + "step": 1502 + }, + { + "epoch": 0.7910526315789473, + "grad_norm": 1.4925416707992554, + "learning_rate": 2.9561423648211842e-05, + "loss": 1.1193, + "step": 1503 + }, + { + "epoch": 0.791578947368421, + "grad_norm": 2.1561765670776367, + "learning_rate": 2.956082682968197e-05, + "loss": 1.3798, + "step": 1504 + }, + { + "epoch": 0.7921052631578948, + "grad_norm": 2.159775495529175, + "learning_rate": 2.9560229611383104e-05, + "loss": 1.2499, + "step": 1505 + }, + { + "epoch": 0.7926315789473685, + "grad_norm": 2.9001150131225586, + "learning_rate": 2.955963199333163e-05, + "loss": 1.2439, + "step": 1506 + }, + { + "epoch": 0.7931578947368421, + "grad_norm": 20.27045440673828, + "learning_rate": 2.9559033975543962e-05, + "loss": 1.3729, + "step": 1507 + }, + { + "epoch": 0.7936842105263158, + "grad_norm": 1.0026558637619019, + "learning_rate": 2.9558435558036507e-05, + "loss": 0.8089, + "step": 1508 + }, + { + "epoch": 0.7942105263157895, + "grad_norm": 2.685729503631592, + "learning_rate": 2.955783674082571e-05, + "loss": 0.9878, + "step": 1509 + }, + { + "epoch": 0.7947368421052632, + "grad_norm": 15.882140159606934, + "learning_rate": 2.9557237523928005e-05, + "loss": 2.2313, + "step": 1510 + }, + { + "epoch": 0.7952631578947369, + "grad_norm": 1.5289243459701538, + "learning_rate": 2.9556637907359845e-05, + "loss": 0.6709, + "step": 1511 + }, + { + "epoch": 0.7957894736842105, + "grad_norm": 1.3777986764907837, + "learning_rate": 2.9556037891137686e-05, + "loss": 0.8706, + "step": 1512 + }, + { + "epoch": 0.7963157894736842, + "grad_norm": 3.52500319480896, + "learning_rate": 2.9555437475278013e-05, + "loss": 2.0162, + "step": 1513 + }, + { + "epoch": 0.7968421052631579, + "grad_norm": 1.4890594482421875, + "learning_rate": 2.95548366597973e-05, + "loss": 0.8976, + "step": 1514 + }, + { + "epoch": 0.7973684210526316, + "grad_norm": 3.3992414474487305, + "learning_rate": 2.9554235444712045e-05, + "loss": 0.5914, + "step": 1515 + }, + { + "epoch": 0.7978947368421052, + "grad_norm": 4.6464152336120605, + "learning_rate": 2.9553633830038757e-05, + "loss": 2.0883, + "step": 1516 + }, + { + "epoch": 0.7984210526315789, + "grad_norm": 2.2460274696350098, + "learning_rate": 2.955303181579395e-05, + "loss": 1.3262, + "step": 1517 + }, + { + "epoch": 0.7989473684210526, + "grad_norm": 1.4003204107284546, + "learning_rate": 2.955242940199416e-05, + "loss": 1.3034, + "step": 1518 + }, + { + "epoch": 0.7994736842105263, + "grad_norm": 1.23296058177948, + "learning_rate": 2.955182658865592e-05, + "loss": 1.2704, + "step": 1519 + }, + { + "epoch": 0.8, + "grad_norm": 1.1843321323394775, + "learning_rate": 2.9551223375795778e-05, + "loss": 1.0394, + "step": 1520 + }, + { + "epoch": 0.8005263157894736, + "grad_norm": 1.3480000495910645, + "learning_rate": 2.95506197634303e-05, + "loss": 1.1087, + "step": 1521 + }, + { + "epoch": 0.8010526315789473, + "grad_norm": 1.0556117296218872, + "learning_rate": 2.9550015751576055e-05, + "loss": 0.9642, + "step": 1522 + }, + { + "epoch": 0.8015789473684211, + "grad_norm": 5.54053258895874, + "learning_rate": 2.9549411340249627e-05, + "loss": 1.3033, + "step": 1523 + }, + { + "epoch": 0.8021052631578948, + "grad_norm": 3.8652968406677246, + "learning_rate": 2.9548806529467617e-05, + "loss": 1.0023, + "step": 1524 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 3.5290725231170654, + "learning_rate": 2.954820131924662e-05, + "loss": 0.5537, + "step": 1525 + }, + { + "epoch": 0.8031578947368421, + "grad_norm": 0.9234074950218201, + "learning_rate": 2.9547595709603255e-05, + "loss": 0.9543, + "step": 1526 + }, + { + "epoch": 0.8036842105263158, + "grad_norm": 13.975532531738281, + "learning_rate": 2.9546989700554154e-05, + "loss": 1.2983, + "step": 1527 + }, + { + "epoch": 0.8042105263157895, + "grad_norm": 1.407110333442688, + "learning_rate": 2.9546383292115947e-05, + "loss": 1.2283, + "step": 1528 + }, + { + "epoch": 0.8047368421052632, + "grad_norm": 1.1215810775756836, + "learning_rate": 2.9545776484305293e-05, + "loss": 0.9641, + "step": 1529 + }, + { + "epoch": 0.8052631578947368, + "grad_norm": 2.0094728469848633, + "learning_rate": 2.9545169277138845e-05, + "loss": 1.0771, + "step": 1530 + }, + { + "epoch": 0.8057894736842105, + "grad_norm": 3.301666021347046, + "learning_rate": 2.9544561670633272e-05, + "loss": 0.5507, + "step": 1531 + }, + { + "epoch": 0.8063157894736842, + "grad_norm": 2.8523879051208496, + "learning_rate": 2.954395366480526e-05, + "loss": 0.414, + "step": 1532 + }, + { + "epoch": 0.8068421052631579, + "grad_norm": 5.4080986976623535, + "learning_rate": 2.9543345259671505e-05, + "loss": 0.9862, + "step": 1533 + }, + { + "epoch": 0.8073684210526316, + "grad_norm": 3.0157690048217773, + "learning_rate": 2.9542736455248702e-05, + "loss": 1.1812, + "step": 1534 + }, + { + "epoch": 0.8078947368421052, + "grad_norm": 2.6960086822509766, + "learning_rate": 2.9542127251553573e-05, + "loss": 0.8758, + "step": 1535 + }, + { + "epoch": 0.8084210526315789, + "grad_norm": 1.4676748514175415, + "learning_rate": 2.954151764860284e-05, + "loss": 1.2065, + "step": 1536 + }, + { + "epoch": 0.8089473684210526, + "grad_norm": 1.893027424812317, + "learning_rate": 2.9540907646413247e-05, + "loss": 1.7415, + "step": 1537 + }, + { + "epoch": 0.8094736842105263, + "grad_norm": 2.425906181335449, + "learning_rate": 2.954029724500153e-05, + "loss": 1.0721, + "step": 1538 + }, + { + "epoch": 0.81, + "grad_norm": 4.737662315368652, + "learning_rate": 2.953968644438445e-05, + "loss": 1.0819, + "step": 1539 + }, + { + "epoch": 0.8105263157894737, + "grad_norm": 2.0641164779663086, + "learning_rate": 2.9539075244578793e-05, + "loss": 1.6513, + "step": 1540 + }, + { + "epoch": 0.8110526315789474, + "grad_norm": 1.7806847095489502, + "learning_rate": 2.9538463645601317e-05, + "loss": 1.0087, + "step": 1541 + }, + { + "epoch": 0.8115789473684211, + "grad_norm": 1.2224940061569214, + "learning_rate": 2.9537851647468827e-05, + "loss": 1.0176, + "step": 1542 + }, + { + "epoch": 0.8121052631578948, + "grad_norm": 3.8275363445281982, + "learning_rate": 2.953723925019812e-05, + "loss": 0.2529, + "step": 1543 + }, + { + "epoch": 0.8126315789473684, + "grad_norm": 4.72785758972168, + "learning_rate": 2.9536626453806008e-05, + "loss": 1.2509, + "step": 1544 + }, + { + "epoch": 0.8131578947368421, + "grad_norm": 1.0455762147903442, + "learning_rate": 2.9536013258309323e-05, + "loss": 0.766, + "step": 1545 + }, + { + "epoch": 0.8136842105263158, + "grad_norm": 1.5553256273269653, + "learning_rate": 2.9535399663724893e-05, + "loss": 1.1175, + "step": 1546 + }, + { + "epoch": 0.8142105263157895, + "grad_norm": 2.9578685760498047, + "learning_rate": 2.953478567006957e-05, + "loss": 0.6569, + "step": 1547 + }, + { + "epoch": 0.8147368421052632, + "grad_norm": 1.4119994640350342, + "learning_rate": 2.9534171277360207e-05, + "loss": 1.2938, + "step": 1548 + }, + { + "epoch": 0.8152631578947368, + "grad_norm": 2.524916887283325, + "learning_rate": 2.9533556485613673e-05, + "loss": 0.4687, + "step": 1549 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 2.7491307258605957, + "learning_rate": 2.9532941294846848e-05, + "loss": 0.9021, + "step": 1550 + }, + { + "epoch": 0.8163157894736842, + "grad_norm": 3.837688446044922, + "learning_rate": 2.9532325705076622e-05, + "loss": 0.7552, + "step": 1551 + }, + { + "epoch": 0.8168421052631579, + "grad_norm": 1.9643511772155762, + "learning_rate": 2.9531709716319895e-05, + "loss": 1.2493, + "step": 1552 + }, + { + "epoch": 0.8173684210526316, + "grad_norm": 2.230646848678589, + "learning_rate": 2.953109332859358e-05, + "loss": 0.1658, + "step": 1553 + }, + { + "epoch": 0.8178947368421052, + "grad_norm": 4.156702995300293, + "learning_rate": 2.9530476541914602e-05, + "loss": 0.4678, + "step": 1554 + }, + { + "epoch": 0.8184210526315789, + "grad_norm": 10.191547393798828, + "learning_rate": 2.9529859356299894e-05, + "loss": 0.6664, + "step": 1555 + }, + { + "epoch": 0.8189473684210526, + "grad_norm": 1.6715644598007202, + "learning_rate": 2.9529241771766396e-05, + "loss": 0.929, + "step": 1556 + }, + { + "epoch": 0.8194736842105264, + "grad_norm": 1.5630298852920532, + "learning_rate": 2.9528623788331067e-05, + "loss": 1.5926, + "step": 1557 + }, + { + "epoch": 0.82, + "grad_norm": 0.9629634618759155, + "learning_rate": 2.9528005406010877e-05, + "loss": 0.7064, + "step": 1558 + }, + { + "epoch": 0.8205263157894737, + "grad_norm": 2.5397849082946777, + "learning_rate": 2.9527386624822805e-05, + "loss": 1.0266, + "step": 1559 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 0.8805807828903198, + "learning_rate": 2.952676744478383e-05, + "loss": 0.1851, + "step": 1560 + }, + { + "epoch": 0.8215789473684211, + "grad_norm": 1.243837594985962, + "learning_rate": 2.952614786591096e-05, + "loss": 1.4598, + "step": 1561 + }, + { + "epoch": 0.8221052631578948, + "grad_norm": 2.9784929752349854, + "learning_rate": 2.9525527888221203e-05, + "loss": 1.4596, + "step": 1562 + }, + { + "epoch": 0.8226315789473684, + "grad_norm": 1.0616602897644043, + "learning_rate": 2.9524907511731582e-05, + "loss": 1.151, + "step": 1563 + }, + { + "epoch": 0.8231578947368421, + "grad_norm": 3.9415078163146973, + "learning_rate": 2.9524286736459125e-05, + "loss": 0.7022, + "step": 1564 + }, + { + "epoch": 0.8236842105263158, + "grad_norm": 4.667412757873535, + "learning_rate": 2.9523665562420882e-05, + "loss": 1.614, + "step": 1565 + }, + { + "epoch": 0.8242105263157895, + "grad_norm": 2.3108322620391846, + "learning_rate": 2.95230439896339e-05, + "loss": 0.8213, + "step": 1566 + }, + { + "epoch": 0.8247368421052632, + "grad_norm": 1.3090465068817139, + "learning_rate": 2.9522422018115254e-05, + "loss": 0.9995, + "step": 1567 + }, + { + "epoch": 0.8252631578947368, + "grad_norm": 2.5572314262390137, + "learning_rate": 2.952179964788201e-05, + "loss": 0.3968, + "step": 1568 + }, + { + "epoch": 0.8257894736842105, + "grad_norm": 4.847593784332275, + "learning_rate": 2.9521176878951262e-05, + "loss": 1.5573, + "step": 1569 + }, + { + "epoch": 0.8263157894736842, + "grad_norm": 3.584441661834717, + "learning_rate": 2.9520553711340107e-05, + "loss": 1.6182, + "step": 1570 + }, + { + "epoch": 0.8268421052631579, + "grad_norm": 0.06894690543413162, + "learning_rate": 2.951993014506565e-05, + "loss": 0.0021, + "step": 1571 + }, + { + "epoch": 0.8273684210526315, + "grad_norm": 1.0614222288131714, + "learning_rate": 2.951930618014502e-05, + "loss": 1.1364, + "step": 1572 + }, + { + "epoch": 0.8278947368421052, + "grad_norm": 1.2423781156539917, + "learning_rate": 2.9518681816595337e-05, + "loss": 0.7434, + "step": 1573 + }, + { + "epoch": 0.828421052631579, + "grad_norm": 1.0514869689941406, + "learning_rate": 2.9518057054433753e-05, + "loss": 0.9144, + "step": 1574 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 6.573333263397217, + "learning_rate": 2.951743189367741e-05, + "loss": 0.6118, + "step": 1575 + }, + { + "epoch": 0.8294736842105264, + "grad_norm": 6.8987016677856445, + "learning_rate": 2.9516806334343482e-05, + "loss": 0.7301, + "step": 1576 + }, + { + "epoch": 0.83, + "grad_norm": 1.210350513458252, + "learning_rate": 2.951618037644914e-05, + "loss": 0.9962, + "step": 1577 + }, + { + "epoch": 0.8305263157894737, + "grad_norm": 4.0031328201293945, + "learning_rate": 2.9515554020011567e-05, + "loss": 1.1273, + "step": 1578 + }, + { + "epoch": 0.8310526315789474, + "grad_norm": 6.773041725158691, + "learning_rate": 2.9514927265047968e-05, + "loss": 2.3973, + "step": 1579 + }, + { + "epoch": 0.8315789473684211, + "grad_norm": 2.425973892211914, + "learning_rate": 2.951430011157554e-05, + "loss": 0.6053, + "step": 1580 + }, + { + "epoch": 0.8321052631578948, + "grad_norm": 6.728580951690674, + "learning_rate": 2.951367255961151e-05, + "loss": 0.9388, + "step": 1581 + }, + { + "epoch": 0.8326315789473684, + "grad_norm": 1.9585034847259521, + "learning_rate": 2.95130446091731e-05, + "loss": 1.0964, + "step": 1582 + }, + { + "epoch": 0.8331578947368421, + "grad_norm": 1.406274676322937, + "learning_rate": 2.9512416260277554e-05, + "loss": 1.4249, + "step": 1583 + }, + { + "epoch": 0.8336842105263158, + "grad_norm": 20.80072593688965, + "learning_rate": 2.9511787512942125e-05, + "loss": 0.8256, + "step": 1584 + }, + { + "epoch": 0.8342105263157895, + "grad_norm": 1.4816763401031494, + "learning_rate": 2.9511158367184078e-05, + "loss": 0.6075, + "step": 1585 + }, + { + "epoch": 0.8347368421052631, + "grad_norm": 6.253325939178467, + "learning_rate": 2.9510528823020683e-05, + "loss": 0.124, + "step": 1586 + }, + { + "epoch": 0.8352631578947368, + "grad_norm": 1.8239108324050903, + "learning_rate": 2.9509898880469215e-05, + "loss": 0.7881, + "step": 1587 + }, + { + "epoch": 0.8357894736842105, + "grad_norm": 9.914002418518066, + "learning_rate": 2.9509268539546985e-05, + "loss": 0.3637, + "step": 1588 + }, + { + "epoch": 0.8363157894736842, + "grad_norm": 2.900563955307007, + "learning_rate": 2.9508637800271293e-05, + "loss": 0.9361, + "step": 1589 + }, + { + "epoch": 0.8368421052631579, + "grad_norm": 1.4834281206130981, + "learning_rate": 2.9508006662659447e-05, + "loss": 0.4473, + "step": 1590 + }, + { + "epoch": 0.8373684210526315, + "grad_norm": 2.494849443435669, + "learning_rate": 2.9507375126728787e-05, + "loss": 1.2612, + "step": 1591 + }, + { + "epoch": 0.8378947368421052, + "grad_norm": 1.2127383947372437, + "learning_rate": 2.950674319249665e-05, + "loss": 0.8488, + "step": 1592 + }, + { + "epoch": 0.838421052631579, + "grad_norm": 1.7087959051132202, + "learning_rate": 2.950611085998038e-05, + "loss": 0.1118, + "step": 1593 + }, + { + "epoch": 0.8389473684210527, + "grad_norm": 1.0232149362564087, + "learning_rate": 2.9505478129197343e-05, + "loss": 0.0376, + "step": 1594 + }, + { + "epoch": 0.8394736842105263, + "grad_norm": 1.3383780717849731, + "learning_rate": 2.950484500016491e-05, + "loss": 1.1654, + "step": 1595 + }, + { + "epoch": 0.84, + "grad_norm": 1.3777598142623901, + "learning_rate": 2.950421147290046e-05, + "loss": 0.8772, + "step": 1596 + }, + { + "epoch": 0.8405263157894737, + "grad_norm": 3.801234722137451, + "learning_rate": 2.950357754742139e-05, + "loss": 1.8463, + "step": 1597 + }, + { + "epoch": 0.8410526315789474, + "grad_norm": 1.922048568725586, + "learning_rate": 2.9502943223745104e-05, + "loss": 1.6611, + "step": 1598 + }, + { + "epoch": 0.8415789473684211, + "grad_norm": 2.133612632751465, + "learning_rate": 2.9502308501889016e-05, + "loss": 2.0384, + "step": 1599 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 3.9198570251464844, + "learning_rate": 2.950167338187056e-05, + "loss": 1.6621, + "step": 1600 + }, + { + "epoch": 0.8426315789473684, + "grad_norm": 2.9979448318481445, + "learning_rate": 2.950103786370716e-05, + "loss": 0.2946, + "step": 1601 + }, + { + "epoch": 0.8431578947368421, + "grad_norm": 1.4680919647216797, + "learning_rate": 2.950040194741627e-05, + "loss": 1.1285, + "step": 1602 + }, + { + "epoch": 0.8436842105263158, + "grad_norm": 2.139376640319824, + "learning_rate": 2.9499765633015354e-05, + "loss": 0.8158, + "step": 1603 + }, + { + "epoch": 0.8442105263157895, + "grad_norm": 2.2096030712127686, + "learning_rate": 2.9499128920521875e-05, + "loss": 0.0758, + "step": 1604 + }, + { + "epoch": 0.8447368421052631, + "grad_norm": 2.995516300201416, + "learning_rate": 2.949849180995332e-05, + "loss": 0.8989, + "step": 1605 + }, + { + "epoch": 0.8452631578947368, + "grad_norm": 3.639786958694458, + "learning_rate": 2.9497854301327175e-05, + "loss": 1.5573, + "step": 1606 + }, + { + "epoch": 0.8457894736842105, + "grad_norm": 9.909292221069336, + "learning_rate": 2.9497216394660948e-05, + "loss": 0.7076, + "step": 1607 + }, + { + "epoch": 0.8463157894736842, + "grad_norm": 8.76042366027832, + "learning_rate": 2.949657808997215e-05, + "loss": 1.8939, + "step": 1608 + }, + { + "epoch": 0.8468421052631578, + "grad_norm": 8.321390151977539, + "learning_rate": 2.9495939387278303e-05, + "loss": 0.3237, + "step": 1609 + }, + { + "epoch": 0.8473684210526315, + "grad_norm": 2.2840054035186768, + "learning_rate": 2.949530028659695e-05, + "loss": 1.3387, + "step": 1610 + }, + { + "epoch": 0.8478947368421053, + "grad_norm": 1.5071499347686768, + "learning_rate": 2.9494660787945634e-05, + "loss": 1.1994, + "step": 1611 + }, + { + "epoch": 0.848421052631579, + "grad_norm": 8.956307411193848, + "learning_rate": 2.9494020891341912e-05, + "loss": 1.5923, + "step": 1612 + }, + { + "epoch": 0.8489473684210527, + "grad_norm": 1.6805635690689087, + "learning_rate": 2.949338059680335e-05, + "loss": 1.6323, + "step": 1613 + }, + { + "epoch": 0.8494736842105263, + "grad_norm": 0.6971753835678101, + "learning_rate": 2.9492739904347533e-05, + "loss": 0.0236, + "step": 1614 + }, + { + "epoch": 0.85, + "grad_norm": 1.6537336111068726, + "learning_rate": 2.9492098813992045e-05, + "loss": 1.2005, + "step": 1615 + }, + { + "epoch": 0.8505263157894737, + "grad_norm": 1.5291582345962524, + "learning_rate": 2.9491457325754495e-05, + "loss": 1.1125, + "step": 1616 + }, + { + "epoch": 0.8510526315789474, + "grad_norm": 2.4934237003326416, + "learning_rate": 2.949081543965249e-05, + "loss": 1.3744, + "step": 1617 + }, + { + "epoch": 0.8515789473684211, + "grad_norm": 2.8750171661376953, + "learning_rate": 2.949017315570365e-05, + "loss": 1.6325, + "step": 1618 + }, + { + "epoch": 0.8521052631578947, + "grad_norm": 0.9199529886245728, + "learning_rate": 2.9489530473925615e-05, + "loss": 1.0891, + "step": 1619 + }, + { + "epoch": 0.8526315789473684, + "grad_norm": 4.175845146179199, + "learning_rate": 2.9488887394336025e-05, + "loss": 0.306, + "step": 1620 + }, + { + "epoch": 0.8531578947368421, + "grad_norm": 2.5200769901275635, + "learning_rate": 2.948824391695254e-05, + "loss": 0.2277, + "step": 1621 + }, + { + "epoch": 0.8536842105263158, + "grad_norm": 1.024670124053955, + "learning_rate": 2.9487600041792825e-05, + "loss": 1.073, + "step": 1622 + }, + { + "epoch": 0.8542105263157894, + "grad_norm": 3.6856977939605713, + "learning_rate": 2.9486955768874555e-05, + "loss": 0.9715, + "step": 1623 + }, + { + "epoch": 0.8547368421052631, + "grad_norm": 2.900958776473999, + "learning_rate": 2.9486311098215425e-05, + "loss": 1.142, + "step": 1624 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 1.7244210243225098, + "learning_rate": 2.948566602983313e-05, + "loss": 1.1344, + "step": 1625 + }, + { + "epoch": 0.8557894736842105, + "grad_norm": 5.9757232666015625, + "learning_rate": 2.948502056374538e-05, + "loss": 0.8719, + "step": 1626 + }, + { + "epoch": 0.8563157894736843, + "grad_norm": 3.8142871856689453, + "learning_rate": 2.94843746999699e-05, + "loss": 0.8588, + "step": 1627 + }, + { + "epoch": 0.8568421052631578, + "grad_norm": 5.312744617462158, + "learning_rate": 2.9483728438524417e-05, + "loss": 0.9706, + "step": 1628 + }, + { + "epoch": 0.8573684210526316, + "grad_norm": 0.9603210687637329, + "learning_rate": 2.9483081779426678e-05, + "loss": 0.8257, + "step": 1629 + }, + { + "epoch": 0.8578947368421053, + "grad_norm": 12.436172485351562, + "learning_rate": 2.9482434722694434e-05, + "loss": 1.303, + "step": 1630 + }, + { + "epoch": 0.858421052631579, + "grad_norm": 2.7977070808410645, + "learning_rate": 2.9481787268345456e-05, + "loss": 1.4104, + "step": 1631 + }, + { + "epoch": 0.8589473684210527, + "grad_norm": 1.3737499713897705, + "learning_rate": 2.9481139416397512e-05, + "loss": 1.7754, + "step": 1632 + }, + { + "epoch": 0.8594736842105263, + "grad_norm": 4.523530960083008, + "learning_rate": 2.9480491166868396e-05, + "loss": 1.9328, + "step": 1633 + }, + { + "epoch": 0.86, + "grad_norm": 14.998103141784668, + "learning_rate": 2.9479842519775903e-05, + "loss": 1.3404, + "step": 1634 + }, + { + "epoch": 0.8605263157894737, + "grad_norm": 3.4863827228546143, + "learning_rate": 2.9479193475137834e-05, + "loss": 1.7372, + "step": 1635 + }, + { + "epoch": 0.8610526315789474, + "grad_norm": 3.7734429836273193, + "learning_rate": 2.9478544032972024e-05, + "loss": 2.3958, + "step": 1636 + }, + { + "epoch": 0.861578947368421, + "grad_norm": 1.744239091873169, + "learning_rate": 2.9477894193296295e-05, + "loss": 0.9227, + "step": 1637 + }, + { + "epoch": 0.8621052631578947, + "grad_norm": 2.1991264820098877, + "learning_rate": 2.9477243956128484e-05, + "loss": 1.5509, + "step": 1638 + }, + { + "epoch": 0.8626315789473684, + "grad_norm": 13.812848091125488, + "learning_rate": 2.9476593321486455e-05, + "loss": 1.1571, + "step": 1639 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 5.264194488525391, + "learning_rate": 2.9475942289388056e-05, + "loss": 1.2813, + "step": 1640 + }, + { + "epoch": 0.8636842105263158, + "grad_norm": 1.047844648361206, + "learning_rate": 2.9475290859851173e-05, + "loss": 0.9731, + "step": 1641 + }, + { + "epoch": 0.8642105263157894, + "grad_norm": 6.97605562210083, + "learning_rate": 2.9474639032893685e-05, + "loss": 1.1533, + "step": 1642 + }, + { + "epoch": 0.8647368421052631, + "grad_norm": 1.112858772277832, + "learning_rate": 2.9473986808533495e-05, + "loss": 1.062, + "step": 1643 + }, + { + "epoch": 0.8652631578947368, + "grad_norm": 1.1823147535324097, + "learning_rate": 2.9473334186788503e-05, + "loss": 1.3267, + "step": 1644 + }, + { + "epoch": 0.8657894736842106, + "grad_norm": 1.9988354444503784, + "learning_rate": 2.947268116767663e-05, + "loss": 0.8981, + "step": 1645 + }, + { + "epoch": 0.8663157894736843, + "grad_norm": 10.701164245605469, + "learning_rate": 2.9472027751215803e-05, + "loss": 1.2634, + "step": 1646 + }, + { + "epoch": 0.8668421052631579, + "grad_norm": 8.62368106842041, + "learning_rate": 2.9471373937423963e-05, + "loss": 0.6866, + "step": 1647 + }, + { + "epoch": 0.8673684210526316, + "grad_norm": 7.085811138153076, + "learning_rate": 2.947071972631906e-05, + "loss": 0.4653, + "step": 1648 + }, + { + "epoch": 0.8678947368421053, + "grad_norm": 0.7952293753623962, + "learning_rate": 2.9470065117919057e-05, + "loss": 0.688, + "step": 1649 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.9955103993415833, + "learning_rate": 2.9469410112241925e-05, + "loss": 0.8748, + "step": 1650 + }, + { + "epoch": 0.8689473684210526, + "grad_norm": 1.0168907642364502, + "learning_rate": 2.9468754709305643e-05, + "loss": 1.3661, + "step": 1651 + }, + { + "epoch": 0.8694736842105263, + "grad_norm": 5.310661315917969, + "learning_rate": 2.9468098909128212e-05, + "loss": 0.8655, + "step": 1652 + }, + { + "epoch": 0.87, + "grad_norm": 5.678101539611816, + "learning_rate": 2.9467442711727637e-05, + "loss": 0.914, + "step": 1653 + }, + { + "epoch": 0.8705263157894737, + "grad_norm": 7.018301963806152, + "learning_rate": 2.9466786117121928e-05, + "loss": 1.3955, + "step": 1654 + }, + { + "epoch": 0.8710526315789474, + "grad_norm": 1.5973879098892212, + "learning_rate": 2.9466129125329114e-05, + "loss": 1.1749, + "step": 1655 + }, + { + "epoch": 0.871578947368421, + "grad_norm": 1.043853521347046, + "learning_rate": 2.9465471736367234e-05, + "loss": 0.0401, + "step": 1656 + }, + { + "epoch": 0.8721052631578947, + "grad_norm": 1.4394428730010986, + "learning_rate": 2.9464813950254336e-05, + "loss": 0.0699, + "step": 1657 + }, + { + "epoch": 0.8726315789473684, + "grad_norm": 1.1438318490982056, + "learning_rate": 2.9464155767008485e-05, + "loss": 1.5743, + "step": 1658 + }, + { + "epoch": 0.8731578947368421, + "grad_norm": 7.7294392585754395, + "learning_rate": 2.9463497186647747e-05, + "loss": 0.961, + "step": 1659 + }, + { + "epoch": 0.8736842105263158, + "grad_norm": 2.75779390335083, + "learning_rate": 2.9462838209190198e-05, + "loss": 0.7825, + "step": 1660 + }, + { + "epoch": 0.8742105263157894, + "grad_norm": 1.5052375793457031, + "learning_rate": 2.9462178834653937e-05, + "loss": 1.0688, + "step": 1661 + }, + { + "epoch": 0.8747368421052631, + "grad_norm": 4.883263111114502, + "learning_rate": 2.9461519063057064e-05, + "loss": 0.4308, + "step": 1662 + }, + { + "epoch": 0.8752631578947369, + "grad_norm": 2.8982460498809814, + "learning_rate": 2.9460858894417694e-05, + "loss": 0.4369, + "step": 1663 + }, + { + "epoch": 0.8757894736842106, + "grad_norm": 38.408145904541016, + "learning_rate": 2.9460198328753955e-05, + "loss": 2.1035, + "step": 1664 + }, + { + "epoch": 0.8763157894736842, + "grad_norm": 1.7316091060638428, + "learning_rate": 2.9459537366083983e-05, + "loss": 0.9198, + "step": 1665 + }, + { + "epoch": 0.8768421052631579, + "grad_norm": 0.7877925634384155, + "learning_rate": 2.945887600642592e-05, + "loss": 0.5887, + "step": 1666 + }, + { + "epoch": 0.8773684210526316, + "grad_norm": 1.3625367879867554, + "learning_rate": 2.9458214249797924e-05, + "loss": 1.1236, + "step": 1667 + }, + { + "epoch": 0.8778947368421053, + "grad_norm": 1.776649832725525, + "learning_rate": 2.9457552096218168e-05, + "loss": 1.2667, + "step": 1668 + }, + { + "epoch": 0.878421052631579, + "grad_norm": 5.151728630065918, + "learning_rate": 2.945688954570483e-05, + "loss": 1.5045, + "step": 1669 + }, + { + "epoch": 0.8789473684210526, + "grad_norm": 1.2965806722640991, + "learning_rate": 2.9456226598276097e-05, + "loss": 1.1484, + "step": 1670 + }, + { + "epoch": 0.8794736842105263, + "grad_norm": 1.1582598686218262, + "learning_rate": 2.9455563253950176e-05, + "loss": 1.1762, + "step": 1671 + }, + { + "epoch": 0.88, + "grad_norm": 1.36055588722229, + "learning_rate": 2.945489951274527e-05, + "loss": 1.2704, + "step": 1672 + }, + { + "epoch": 0.8805263157894737, + "grad_norm": 5.386001110076904, + "learning_rate": 2.9454235374679612e-05, + "loss": 1.0296, + "step": 1673 + }, + { + "epoch": 0.8810526315789474, + "grad_norm": 4.373623371124268, + "learning_rate": 2.9453570839771432e-05, + "loss": 1.3767, + "step": 1674 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 1.0014320611953735, + "learning_rate": 2.9452905908038975e-05, + "loss": 1.3597, + "step": 1675 + }, + { + "epoch": 0.8821052631578947, + "grad_norm": 4.0518412590026855, + "learning_rate": 2.9452240579500496e-05, + "loss": 0.7262, + "step": 1676 + }, + { + "epoch": 0.8826315789473684, + "grad_norm": 2.685443878173828, + "learning_rate": 2.9451574854174265e-05, + "loss": 1.2386, + "step": 1677 + }, + { + "epoch": 0.8831578947368421, + "grad_norm": 2.57368540763855, + "learning_rate": 2.9450908732078553e-05, + "loss": 1.0309, + "step": 1678 + }, + { + "epoch": 0.8836842105263157, + "grad_norm": 1.2405550479888916, + "learning_rate": 2.9450242213231654e-05, + "loss": 1.1075, + "step": 1679 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 1.6623353958129883, + "learning_rate": 2.9449575297651865e-05, + "loss": 0.9361, + "step": 1680 + }, + { + "epoch": 0.8847368421052632, + "grad_norm": 14.208858489990234, + "learning_rate": 2.9448907985357498e-05, + "loss": 2.1596, + "step": 1681 + }, + { + "epoch": 0.8852631578947369, + "grad_norm": 1.5413142442703247, + "learning_rate": 2.944824027636687e-05, + "loss": 1.4468, + "step": 1682 + }, + { + "epoch": 0.8857894736842106, + "grad_norm": 0.9851499795913696, + "learning_rate": 2.9447572170698324e-05, + "loss": 0.9406, + "step": 1683 + }, + { + "epoch": 0.8863157894736842, + "grad_norm": 0.8356818556785583, + "learning_rate": 2.9446903668370188e-05, + "loss": 0.0297, + "step": 1684 + }, + { + "epoch": 0.8868421052631579, + "grad_norm": 2.6945416927337646, + "learning_rate": 2.944623476940082e-05, + "loss": 1.2829, + "step": 1685 + }, + { + "epoch": 0.8873684210526316, + "grad_norm": 33.185028076171875, + "learning_rate": 2.9445565473808593e-05, + "loss": 0.9756, + "step": 1686 + }, + { + "epoch": 0.8878947368421053, + "grad_norm": 1.7505784034729004, + "learning_rate": 2.9444895781611876e-05, + "loss": 1.2576, + "step": 1687 + }, + { + "epoch": 0.888421052631579, + "grad_norm": 4.850620269775391, + "learning_rate": 2.944422569282906e-05, + "loss": 0.1032, + "step": 1688 + }, + { + "epoch": 0.8889473684210526, + "grad_norm": 1.0093870162963867, + "learning_rate": 2.9443555207478536e-05, + "loss": 0.9847, + "step": 1689 + }, + { + "epoch": 0.8894736842105263, + "grad_norm": 2.135535955429077, + "learning_rate": 2.9442884325578714e-05, + "loss": 1.3696, + "step": 1690 + }, + { + "epoch": 0.89, + "grad_norm": 3.9265260696411133, + "learning_rate": 2.9442213047148012e-05, + "loss": 0.9913, + "step": 1691 + }, + { + "epoch": 0.8905263157894737, + "grad_norm": 4.452930927276611, + "learning_rate": 2.944154137220487e-05, + "loss": 0.7837, + "step": 1692 + }, + { + "epoch": 0.8910526315789473, + "grad_norm": 2.796504259109497, + "learning_rate": 2.944086930076771e-05, + "loss": 0.442, + "step": 1693 + }, + { + "epoch": 0.891578947368421, + "grad_norm": 2.876086950302124, + "learning_rate": 2.9440196832855004e-05, + "loss": 1.2997, + "step": 1694 + }, + { + "epoch": 0.8921052631578947, + "grad_norm": 1.2075906991958618, + "learning_rate": 2.94395239684852e-05, + "loss": 0.8753, + "step": 1695 + }, + { + "epoch": 0.8926315789473684, + "grad_norm": 7.354285717010498, + "learning_rate": 2.9438850707676786e-05, + "loss": 0.9609, + "step": 1696 + }, + { + "epoch": 0.8931578947368422, + "grad_norm": 5.216385364532471, + "learning_rate": 2.943817705044823e-05, + "loss": 1.3972, + "step": 1697 + }, + { + "epoch": 0.8936842105263157, + "grad_norm": 1.4113494157791138, + "learning_rate": 2.9437502996818035e-05, + "loss": 1.4202, + "step": 1698 + }, + { + "epoch": 0.8942105263157895, + "grad_norm": 6.46826696395874, + "learning_rate": 2.9436828546804707e-05, + "loss": 0.6637, + "step": 1699 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 1.4000974893569946, + "learning_rate": 2.943615370042677e-05, + "loss": 0.0493, + "step": 1700 + }, + { + "epoch": 0.8952631578947369, + "grad_norm": 1.1557705402374268, + "learning_rate": 2.943547845770274e-05, + "loss": 0.4349, + "step": 1701 + }, + { + "epoch": 0.8957894736842106, + "grad_norm": 8.672079086303711, + "learning_rate": 2.943480281865116e-05, + "loss": 1.1025, + "step": 1702 + }, + { + "epoch": 0.8963157894736842, + "grad_norm": 1.1264904737472534, + "learning_rate": 2.943412678329058e-05, + "loss": 1.0514, + "step": 1703 + }, + { + "epoch": 0.8968421052631579, + "grad_norm": 0.8493797183036804, + "learning_rate": 2.9433450351639567e-05, + "loss": 0.7548, + "step": 1704 + }, + { + "epoch": 0.8973684210526316, + "grad_norm": 3.7704265117645264, + "learning_rate": 2.9432773523716683e-05, + "loss": 1.2445, + "step": 1705 + }, + { + "epoch": 0.8978947368421053, + "grad_norm": 1.2486798763275146, + "learning_rate": 2.9432096299540518e-05, + "loss": 1.1246, + "step": 1706 + }, + { + "epoch": 0.8984210526315789, + "grad_norm": 1.134177803993225, + "learning_rate": 2.9431418679129655e-05, + "loss": 0.8451, + "step": 1707 + }, + { + "epoch": 0.8989473684210526, + "grad_norm": 1.9805290699005127, + "learning_rate": 2.9430740662502712e-05, + "loss": 0.4611, + "step": 1708 + }, + { + "epoch": 0.8994736842105263, + "grad_norm": 2.323180675506592, + "learning_rate": 2.9430062249678297e-05, + "loss": 1.2539, + "step": 1709 + }, + { + "epoch": 0.9, + "grad_norm": 1.190372347831726, + "learning_rate": 2.942938344067503e-05, + "loss": 0.992, + "step": 1710 + }, + { + "epoch": 0.9005263157894737, + "grad_norm": 2.458120822906494, + "learning_rate": 2.9428704235511557e-05, + "loss": 1.1641, + "step": 1711 + }, + { + "epoch": 0.9010526315789473, + "grad_norm": 1.1921024322509766, + "learning_rate": 2.942802463420652e-05, + "loss": 0.9713, + "step": 1712 + }, + { + "epoch": 0.901578947368421, + "grad_norm": 1.0940788984298706, + "learning_rate": 2.942734463677858e-05, + "loss": 0.8756, + "step": 1713 + }, + { + "epoch": 0.9021052631578947, + "grad_norm": 1.3359793424606323, + "learning_rate": 2.9426664243246404e-05, + "loss": 1.3792, + "step": 1714 + }, + { + "epoch": 0.9026315789473685, + "grad_norm": 2.6136605739593506, + "learning_rate": 2.9425983453628677e-05, + "loss": 0.954, + "step": 1715 + }, + { + "epoch": 0.9031578947368422, + "grad_norm": 1.5081348419189453, + "learning_rate": 2.942530226794409e-05, + "loss": 1.3498, + "step": 1716 + }, + { + "epoch": 0.9036842105263158, + "grad_norm": 3.4692232608795166, + "learning_rate": 2.942462068621134e-05, + "loss": 1.0917, + "step": 1717 + }, + { + "epoch": 0.9042105263157895, + "grad_norm": 4.044422149658203, + "learning_rate": 2.942393870844914e-05, + "loss": 1.806, + "step": 1718 + }, + { + "epoch": 0.9047368421052632, + "grad_norm": 1.875767707824707, + "learning_rate": 2.9423256334676215e-05, + "loss": 1.1397, + "step": 1719 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 2.141014337539673, + "learning_rate": 2.9422573564911305e-05, + "loss": 1.2411, + "step": 1720 + }, + { + "epoch": 0.9057894736842105, + "grad_norm": 1.6898494958877563, + "learning_rate": 2.9421890399173153e-05, + "loss": 1.4801, + "step": 1721 + }, + { + "epoch": 0.9063157894736842, + "grad_norm": 2.024906635284424, + "learning_rate": 2.942120683748051e-05, + "loss": 0.317, + "step": 1722 + }, + { + "epoch": 0.9068421052631579, + "grad_norm": 18.252716064453125, + "learning_rate": 2.9420522879852148e-05, + "loss": 0.7255, + "step": 1723 + }, + { + "epoch": 0.9073684210526316, + "grad_norm": 2.799224853515625, + "learning_rate": 2.9419838526306845e-05, + "loss": 0.7641, + "step": 1724 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 1.1127315759658813, + "learning_rate": 2.941915377686339e-05, + "loss": 1.0762, + "step": 1725 + }, + { + "epoch": 0.9084210526315789, + "grad_norm": 2.6971869468688965, + "learning_rate": 2.9418468631540578e-05, + "loss": 1.15, + "step": 1726 + }, + { + "epoch": 0.9089473684210526, + "grad_norm": 1.2686923742294312, + "learning_rate": 2.9417783090357224e-05, + "loss": 0.9524, + "step": 1727 + }, + { + "epoch": 0.9094736842105263, + "grad_norm": 3.2218449115753174, + "learning_rate": 2.9417097153332152e-05, + "loss": 1.7397, + "step": 1728 + }, + { + "epoch": 0.91, + "grad_norm": 1.8627907037734985, + "learning_rate": 2.941641082048419e-05, + "loss": 1.2889, + "step": 1729 + }, + { + "epoch": 0.9105263157894737, + "grad_norm": 3.6902270317077637, + "learning_rate": 2.9415724091832184e-05, + "loss": 1.9215, + "step": 1730 + }, + { + "epoch": 0.9110526315789473, + "grad_norm": 1.4535077810287476, + "learning_rate": 2.9415036967394988e-05, + "loss": 1.0473, + "step": 1731 + }, + { + "epoch": 0.911578947368421, + "grad_norm": 1.3268829584121704, + "learning_rate": 2.9414349447191466e-05, + "loss": 1.0177, + "step": 1732 + }, + { + "epoch": 0.9121052631578948, + "grad_norm": 1.7916074991226196, + "learning_rate": 2.9413661531240493e-05, + "loss": 1.0315, + "step": 1733 + }, + { + "epoch": 0.9126315789473685, + "grad_norm": 1.504307508468628, + "learning_rate": 2.941297321956096e-05, + "loss": 1.1304, + "step": 1734 + }, + { + "epoch": 0.9131578947368421, + "grad_norm": 4.602481365203857, + "learning_rate": 2.9412284512171756e-05, + "loss": 1.52, + "step": 1735 + }, + { + "epoch": 0.9136842105263158, + "grad_norm": 4.820444107055664, + "learning_rate": 2.94115954090918e-05, + "loss": 1.8141, + "step": 1736 + }, + { + "epoch": 0.9142105263157895, + "grad_norm": 1.313570261001587, + "learning_rate": 2.9410905910340004e-05, + "loss": 0.741, + "step": 1737 + }, + { + "epoch": 0.9147368421052632, + "grad_norm": 1.3603005409240723, + "learning_rate": 2.9410216015935304e-05, + "loss": 1.313, + "step": 1738 + }, + { + "epoch": 0.9152631578947369, + "grad_norm": 4.094880104064941, + "learning_rate": 2.9409525725896636e-05, + "loss": 1.0756, + "step": 1739 + }, + { + "epoch": 0.9157894736842105, + "grad_norm": 1.1733410358428955, + "learning_rate": 2.9408835040242953e-05, + "loss": 0.9995, + "step": 1740 + }, + { + "epoch": 0.9163157894736842, + "grad_norm": 3.4846761226654053, + "learning_rate": 2.9408143958993218e-05, + "loss": 0.9578, + "step": 1741 + }, + { + "epoch": 0.9168421052631579, + "grad_norm": 1.4159212112426758, + "learning_rate": 2.9407452482166412e-05, + "loss": 1.1337, + "step": 1742 + }, + { + "epoch": 0.9173684210526316, + "grad_norm": 7.647956371307373, + "learning_rate": 2.9406760609781507e-05, + "loss": 1.4257, + "step": 1743 + }, + { + "epoch": 0.9178947368421052, + "grad_norm": 1.0013185739517212, + "learning_rate": 2.9406068341857505e-05, + "loss": 1.2189, + "step": 1744 + }, + { + "epoch": 0.9184210526315789, + "grad_norm": 0.9043682813644409, + "learning_rate": 2.9405375678413417e-05, + "loss": 0.6736, + "step": 1745 + }, + { + "epoch": 0.9189473684210526, + "grad_norm": 0.8028733134269714, + "learning_rate": 2.940468261946825e-05, + "loss": 0.9183, + "step": 1746 + }, + { + "epoch": 0.9194736842105263, + "grad_norm": 11.042693138122559, + "learning_rate": 2.9403989165041044e-05, + "loss": 1.6583, + "step": 1747 + }, + { + "epoch": 0.92, + "grad_norm": 3.432318687438965, + "learning_rate": 2.940329531515082e-05, + "loss": 0.7686, + "step": 1748 + }, + { + "epoch": 0.9205263157894736, + "grad_norm": 6.241179466247559, + "learning_rate": 2.9402601069816645e-05, + "loss": 0.654, + "step": 1749 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 4.325395584106445, + "learning_rate": 2.9401906429057574e-05, + "loss": 0.3556, + "step": 1750 + }, + { + "epoch": 0.921578947368421, + "grad_norm": 1.5290063619613647, + "learning_rate": 2.9401211392892677e-05, + "loss": 1.601, + "step": 1751 + }, + { + "epoch": 0.9221052631578948, + "grad_norm": 1.4590153694152832, + "learning_rate": 2.9400515961341035e-05, + "loss": 1.0342, + "step": 1752 + }, + { + "epoch": 0.9226315789473685, + "grad_norm": 2.1654052734375, + "learning_rate": 2.939982013442174e-05, + "loss": 1.6038, + "step": 1753 + }, + { + "epoch": 0.9231578947368421, + "grad_norm": 11.959127426147461, + "learning_rate": 2.9399123912153908e-05, + "loss": 0.6332, + "step": 1754 + }, + { + "epoch": 0.9236842105263158, + "grad_norm": 2.1623222827911377, + "learning_rate": 2.939842729455664e-05, + "loss": 0.1993, + "step": 1755 + }, + { + "epoch": 0.9242105263157895, + "grad_norm": 2.1751246452331543, + "learning_rate": 2.9397730281649067e-05, + "loss": 1.1215, + "step": 1756 + }, + { + "epoch": 0.9247368421052632, + "grad_norm": 3.4143033027648926, + "learning_rate": 2.9397032873450323e-05, + "loss": 0.9459, + "step": 1757 + }, + { + "epoch": 0.9252631578947368, + "grad_norm": 1.3410007953643799, + "learning_rate": 2.9396335069979562e-05, + "loss": 0.9964, + "step": 1758 + }, + { + "epoch": 0.9257894736842105, + "grad_norm": 1.164724588394165, + "learning_rate": 2.9395636871255933e-05, + "loss": 0.3303, + "step": 1759 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 1.904927372932434, + "learning_rate": 2.9394938277298613e-05, + "loss": 0.7966, + "step": 1760 + }, + { + "epoch": 0.9268421052631579, + "grad_norm": 2.1621692180633545, + "learning_rate": 2.9394239288126782e-05, + "loss": 1.0655, + "step": 1761 + }, + { + "epoch": 0.9273684210526316, + "grad_norm": 1.9396727085113525, + "learning_rate": 2.939353990375962e-05, + "loss": 1.1009, + "step": 1762 + }, + { + "epoch": 0.9278947368421052, + "grad_norm": 2.1531522274017334, + "learning_rate": 2.9392840124216342e-05, + "loss": 0.9206, + "step": 1763 + }, + { + "epoch": 0.9284210526315789, + "grad_norm": 1.473966360092163, + "learning_rate": 2.9392139949516154e-05, + "loss": 1.0062, + "step": 1764 + }, + { + "epoch": 0.9289473684210526, + "grad_norm": 5.68750524520874, + "learning_rate": 2.939143937967828e-05, + "loss": 1.3941, + "step": 1765 + }, + { + "epoch": 0.9294736842105263, + "grad_norm": 1.1354111433029175, + "learning_rate": 2.9390738414721954e-05, + "loss": 1.3447, + "step": 1766 + }, + { + "epoch": 0.93, + "grad_norm": 4.276787757873535, + "learning_rate": 2.939003705466642e-05, + "loss": 1.534, + "step": 1767 + }, + { + "epoch": 0.9305263157894736, + "grad_norm": 1.2840847969055176, + "learning_rate": 2.938933529953094e-05, + "loss": 0.7145, + "step": 1768 + }, + { + "epoch": 0.9310526315789474, + "grad_norm": 1.7286664247512817, + "learning_rate": 2.9388633149334777e-05, + "loss": 0.1145, + "step": 1769 + }, + { + "epoch": 0.9315789473684211, + "grad_norm": 6.341940879821777, + "learning_rate": 2.9387930604097205e-05, + "loss": 2.5274, + "step": 1770 + }, + { + "epoch": 0.9321052631578948, + "grad_norm": 1.1878658533096313, + "learning_rate": 2.938722766383751e-05, + "loss": 0.5424, + "step": 1771 + }, + { + "epoch": 0.9326315789473684, + "grad_norm": 3.3078019618988037, + "learning_rate": 2.9386524328575003e-05, + "loss": 0.2485, + "step": 1772 + }, + { + "epoch": 0.9331578947368421, + "grad_norm": 1.6980602741241455, + "learning_rate": 2.938582059832899e-05, + "loss": 1.1616, + "step": 1773 + }, + { + "epoch": 0.9336842105263158, + "grad_norm": 3.3054215908050537, + "learning_rate": 2.9385116473118785e-05, + "loss": 1.3687, + "step": 1774 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 3.2444324493408203, + "learning_rate": 2.9384411952963724e-05, + "loss": 1.7401, + "step": 1775 + }, + { + "epoch": 0.9347368421052632, + "grad_norm": 1.1763970851898193, + "learning_rate": 2.9383707037883153e-05, + "loss": 1.25, + "step": 1776 + }, + { + "epoch": 0.9352631578947368, + "grad_norm": 1.8481565713882446, + "learning_rate": 2.938300172789642e-05, + "loss": 0.904, + "step": 1777 + }, + { + "epoch": 0.9357894736842105, + "grad_norm": 1.5394039154052734, + "learning_rate": 2.9382296023022895e-05, + "loss": 1.2871, + "step": 1778 + }, + { + "epoch": 0.9363157894736842, + "grad_norm": 2.9562089443206787, + "learning_rate": 2.9381589923281952e-05, + "loss": 0.9019, + "step": 1779 + }, + { + "epoch": 0.9368421052631579, + "grad_norm": 2.452955961227417, + "learning_rate": 2.9380883428692972e-05, + "loss": 0.6759, + "step": 1780 + }, + { + "epoch": 0.9373684210526316, + "grad_norm": 5.483946800231934, + "learning_rate": 2.9380176539275355e-05, + "loss": 1.4204, + "step": 1781 + }, + { + "epoch": 0.9378947368421052, + "grad_norm": 9.330098152160645, + "learning_rate": 2.9379469255048513e-05, + "loss": 2.1166, + "step": 1782 + }, + { + "epoch": 0.9384210526315789, + "grad_norm": 5.44893741607666, + "learning_rate": 2.9378761576031858e-05, + "loss": 1.667, + "step": 1783 + }, + { + "epoch": 0.9389473684210526, + "grad_norm": 21.2198543548584, + "learning_rate": 2.937805350224482e-05, + "loss": 1.3647, + "step": 1784 + }, + { + "epoch": 0.9394736842105263, + "grad_norm": 1.1891385316848755, + "learning_rate": 2.9377345033706843e-05, + "loss": 1.1004, + "step": 1785 + }, + { + "epoch": 0.94, + "grad_norm": 3.092072010040283, + "learning_rate": 2.937663617043738e-05, + "loss": 0.3973, + "step": 1786 + }, + { + "epoch": 0.9405263157894737, + "grad_norm": 4.912240505218506, + "learning_rate": 2.9375926912455884e-05, + "loss": 0.4901, + "step": 1787 + }, + { + "epoch": 0.9410526315789474, + "grad_norm": 1.5781091451644897, + "learning_rate": 2.9375217259781833e-05, + "loss": 1.2677, + "step": 1788 + }, + { + "epoch": 0.9415789473684211, + "grad_norm": 16.19452667236328, + "learning_rate": 2.937450721243471e-05, + "loss": 1.1125, + "step": 1789 + }, + { + "epoch": 0.9421052631578948, + "grad_norm": 1.818876028060913, + "learning_rate": 2.9373796770434015e-05, + "loss": 1.052, + "step": 1790 + }, + { + "epoch": 0.9426315789473684, + "grad_norm": 1.8112890720367432, + "learning_rate": 2.9373085933799242e-05, + "loss": 0.8413, + "step": 1791 + }, + { + "epoch": 0.9431578947368421, + "grad_norm": 0.8109291195869446, + "learning_rate": 2.937237470254992e-05, + "loss": 0.4724, + "step": 1792 + }, + { + "epoch": 0.9436842105263158, + "grad_norm": 1.2814196348190308, + "learning_rate": 2.9371663076705566e-05, + "loss": 1.2765, + "step": 1793 + }, + { + "epoch": 0.9442105263157895, + "grad_norm": 47.906097412109375, + "learning_rate": 2.937095105628572e-05, + "loss": 2.2085, + "step": 1794 + }, + { + "epoch": 0.9447368421052632, + "grad_norm": 1.119391918182373, + "learning_rate": 2.937023864130993e-05, + "loss": 1.2151, + "step": 1795 + }, + { + "epoch": 0.9452631578947368, + "grad_norm": 2.7573742866516113, + "learning_rate": 2.936952583179776e-05, + "loss": 1.4973, + "step": 1796 + }, + { + "epoch": 0.9457894736842105, + "grad_norm": 1.0621460676193237, + "learning_rate": 2.9368812627768777e-05, + "loss": 1.7011, + "step": 1797 + }, + { + "epoch": 0.9463157894736842, + "grad_norm": 1.82633638381958, + "learning_rate": 2.9368099029242564e-05, + "loss": 0.6421, + "step": 1798 + }, + { + "epoch": 0.9468421052631579, + "grad_norm": 2.238363742828369, + "learning_rate": 2.9367385036238707e-05, + "loss": 1.4712, + "step": 1799 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 1.1244910955429077, + "learning_rate": 2.9366670648776818e-05, + "loss": 0.7498, + "step": 1800 + }, + { + "epoch": 0.9478947368421052, + "grad_norm": 2.550694704055786, + "learning_rate": 2.9365955866876503e-05, + "loss": 0.8432, + "step": 1801 + }, + { + "epoch": 0.9484210526315789, + "grad_norm": 0.9619981050491333, + "learning_rate": 2.9365240690557387e-05, + "loss": 0.9428, + "step": 1802 + }, + { + "epoch": 0.9489473684210527, + "grad_norm": 3.0906801223754883, + "learning_rate": 2.9364525119839107e-05, + "loss": 0.254, + "step": 1803 + }, + { + "epoch": 0.9494736842105264, + "grad_norm": 17.610000610351562, + "learning_rate": 2.936380915474131e-05, + "loss": 2.4892, + "step": 1804 + }, + { + "epoch": 0.95, + "grad_norm": 1.0261893272399902, + "learning_rate": 2.9363092795283654e-05, + "loss": 1.0611, + "step": 1805 + }, + { + "epoch": 0.9505263157894737, + "grad_norm": 1.3554835319519043, + "learning_rate": 2.9362376041485807e-05, + "loss": 0.9824, + "step": 1806 + }, + { + "epoch": 0.9510526315789474, + "grad_norm": 1.1118865013122559, + "learning_rate": 2.936165889336744e-05, + "loss": 0.9515, + "step": 1807 + }, + { + "epoch": 0.9515789473684211, + "grad_norm": 9.150864601135254, + "learning_rate": 2.936094135094825e-05, + "loss": 0.5341, + "step": 1808 + }, + { + "epoch": 0.9521052631578948, + "grad_norm": 3.7893803119659424, + "learning_rate": 2.936022341424794e-05, + "loss": 1.7552, + "step": 1809 + }, + { + "epoch": 0.9526315789473684, + "grad_norm": 1.512068271636963, + "learning_rate": 2.9359505083286215e-05, + "loss": 1.0817, + "step": 1810 + }, + { + "epoch": 0.9531578947368421, + "grad_norm": 2.110664129257202, + "learning_rate": 2.935878635808279e-05, + "loss": 1.1328, + "step": 1811 + }, + { + "epoch": 0.9536842105263158, + "grad_norm": 6.667463779449463, + "learning_rate": 2.9358067238657414e-05, + "loss": 1.9689, + "step": 1812 + }, + { + "epoch": 0.9542105263157895, + "grad_norm": 8.632956504821777, + "learning_rate": 2.9357347725029814e-05, + "loss": 0.961, + "step": 1813 + }, + { + "epoch": 0.9547368421052631, + "grad_norm": 1.0220603942871094, + "learning_rate": 2.935662781721976e-05, + "loss": 0.7937, + "step": 1814 + }, + { + "epoch": 0.9552631578947368, + "grad_norm": 1.1829009056091309, + "learning_rate": 2.9355907515247008e-05, + "loss": 0.5874, + "step": 1815 + }, + { + "epoch": 0.9557894736842105, + "grad_norm": 1.6473325490951538, + "learning_rate": 2.9355186819131334e-05, + "loss": 1.3057, + "step": 1816 + }, + { + "epoch": 0.9563157894736842, + "grad_norm": 1.158306360244751, + "learning_rate": 2.9354465728892528e-05, + "loss": 0.0314, + "step": 1817 + }, + { + "epoch": 0.9568421052631579, + "grad_norm": 10.786005973815918, + "learning_rate": 2.9353744244550382e-05, + "loss": 2.2784, + "step": 1818 + }, + { + "epoch": 0.9573684210526315, + "grad_norm": 7.915785312652588, + "learning_rate": 2.935302236612471e-05, + "loss": 0.8294, + "step": 1819 + }, + { + "epoch": 0.9578947368421052, + "grad_norm": 1.1970584392547607, + "learning_rate": 2.9352300093635335e-05, + "loss": 1.3563, + "step": 1820 + }, + { + "epoch": 0.958421052631579, + "grad_norm": 1.5558509826660156, + "learning_rate": 2.9351577427102075e-05, + "loss": 1.0525, + "step": 1821 + }, + { + "epoch": 0.9589473684210527, + "grad_norm": 0.9184319972991943, + "learning_rate": 2.935085436654478e-05, + "loss": 0.668, + "step": 1822 + }, + { + "epoch": 0.9594736842105264, + "grad_norm": 1.1146199703216553, + "learning_rate": 2.93501309119833e-05, + "loss": 1.2854, + "step": 1823 + }, + { + "epoch": 0.96, + "grad_norm": 1.9025349617004395, + "learning_rate": 2.9349407063437496e-05, + "loss": 0.8864, + "step": 1824 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 6.680036544799805, + "learning_rate": 2.934868282092724e-05, + "loss": 1.3019, + "step": 1825 + }, + { + "epoch": 0.9610526315789474, + "grad_norm": 2.815171241760254, + "learning_rate": 2.934795818447242e-05, + "loss": 0.3377, + "step": 1826 + }, + { + "epoch": 0.9615789473684211, + "grad_norm": 1.282981276512146, + "learning_rate": 2.934723315409293e-05, + "loss": 1.0778, + "step": 1827 + }, + { + "epoch": 0.9621052631578947, + "grad_norm": 6.144766330718994, + "learning_rate": 2.9346507729808676e-05, + "loss": 0.1954, + "step": 1828 + }, + { + "epoch": 0.9626315789473684, + "grad_norm": 1.2909563779830933, + "learning_rate": 2.9345781911639576e-05, + "loss": 1.141, + "step": 1829 + }, + { + "epoch": 0.9631578947368421, + "grad_norm": 10.239936828613281, + "learning_rate": 2.9345055699605546e-05, + "loss": 3.1448, + "step": 1830 + }, + { + "epoch": 0.9636842105263158, + "grad_norm": 3.629612922668457, + "learning_rate": 2.9344329093726542e-05, + "loss": 0.5131, + "step": 1831 + }, + { + "epoch": 0.9642105263157895, + "grad_norm": 1.5074138641357422, + "learning_rate": 2.93436020940225e-05, + "loss": 1.7405, + "step": 1832 + }, + { + "epoch": 0.9647368421052631, + "grad_norm": 2.027520179748535, + "learning_rate": 2.9342874700513387e-05, + "loss": 0.6478, + "step": 1833 + }, + { + "epoch": 0.9652631578947368, + "grad_norm": 2.1978070735931396, + "learning_rate": 2.934214691321917e-05, + "loss": 0.7294, + "step": 1834 + }, + { + "epoch": 0.9657894736842105, + "grad_norm": 2.9730889797210693, + "learning_rate": 2.9341418732159826e-05, + "loss": 0.7626, + "step": 1835 + }, + { + "epoch": 0.9663157894736842, + "grad_norm": 1.2914584875106812, + "learning_rate": 2.9340690157355358e-05, + "loss": 1.3947, + "step": 1836 + }, + { + "epoch": 0.966842105263158, + "grad_norm": 7.379958629608154, + "learning_rate": 2.9339961188825765e-05, + "loss": 2.4647, + "step": 1837 + }, + { + "epoch": 0.9673684210526315, + "grad_norm": 2.926400899887085, + "learning_rate": 2.9339231826591057e-05, + "loss": 0.5296, + "step": 1838 + }, + { + "epoch": 0.9678947368421053, + "grad_norm": 1.7127336263656616, + "learning_rate": 2.9338502070671258e-05, + "loss": 0.8867, + "step": 1839 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 5.599946975708008, + "learning_rate": 2.933777192108641e-05, + "loss": 1.0288, + "step": 1840 + }, + { + "epoch": 0.9689473684210527, + "grad_norm": 1.7110023498535156, + "learning_rate": 2.9337041377856562e-05, + "loss": 1.5123, + "step": 1841 + }, + { + "epoch": 0.9694736842105263, + "grad_norm": 6.067218780517578, + "learning_rate": 2.9336310441001757e-05, + "loss": 1.374, + "step": 1842 + }, + { + "epoch": 0.97, + "grad_norm": 13.339165687561035, + "learning_rate": 2.9335579110542075e-05, + "loss": 2.0621, + "step": 1843 + }, + { + "epoch": 0.9705263157894737, + "grad_norm": 1.5139007568359375, + "learning_rate": 2.9334847386497587e-05, + "loss": 1.1468, + "step": 1844 + }, + { + "epoch": 0.9710526315789474, + "grad_norm": 2.751978874206543, + "learning_rate": 2.9334115268888392e-05, + "loss": 1.4863, + "step": 1845 + }, + { + "epoch": 0.9715789473684211, + "grad_norm": 0.912716805934906, + "learning_rate": 2.933338275773458e-05, + "loss": 0.575, + "step": 1846 + }, + { + "epoch": 0.9721052631578947, + "grad_norm": 6.437097072601318, + "learning_rate": 2.933264985305627e-05, + "loss": 1.4573, + "step": 1847 + }, + { + "epoch": 0.9726315789473684, + "grad_norm": 2.406679391860962, + "learning_rate": 2.933191655487358e-05, + "loss": 1.426, + "step": 1848 + }, + { + "epoch": 0.9731578947368421, + "grad_norm": 1.209294319152832, + "learning_rate": 2.9331182863206647e-05, + "loss": 1.1083, + "step": 1849 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 1.4055339097976685, + "learning_rate": 2.933044877807561e-05, + "loss": 1.1797, + "step": 1850 + }, + { + "epoch": 0.9742105263157895, + "grad_norm": 0.443554550409317, + "learning_rate": 2.9329714299500624e-05, + "loss": 0.0177, + "step": 1851 + }, + { + "epoch": 0.9747368421052631, + "grad_norm": 1.1881070137023926, + "learning_rate": 2.9328979427501854e-05, + "loss": 1.2222, + "step": 1852 + }, + { + "epoch": 0.9752631578947368, + "grad_norm": 1.7724149227142334, + "learning_rate": 2.9328244162099475e-05, + "loss": 1.0751, + "step": 1853 + }, + { + "epoch": 0.9757894736842105, + "grad_norm": 3.7436909675598145, + "learning_rate": 2.932750850331368e-05, + "loss": 0.5386, + "step": 1854 + }, + { + "epoch": 0.9763157894736842, + "grad_norm": 1.2736313343048096, + "learning_rate": 2.932677245116466e-05, + "loss": 1.151, + "step": 1855 + }, + { + "epoch": 0.9768421052631578, + "grad_norm": 10.034126281738281, + "learning_rate": 2.932603600567263e-05, + "loss": 0.5635, + "step": 1856 + }, + { + "epoch": 0.9773684210526316, + "grad_norm": 1.5949655771255493, + "learning_rate": 2.9325299166857802e-05, + "loss": 1.3225, + "step": 1857 + }, + { + "epoch": 0.9778947368421053, + "grad_norm": 5.0193376541137695, + "learning_rate": 2.9324561934740407e-05, + "loss": 0.7483, + "step": 1858 + }, + { + "epoch": 0.978421052631579, + "grad_norm": 1.0509535074234009, + "learning_rate": 2.932382430934069e-05, + "loss": 0.9718, + "step": 1859 + }, + { + "epoch": 0.9789473684210527, + "grad_norm": 2.3995654582977295, + "learning_rate": 2.9323086290678897e-05, + "loss": 0.7383, + "step": 1860 + }, + { + "epoch": 0.9794736842105263, + "grad_norm": 1.2402573823928833, + "learning_rate": 2.9322347878775294e-05, + "loss": 1.2607, + "step": 1861 + }, + { + "epoch": 0.98, + "grad_norm": 0.9786499738693237, + "learning_rate": 2.9321609073650157e-05, + "loss": 1.065, + "step": 1862 + }, + { + "epoch": 0.9805263157894737, + "grad_norm": 6.080226898193359, + "learning_rate": 2.9320869875323767e-05, + "loss": 0.3911, + "step": 1863 + }, + { + "epoch": 0.9810526315789474, + "grad_norm": 5.234240531921387, + "learning_rate": 2.9320130283816417e-05, + "loss": 0.958, + "step": 1864 + }, + { + "epoch": 0.9815789473684211, + "grad_norm": 48.88657760620117, + "learning_rate": 2.9319390299148417e-05, + "loss": 2.8281, + "step": 1865 + }, + { + "epoch": 0.9821052631578947, + "grad_norm": 1.2519406080245972, + "learning_rate": 2.9318649921340076e-05, + "loss": 0.5246, + "step": 1866 + }, + { + "epoch": 0.9826315789473684, + "grad_norm": 2.0961368083953857, + "learning_rate": 2.931790915041173e-05, + "loss": 1.0672, + "step": 1867 + }, + { + "epoch": 0.9831578947368421, + "grad_norm": 3.2622575759887695, + "learning_rate": 2.9317167986383705e-05, + "loss": 0.9251, + "step": 1868 + }, + { + "epoch": 0.9836842105263158, + "grad_norm": 1.264971375465393, + "learning_rate": 2.9316426429276366e-05, + "loss": 0.2829, + "step": 1869 + }, + { + "epoch": 0.9842105263157894, + "grad_norm": 16.512882232666016, + "learning_rate": 2.9315684479110062e-05, + "loss": 1.0379, + "step": 1870 + }, + { + "epoch": 0.9847368421052631, + "grad_norm": 3.8302979469299316, + "learning_rate": 2.931494213590516e-05, + "loss": 0.7015, + "step": 1871 + }, + { + "epoch": 0.9852631578947368, + "grad_norm": 2.289673089981079, + "learning_rate": 2.9314199399682053e-05, + "loss": 1.1638, + "step": 1872 + }, + { + "epoch": 0.9857894736842105, + "grad_norm": 1.4307857751846313, + "learning_rate": 2.9313456270461123e-05, + "loss": 0.0371, + "step": 1873 + }, + { + "epoch": 0.9863157894736843, + "grad_norm": 4.602456569671631, + "learning_rate": 2.9312712748262774e-05, + "loss": 2.1234, + "step": 1874 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 2.3993990421295166, + "learning_rate": 2.9311968833107423e-05, + "loss": 0.5891, + "step": 1875 + }, + { + "epoch": 0.9873684210526316, + "grad_norm": 31.689926147460938, + "learning_rate": 2.9311224525015494e-05, + "loss": 1.496, + "step": 1876 + }, + { + "epoch": 0.9878947368421053, + "grad_norm": 2.8490824699401855, + "learning_rate": 2.9310479824007416e-05, + "loss": 0.3687, + "step": 1877 + }, + { + "epoch": 0.988421052631579, + "grad_norm": 1.2507219314575195, + "learning_rate": 2.930973473010365e-05, + "loss": 0.9543, + "step": 1878 + }, + { + "epoch": 0.9889473684210527, + "grad_norm": 2.8183679580688477, + "learning_rate": 2.930898924332463e-05, + "loss": 0.1816, + "step": 1879 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 1.2869808673858643, + "learning_rate": 2.9308243363690844e-05, + "loss": 0.5637, + "step": 1880 + }, + { + "epoch": 0.99, + "grad_norm": 4.482398986816406, + "learning_rate": 2.9307497091222753e-05, + "loss": 0.2045, + "step": 1881 + }, + { + "epoch": 0.9905263157894737, + "grad_norm": 4.65263557434082, + "learning_rate": 2.930675042594086e-05, + "loss": 1.2288, + "step": 1882 + }, + { + "epoch": 0.9910526315789474, + "grad_norm": 1.563451886177063, + "learning_rate": 2.9306003367865662e-05, + "loss": 1.599, + "step": 1883 + }, + { + "epoch": 0.991578947368421, + "grad_norm": 1.2689063549041748, + "learning_rate": 2.9305255917017665e-05, + "loss": 1.0688, + "step": 1884 + }, + { + "epoch": 0.9921052631578947, + "grad_norm": 1.954530119895935, + "learning_rate": 2.930450807341739e-05, + "loss": 0.2562, + "step": 1885 + }, + { + "epoch": 0.9926315789473684, + "grad_norm": 9.376235008239746, + "learning_rate": 2.9303759837085375e-05, + "loss": 0.9329, + "step": 1886 + }, + { + "epoch": 0.9931578947368421, + "grad_norm": 4.2806477546691895, + "learning_rate": 2.9303011208042158e-05, + "loss": 1.4794, + "step": 1887 + }, + { + "epoch": 0.9936842105263158, + "grad_norm": 3.1993582248687744, + "learning_rate": 2.9302262186308297e-05, + "loss": 0.9944, + "step": 1888 + }, + { + "epoch": 0.9942105263157894, + "grad_norm": 1.6474570035934448, + "learning_rate": 2.9301512771904347e-05, + "loss": 1.0777, + "step": 1889 + }, + { + "epoch": 0.9947368421052631, + "grad_norm": 1.2916871309280396, + "learning_rate": 2.9300762964850895e-05, + "loss": 1.3304, + "step": 1890 + }, + { + "epoch": 0.9952631578947368, + "grad_norm": 1.1107717752456665, + "learning_rate": 2.930001276516852e-05, + "loss": 1.1093, + "step": 1891 + }, + { + "epoch": 0.9957894736842106, + "grad_norm": 1.7554007768630981, + "learning_rate": 2.9299262172877817e-05, + "loss": 1.1471, + "step": 1892 + }, + { + "epoch": 0.9963157894736843, + "grad_norm": 4.788026332855225, + "learning_rate": 2.9298511187999404e-05, + "loss": 0.49, + "step": 1893 + }, + { + "epoch": 0.9968421052631579, + "grad_norm": 5.148394584655762, + "learning_rate": 2.929775981055389e-05, + "loss": 0.2753, + "step": 1894 + }, + { + "epoch": 0.9973684210526316, + "grad_norm": 1.212622046470642, + "learning_rate": 2.9297008040561907e-05, + "loss": 1.0885, + "step": 1895 + }, + { + "epoch": 0.9978947368421053, + "grad_norm": 1.1759798526763916, + "learning_rate": 2.9296255878044094e-05, + "loss": 0.8806, + "step": 1896 + }, + { + "epoch": 0.998421052631579, + "grad_norm": 1.2078953981399536, + "learning_rate": 2.9295503323021103e-05, + "loss": 1.0733, + "step": 1897 + }, + { + "epoch": 0.9989473684210526, + "grad_norm": 5.538965225219727, + "learning_rate": 2.9294750375513598e-05, + "loss": 2.357, + "step": 1898 + }, + { + "epoch": 0.9994736842105263, + "grad_norm": 0.9785202741622925, + "learning_rate": 2.9293997035542244e-05, + "loss": 0.7298, + "step": 1899 + }, + { + "epoch": 1.0, + "grad_norm": 1.185262680053711, + "learning_rate": 2.929324330312773e-05, + "loss": 1.5887, + "step": 1900 + }, + { + "epoch": 1.0, + "eval_loss": 1.0015404224395752, + "eval_runtime": 12.876, + "eval_samples_per_second": 7.766, + "eval_steps_per_second": 7.766, + "step": 1900 + } + ], + "logging_steps": 1, + "max_steps": 19000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1900, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6932541497704448e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}