diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,39424 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 56255, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003555239534263621, + "grad_norm": 6.5037431716918945, + "learning_rate": 4.739336492890996e-08, + "loss": 3.6777, + "step": 10 + }, + { + "epoch": 0.0007110479068527242, + "grad_norm": 6.644636154174805, + "learning_rate": 9.478672985781992e-08, + "loss": 3.6844, + "step": 20 + }, + { + "epoch": 0.0010665718602790863, + "grad_norm": 5.921571254730225, + "learning_rate": 1.4218009478672986e-07, + "loss": 3.6619, + "step": 30 + }, + { + "epoch": 0.0014220958137054485, + "grad_norm": 5.546863555908203, + "learning_rate": 1.8957345971563984e-07, + "loss": 3.6723, + "step": 40 + }, + { + "epoch": 0.0017776197671318106, + "grad_norm": 5.478545188903809, + "learning_rate": 2.3696682464454978e-07, + "loss": 3.6157, + "step": 50 + }, + { + "epoch": 0.0021331437205581727, + "grad_norm": 5.220118045806885, + "learning_rate": 2.843601895734597e-07, + "loss": 3.5973, + "step": 60 + }, + { + "epoch": 0.002488667673984535, + "grad_norm": 5.022663116455078, + "learning_rate": 3.317535545023697e-07, + "loss": 3.5078, + "step": 70 + }, + { + "epoch": 0.002844191627410897, + "grad_norm": 5.178471088409424, + "learning_rate": 3.791469194312797e-07, + "loss": 3.4621, + "step": 80 + }, + { + "epoch": 0.003199715580837259, + "grad_norm": 4.002021312713623, + "learning_rate": 4.265402843601896e-07, + "loss": 3.4312, + "step": 90 + }, + { + "epoch": 0.003555239534263621, + "grad_norm": 4.136953830718994, + "learning_rate": 4.7393364928909956e-07, + "loss": 3.3377, + "step": 100 + }, + { + "epoch": 0.003910763487689983, + "grad_norm": 3.798868179321289, + "learning_rate": 5.213270142180095e-07, + "loss": 3.2934, + "step": 110 + }, + { + "epoch": 0.004266287441116345, + "grad_norm": 4.2911763191223145, + "learning_rate": 5.687203791469194e-07, + "loss": 3.1883, + "step": 120 + }, + { + "epoch": 0.0046218113945427075, + "grad_norm": 3.2939631938934326, + "learning_rate": 6.161137440758294e-07, + "loss": 3.139, + "step": 130 + }, + { + "epoch": 0.00497733534796907, + "grad_norm": 3.2117068767547607, + "learning_rate": 6.635071090047394e-07, + "loss": 3.1006, + "step": 140 + }, + { + "epoch": 0.005332859301395432, + "grad_norm": 3.382498264312744, + "learning_rate": 7.109004739336493e-07, + "loss": 3.0482, + "step": 150 + }, + { + "epoch": 0.005688383254821794, + "grad_norm": 2.9878997802734375, + "learning_rate": 7.582938388625594e-07, + "loss": 2.9764, + "step": 160 + }, + { + "epoch": 0.006043907208248156, + "grad_norm": 3.0326039791107178, + "learning_rate": 8.056872037914692e-07, + "loss": 2.9183, + "step": 170 + }, + { + "epoch": 0.006399431161674518, + "grad_norm": 3.024005174636841, + "learning_rate": 8.530805687203792e-07, + "loss": 2.8864, + "step": 180 + }, + { + "epoch": 0.00675495511510088, + "grad_norm": 3.1701743602752686, + "learning_rate": 9.004739336492892e-07, + "loss": 2.828, + "step": 190 + }, + { + "epoch": 0.007110479068527242, + "grad_norm": 2.661820888519287, + "learning_rate": 9.478672985781991e-07, + "loss": 2.7768, + "step": 200 + }, + { + "epoch": 0.0074660030219536044, + "grad_norm": 3.0967164039611816, + "learning_rate": 9.95260663507109e-07, + "loss": 2.7001, + "step": 210 + }, + { + "epoch": 0.007821526975379967, + "grad_norm": 2.645308017730713, + "learning_rate": 1.042654028436019e-06, + "loss": 2.6736, + "step": 220 + }, + { + "epoch": 0.008177050928806329, + "grad_norm": 2.3344457149505615, + "learning_rate": 1.090047393364929e-06, + "loss": 2.6604, + "step": 230 + }, + { + "epoch": 0.00853257488223269, + "grad_norm": 2.545344829559326, + "learning_rate": 1.1374407582938388e-06, + "loss": 2.6873, + "step": 240 + }, + { + "epoch": 0.008888098835659053, + "grad_norm": 2.7679622173309326, + "learning_rate": 1.184834123222749e-06, + "loss": 2.6686, + "step": 250 + }, + { + "epoch": 0.009243622789085415, + "grad_norm": 2.9236536026000977, + "learning_rate": 1.2322274881516587e-06, + "loss": 2.636, + "step": 260 + }, + { + "epoch": 0.009599146742511777, + "grad_norm": 3.063015937805176, + "learning_rate": 1.2796208530805687e-06, + "loss": 2.5889, + "step": 270 + }, + { + "epoch": 0.00995467069593814, + "grad_norm": 2.6119446754455566, + "learning_rate": 1.3270142180094788e-06, + "loss": 2.6461, + "step": 280 + }, + { + "epoch": 0.010310194649364501, + "grad_norm": 2.787658214569092, + "learning_rate": 1.3744075829383887e-06, + "loss": 2.5946, + "step": 290 + }, + { + "epoch": 0.010665718602790863, + "grad_norm": 2.616063117980957, + "learning_rate": 1.4218009478672987e-06, + "loss": 2.6025, + "step": 300 + }, + { + "epoch": 0.011021242556217226, + "grad_norm": 2.5729634761810303, + "learning_rate": 1.4691943127962086e-06, + "loss": 2.538, + "step": 310 + }, + { + "epoch": 0.011376766509643588, + "grad_norm": 2.5412399768829346, + "learning_rate": 1.5165876777251187e-06, + "loss": 2.5636, + "step": 320 + }, + { + "epoch": 0.01173229046306995, + "grad_norm": 3.008772611618042, + "learning_rate": 1.5639810426540287e-06, + "loss": 2.5296, + "step": 330 + }, + { + "epoch": 0.012087814416496312, + "grad_norm": 2.5369677543640137, + "learning_rate": 1.6113744075829384e-06, + "loss": 2.5443, + "step": 340 + }, + { + "epoch": 0.012443338369922674, + "grad_norm": 2.5106494426727295, + "learning_rate": 1.6587677725118483e-06, + "loss": 2.5304, + "step": 350 + }, + { + "epoch": 0.012798862323349036, + "grad_norm": 2.8165364265441895, + "learning_rate": 1.7061611374407585e-06, + "loss": 2.561, + "step": 360 + }, + { + "epoch": 0.013154386276775398, + "grad_norm": 2.5253000259399414, + "learning_rate": 1.7535545023696684e-06, + "loss": 2.532, + "step": 370 + }, + { + "epoch": 0.01350991023020176, + "grad_norm": 2.657996892929077, + "learning_rate": 1.8009478672985784e-06, + "loss": 2.5121, + "step": 380 + }, + { + "epoch": 0.013865434183628123, + "grad_norm": 2.816077470779419, + "learning_rate": 1.8483412322274883e-06, + "loss": 2.4412, + "step": 390 + }, + { + "epoch": 0.014220958137054485, + "grad_norm": 2.648855686187744, + "learning_rate": 1.8957345971563982e-06, + "loss": 2.4674, + "step": 400 + }, + { + "epoch": 0.014576482090480847, + "grad_norm": 2.7675745487213135, + "learning_rate": 1.943127962085308e-06, + "loss": 2.4813, + "step": 410 + }, + { + "epoch": 0.014932006043907209, + "grad_norm": 2.5133159160614014, + "learning_rate": 1.990521327014218e-06, + "loss": 2.5003, + "step": 420 + }, + { + "epoch": 0.015287529997333571, + "grad_norm": 2.5450525283813477, + "learning_rate": 2.037914691943128e-06, + "loss": 2.4677, + "step": 430 + }, + { + "epoch": 0.015643053950759933, + "grad_norm": 7.591505527496338, + "learning_rate": 2.085308056872038e-06, + "loss": 2.5193, + "step": 440 + }, + { + "epoch": 0.015998577904186295, + "grad_norm": 2.441828727722168, + "learning_rate": 2.1327014218009483e-06, + "loss": 2.4628, + "step": 450 + }, + { + "epoch": 0.016354101857612657, + "grad_norm": 2.5905909538269043, + "learning_rate": 2.180094786729858e-06, + "loss": 2.4269, + "step": 460 + }, + { + "epoch": 0.01670962581103902, + "grad_norm": 2.608412981033325, + "learning_rate": 2.2274881516587678e-06, + "loss": 2.45, + "step": 470 + }, + { + "epoch": 0.01706514976446538, + "grad_norm": 2.5817153453826904, + "learning_rate": 2.2748815165876777e-06, + "loss": 2.4327, + "step": 480 + }, + { + "epoch": 0.017420673717891744, + "grad_norm": 2.6757760047912598, + "learning_rate": 2.322274881516588e-06, + "loss": 2.4466, + "step": 490 + }, + { + "epoch": 0.017776197671318106, + "grad_norm": 2.745751142501831, + "learning_rate": 2.369668246445498e-06, + "loss": 2.4339, + "step": 500 + }, + { + "epoch": 0.018131721624744468, + "grad_norm": 2.6839053630828857, + "learning_rate": 2.417061611374408e-06, + "loss": 2.4309, + "step": 510 + }, + { + "epoch": 0.01848724557817083, + "grad_norm": 3.034766912460327, + "learning_rate": 2.4644549763033174e-06, + "loss": 2.3887, + "step": 520 + }, + { + "epoch": 0.018842769531597192, + "grad_norm": 2.3895976543426514, + "learning_rate": 2.5118483412322274e-06, + "loss": 2.3988, + "step": 530 + }, + { + "epoch": 0.019198293485023554, + "grad_norm": 2.550225257873535, + "learning_rate": 2.5592417061611373e-06, + "loss": 2.4171, + "step": 540 + }, + { + "epoch": 0.019553817438449916, + "grad_norm": 2.5043528079986572, + "learning_rate": 2.606635071090048e-06, + "loss": 2.4244, + "step": 550 + }, + { + "epoch": 0.01990934139187628, + "grad_norm": 2.4192793369293213, + "learning_rate": 2.6540284360189576e-06, + "loss": 2.4324, + "step": 560 + }, + { + "epoch": 0.02026486534530264, + "grad_norm": 2.150911331176758, + "learning_rate": 2.7014218009478675e-06, + "loss": 2.4225, + "step": 570 + }, + { + "epoch": 0.020620389298729003, + "grad_norm": 2.4253993034362793, + "learning_rate": 2.7488151658767775e-06, + "loss": 2.4464, + "step": 580 + }, + { + "epoch": 0.020975913252155365, + "grad_norm": 2.2535018920898438, + "learning_rate": 2.7962085308056874e-06, + "loss": 2.3521, + "step": 590 + }, + { + "epoch": 0.021331437205581727, + "grad_norm": 2.497767210006714, + "learning_rate": 2.8436018957345973e-06, + "loss": 2.3227, + "step": 600 + }, + { + "epoch": 0.02168696115900809, + "grad_norm": 2.3069896697998047, + "learning_rate": 2.8909952606635073e-06, + "loss": 2.3832, + "step": 610 + }, + { + "epoch": 0.02204248511243445, + "grad_norm": 2.4061696529388428, + "learning_rate": 2.938388625592417e-06, + "loss": 2.385, + "step": 620 + }, + { + "epoch": 0.022398009065860813, + "grad_norm": 2.2939300537109375, + "learning_rate": 2.985781990521327e-06, + "loss": 2.3879, + "step": 630 + }, + { + "epoch": 0.022753533019287175, + "grad_norm": 2.414767026901245, + "learning_rate": 3.0331753554502375e-06, + "loss": 2.3585, + "step": 640 + }, + { + "epoch": 0.023109056972713538, + "grad_norm": 2.20217227935791, + "learning_rate": 3.0805687203791474e-06, + "loss": 2.4066, + "step": 650 + }, + { + "epoch": 0.0234645809261399, + "grad_norm": 2.1113369464874268, + "learning_rate": 3.1279620853080574e-06, + "loss": 2.3501, + "step": 660 + }, + { + "epoch": 0.023820104879566262, + "grad_norm": 2.116257905960083, + "learning_rate": 3.1753554502369673e-06, + "loss": 2.3473, + "step": 670 + }, + { + "epoch": 0.024175628832992624, + "grad_norm": 2.11061692237854, + "learning_rate": 3.222748815165877e-06, + "loss": 2.367, + "step": 680 + }, + { + "epoch": 0.024531152786418986, + "grad_norm": 2.093904733657837, + "learning_rate": 3.2701421800947867e-06, + "loss": 2.3376, + "step": 690 + }, + { + "epoch": 0.024886676739845348, + "grad_norm": 2.0183255672454834, + "learning_rate": 3.3175355450236967e-06, + "loss": 2.376, + "step": 700 + }, + { + "epoch": 0.02524220069327171, + "grad_norm": 2.031346559524536, + "learning_rate": 3.3649289099526066e-06, + "loss": 2.3569, + "step": 710 + }, + { + "epoch": 0.025597724646698072, + "grad_norm": 2.0024542808532715, + "learning_rate": 3.412322274881517e-06, + "loss": 2.349, + "step": 720 + }, + { + "epoch": 0.025953248600124434, + "grad_norm": 2.029694080352783, + "learning_rate": 3.459715639810427e-06, + "loss": 2.3457, + "step": 730 + }, + { + "epoch": 0.026308772553550797, + "grad_norm": 2.160933017730713, + "learning_rate": 3.507109004739337e-06, + "loss": 2.296, + "step": 740 + }, + { + "epoch": 0.02666429650697716, + "grad_norm": 2.0530309677124023, + "learning_rate": 3.5545023696682468e-06, + "loss": 2.3714, + "step": 750 + }, + { + "epoch": 0.02701982046040352, + "grad_norm": 1.919304370880127, + "learning_rate": 3.6018957345971567e-06, + "loss": 2.3104, + "step": 760 + }, + { + "epoch": 0.027375344413829883, + "grad_norm": 2.03427791595459, + "learning_rate": 3.6492890995260666e-06, + "loss": 2.2817, + "step": 770 + }, + { + "epoch": 0.027730868367256245, + "grad_norm": 2.2166829109191895, + "learning_rate": 3.6966824644549766e-06, + "loss": 2.3678, + "step": 780 + }, + { + "epoch": 0.028086392320682607, + "grad_norm": 2.0098068714141846, + "learning_rate": 3.7440758293838865e-06, + "loss": 2.3577, + "step": 790 + }, + { + "epoch": 0.02844191627410897, + "grad_norm": 2.175006151199341, + "learning_rate": 3.7914691943127964e-06, + "loss": 2.317, + "step": 800 + }, + { + "epoch": 0.02879744022753533, + "grad_norm": 2.008929491043091, + "learning_rate": 3.838862559241707e-06, + "loss": 2.3594, + "step": 810 + }, + { + "epoch": 0.029152964180961694, + "grad_norm": 2.1698524951934814, + "learning_rate": 3.886255924170616e-06, + "loss": 2.298, + "step": 820 + }, + { + "epoch": 0.029508488134388056, + "grad_norm": 2.0474202632904053, + "learning_rate": 3.933649289099527e-06, + "loss": 2.2345, + "step": 830 + }, + { + "epoch": 0.029864012087814418, + "grad_norm": 1.8713438510894775, + "learning_rate": 3.981042654028436e-06, + "loss": 2.3007, + "step": 840 + }, + { + "epoch": 0.03021953604124078, + "grad_norm": 2.2103753089904785, + "learning_rate": 4.0284360189573465e-06, + "loss": 2.2866, + "step": 850 + }, + { + "epoch": 0.030575059994667142, + "grad_norm": 1.954866886138916, + "learning_rate": 4.075829383886256e-06, + "loss": 2.3095, + "step": 860 + }, + { + "epoch": 0.030930583948093504, + "grad_norm": 2.025651454925537, + "learning_rate": 4.123222748815166e-06, + "loss": 2.3398, + "step": 870 + }, + { + "epoch": 0.031286107901519866, + "grad_norm": 1.9596309661865234, + "learning_rate": 4.170616113744076e-06, + "loss": 2.3475, + "step": 880 + }, + { + "epoch": 0.03164163185494623, + "grad_norm": 1.9388352632522583, + "learning_rate": 4.218009478672986e-06, + "loss": 2.2968, + "step": 890 + }, + { + "epoch": 0.03199715580837259, + "grad_norm": 2.0771589279174805, + "learning_rate": 4.265402843601897e-06, + "loss": 2.274, + "step": 900 + }, + { + "epoch": 0.03235267976179895, + "grad_norm": 1.8007633686065674, + "learning_rate": 4.312796208530806e-06, + "loss": 2.3144, + "step": 910 + }, + { + "epoch": 0.032708203715225315, + "grad_norm": 1.8899132013320923, + "learning_rate": 4.360189573459716e-06, + "loss": 2.3019, + "step": 920 + }, + { + "epoch": 0.03306372766865168, + "grad_norm": 1.913493037223816, + "learning_rate": 4.407582938388626e-06, + "loss": 2.3061, + "step": 930 + }, + { + "epoch": 0.03341925162207804, + "grad_norm": 2.0034244060516357, + "learning_rate": 4.4549763033175355e-06, + "loss": 2.2956, + "step": 940 + }, + { + "epoch": 0.0337747755755044, + "grad_norm": 1.9144295454025269, + "learning_rate": 4.502369668246446e-06, + "loss": 2.2693, + "step": 950 + }, + { + "epoch": 0.03413029952893076, + "grad_norm": 2.0472214221954346, + "learning_rate": 4.549763033175355e-06, + "loss": 2.2076, + "step": 960 + }, + { + "epoch": 0.034485823482357125, + "grad_norm": 2.0035147666931152, + "learning_rate": 4.597156398104266e-06, + "loss": 2.3414, + "step": 970 + }, + { + "epoch": 0.03484134743578349, + "grad_norm": 2.1267082691192627, + "learning_rate": 4.644549763033176e-06, + "loss": 2.2974, + "step": 980 + }, + { + "epoch": 0.03519687138920985, + "grad_norm": 2.001173257827759, + "learning_rate": 4.691943127962086e-06, + "loss": 2.2925, + "step": 990 + }, + { + "epoch": 0.03555239534263621, + "grad_norm": 1.8909610509872437, + "learning_rate": 4.739336492890996e-06, + "loss": 2.3045, + "step": 1000 + }, + { + "epoch": 0.035907919296062574, + "grad_norm": 1.8200558423995972, + "learning_rate": 4.7867298578199055e-06, + "loss": 2.2761, + "step": 1010 + }, + { + "epoch": 0.036263443249488936, + "grad_norm": 1.8652522563934326, + "learning_rate": 4.834123222748816e-06, + "loss": 2.3357, + "step": 1020 + }, + { + "epoch": 0.0366189672029153, + "grad_norm": 1.907865285873413, + "learning_rate": 4.881516587677725e-06, + "loss": 2.2691, + "step": 1030 + }, + { + "epoch": 0.03697449115634166, + "grad_norm": 1.8563010692596436, + "learning_rate": 4.928909952606635e-06, + "loss": 2.2373, + "step": 1040 + }, + { + "epoch": 0.03733001510976802, + "grad_norm": 1.8197952508926392, + "learning_rate": 4.976303317535545e-06, + "loss": 2.288, + "step": 1050 + }, + { + "epoch": 0.037685539063194384, + "grad_norm": 1.8919203281402588, + "learning_rate": 5.023696682464455e-06, + "loss": 2.2458, + "step": 1060 + }, + { + "epoch": 0.038041063016620746, + "grad_norm": 1.8901917934417725, + "learning_rate": 5.071090047393366e-06, + "loss": 2.2557, + "step": 1070 + }, + { + "epoch": 0.03839658697004711, + "grad_norm": 1.9412710666656494, + "learning_rate": 5.118483412322275e-06, + "loss": 2.2927, + "step": 1080 + }, + { + "epoch": 0.03875211092347347, + "grad_norm": 2.022618055343628, + "learning_rate": 5.165876777251185e-06, + "loss": 2.2815, + "step": 1090 + }, + { + "epoch": 0.03910763487689983, + "grad_norm": 1.8631237745285034, + "learning_rate": 5.213270142180096e-06, + "loss": 2.3075, + "step": 1100 + }, + { + "epoch": 0.039463158830326195, + "grad_norm": 2.0299785137176514, + "learning_rate": 5.260663507109005e-06, + "loss": 2.2475, + "step": 1110 + }, + { + "epoch": 0.03981868278375256, + "grad_norm": 1.8889631032943726, + "learning_rate": 5.308056872037915e-06, + "loss": 2.2771, + "step": 1120 + }, + { + "epoch": 0.04017420673717892, + "grad_norm": 1.81557297706604, + "learning_rate": 5.355450236966825e-06, + "loss": 2.3143, + "step": 1130 + }, + { + "epoch": 0.04052973069060528, + "grad_norm": 1.985235333442688, + "learning_rate": 5.402843601895735e-06, + "loss": 2.2641, + "step": 1140 + }, + { + "epoch": 0.04088525464403164, + "grad_norm": 2.0497684478759766, + "learning_rate": 5.4502369668246446e-06, + "loss": 2.2873, + "step": 1150 + }, + { + "epoch": 0.041240778597458005, + "grad_norm": 2.0543909072875977, + "learning_rate": 5.497630331753555e-06, + "loss": 2.285, + "step": 1160 + }, + { + "epoch": 0.04159630255088437, + "grad_norm": 2.0090322494506836, + "learning_rate": 5.5450236966824644e-06, + "loss": 2.251, + "step": 1170 + }, + { + "epoch": 0.04195182650431073, + "grad_norm": 2.01086163520813, + "learning_rate": 5.592417061611375e-06, + "loss": 2.2004, + "step": 1180 + }, + { + "epoch": 0.04230735045773709, + "grad_norm": 1.9381053447723389, + "learning_rate": 5.639810426540285e-06, + "loss": 2.2603, + "step": 1190 + }, + { + "epoch": 0.042662874411163454, + "grad_norm": 1.9569153785705566, + "learning_rate": 5.687203791469195e-06, + "loss": 2.233, + "step": 1200 + }, + { + "epoch": 0.043018398364589816, + "grad_norm": 1.9454693794250488, + "learning_rate": 5.734597156398105e-06, + "loss": 2.2415, + "step": 1210 + }, + { + "epoch": 0.04337392231801618, + "grad_norm": 1.7950100898742676, + "learning_rate": 5.7819905213270145e-06, + "loss": 2.243, + "step": 1220 + }, + { + "epoch": 0.04372944627144254, + "grad_norm": 1.8708484172821045, + "learning_rate": 5.829383886255925e-06, + "loss": 2.2722, + "step": 1230 + }, + { + "epoch": 0.0440849702248689, + "grad_norm": 1.9645860195159912, + "learning_rate": 5.876777251184834e-06, + "loss": 2.2032, + "step": 1240 + }, + { + "epoch": 0.044440494178295264, + "grad_norm": 1.973793625831604, + "learning_rate": 5.924170616113745e-06, + "loss": 2.2247, + "step": 1250 + }, + { + "epoch": 0.04479601813172163, + "grad_norm": 1.8837801218032837, + "learning_rate": 5.971563981042654e-06, + "loss": 2.2531, + "step": 1260 + }, + { + "epoch": 0.04515154208514799, + "grad_norm": 1.9478363990783691, + "learning_rate": 6.018957345971565e-06, + "loss": 2.1965, + "step": 1270 + }, + { + "epoch": 0.04550706603857435, + "grad_norm": 2.0230274200439453, + "learning_rate": 6.066350710900475e-06, + "loss": 2.2398, + "step": 1280 + }, + { + "epoch": 0.04586258999200071, + "grad_norm": 1.92470121383667, + "learning_rate": 6.1137440758293845e-06, + "loss": 2.2117, + "step": 1290 + }, + { + "epoch": 0.046218113945427075, + "grad_norm": 1.9754289388656616, + "learning_rate": 6.161137440758295e-06, + "loss": 2.2447, + "step": 1300 + }, + { + "epoch": 0.04657363789885344, + "grad_norm": 1.9050835371017456, + "learning_rate": 6.208530805687204e-06, + "loss": 2.257, + "step": 1310 + }, + { + "epoch": 0.0469291618522798, + "grad_norm": 2.0105550289154053, + "learning_rate": 6.255924170616115e-06, + "loss": 2.2238, + "step": 1320 + }, + { + "epoch": 0.04728468580570616, + "grad_norm": 1.8811384439468384, + "learning_rate": 6.303317535545023e-06, + "loss": 2.2305, + "step": 1330 + }, + { + "epoch": 0.047640209759132524, + "grad_norm": 1.7932945489883423, + "learning_rate": 6.350710900473935e-06, + "loss": 2.2524, + "step": 1340 + }, + { + "epoch": 0.047995733712558886, + "grad_norm": 1.950801134109497, + "learning_rate": 6.398104265402843e-06, + "loss": 2.2003, + "step": 1350 + }, + { + "epoch": 0.04835125766598525, + "grad_norm": 1.797852635383606, + "learning_rate": 6.445497630331754e-06, + "loss": 2.2409, + "step": 1360 + }, + { + "epoch": 0.04870678161941161, + "grad_norm": 1.877571702003479, + "learning_rate": 6.492890995260665e-06, + "loss": 2.2534, + "step": 1370 + }, + { + "epoch": 0.04906230557283797, + "grad_norm": 1.7737246751785278, + "learning_rate": 6.5402843601895735e-06, + "loss": 2.232, + "step": 1380 + }, + { + "epoch": 0.049417829526264334, + "grad_norm": 1.8201391696929932, + "learning_rate": 6.587677725118484e-06, + "loss": 2.2015, + "step": 1390 + }, + { + "epoch": 0.049773353479690696, + "grad_norm": 2.060864210128784, + "learning_rate": 6.635071090047393e-06, + "loss": 2.2239, + "step": 1400 + }, + { + "epoch": 0.05012887743311706, + "grad_norm": 1.956697702407837, + "learning_rate": 6.682464454976304e-06, + "loss": 2.2289, + "step": 1410 + }, + { + "epoch": 0.05048440138654342, + "grad_norm": 1.8469531536102295, + "learning_rate": 6.729857819905213e-06, + "loss": 2.2408, + "step": 1420 + }, + { + "epoch": 0.05083992533996978, + "grad_norm": 1.8741036653518677, + "learning_rate": 6.777251184834124e-06, + "loss": 2.2032, + "step": 1430 + }, + { + "epoch": 0.051195449293396145, + "grad_norm": 1.8755990266799927, + "learning_rate": 6.824644549763034e-06, + "loss": 2.2485, + "step": 1440 + }, + { + "epoch": 0.05155097324682251, + "grad_norm": 1.8328008651733398, + "learning_rate": 6.8720379146919435e-06, + "loss": 2.2939, + "step": 1450 + }, + { + "epoch": 0.05190649720024887, + "grad_norm": 1.8831439018249512, + "learning_rate": 6.919431279620854e-06, + "loss": 2.2182, + "step": 1460 + }, + { + "epoch": 0.05226202115367523, + "grad_norm": 1.965472936630249, + "learning_rate": 6.966824644549763e-06, + "loss": 2.2382, + "step": 1470 + }, + { + "epoch": 0.05261754510710159, + "grad_norm": 2.070134162902832, + "learning_rate": 7.014218009478674e-06, + "loss": 2.1926, + "step": 1480 + }, + { + "epoch": 0.052973069060527955, + "grad_norm": 1.9419405460357666, + "learning_rate": 7.061611374407583e-06, + "loss": 2.2103, + "step": 1490 + }, + { + "epoch": 0.05332859301395432, + "grad_norm": 1.9228605031967163, + "learning_rate": 7.1090047393364935e-06, + "loss": 2.2038, + "step": 1500 + }, + { + "epoch": 0.05368411696738068, + "grad_norm": 1.8788220882415771, + "learning_rate": 7.156398104265403e-06, + "loss": 2.1504, + "step": 1510 + }, + { + "epoch": 0.05403964092080704, + "grad_norm": 1.8219408988952637, + "learning_rate": 7.203791469194313e-06, + "loss": 2.246, + "step": 1520 + }, + { + "epoch": 0.054395164874233404, + "grad_norm": 2.0067989826202393, + "learning_rate": 7.251184834123224e-06, + "loss": 2.1704, + "step": 1530 + }, + { + "epoch": 0.054750688827659766, + "grad_norm": 1.7641377449035645, + "learning_rate": 7.298578199052133e-06, + "loss": 2.1625, + "step": 1540 + }, + { + "epoch": 0.05510621278108613, + "grad_norm": 1.803442358970642, + "learning_rate": 7.345971563981044e-06, + "loss": 2.225, + "step": 1550 + }, + { + "epoch": 0.05546173673451249, + "grad_norm": 1.9596740007400513, + "learning_rate": 7.393364928909953e-06, + "loss": 2.2086, + "step": 1560 + }, + { + "epoch": 0.05581726068793885, + "grad_norm": 1.8549288511276245, + "learning_rate": 7.4407582938388635e-06, + "loss": 2.2154, + "step": 1570 + }, + { + "epoch": 0.056172784641365214, + "grad_norm": 1.7597678899765015, + "learning_rate": 7.488151658767773e-06, + "loss": 2.1198, + "step": 1580 + }, + { + "epoch": 0.056528308594791576, + "grad_norm": 1.9706618785858154, + "learning_rate": 7.535545023696683e-06, + "loss": 2.2277, + "step": 1590 + }, + { + "epoch": 0.05688383254821794, + "grad_norm": 1.8696203231811523, + "learning_rate": 7.582938388625593e-06, + "loss": 2.2775, + "step": 1600 + }, + { + "epoch": 0.0572393565016443, + "grad_norm": 1.9137041568756104, + "learning_rate": 7.630331753554503e-06, + "loss": 2.2055, + "step": 1610 + }, + { + "epoch": 0.05759488045507066, + "grad_norm": 1.9089804887771606, + "learning_rate": 7.677725118483414e-06, + "loss": 2.2311, + "step": 1620 + }, + { + "epoch": 0.057950404408497025, + "grad_norm": 1.8806320428848267, + "learning_rate": 7.725118483412322e-06, + "loss": 2.2372, + "step": 1630 + }, + { + "epoch": 0.05830592836192339, + "grad_norm": 1.775240182876587, + "learning_rate": 7.772511848341233e-06, + "loss": 2.1764, + "step": 1640 + }, + { + "epoch": 0.05866145231534975, + "grad_norm": 1.8922443389892578, + "learning_rate": 7.819905213270143e-06, + "loss": 2.168, + "step": 1650 + }, + { + "epoch": 0.05901697626877611, + "grad_norm": 1.7949473857879639, + "learning_rate": 7.867298578199053e-06, + "loss": 2.2688, + "step": 1660 + }, + { + "epoch": 0.05937250022220247, + "grad_norm": 1.8707879781723022, + "learning_rate": 7.914691943127962e-06, + "loss": 2.2207, + "step": 1670 + }, + { + "epoch": 0.059728024175628835, + "grad_norm": 1.814133882522583, + "learning_rate": 7.962085308056872e-06, + "loss": 2.2056, + "step": 1680 + }, + { + "epoch": 0.0600835481290552, + "grad_norm": 1.8329755067825317, + "learning_rate": 8.009478672985783e-06, + "loss": 2.1702, + "step": 1690 + }, + { + "epoch": 0.06043907208248156, + "grad_norm": 1.839277744293213, + "learning_rate": 8.056872037914693e-06, + "loss": 2.2376, + "step": 1700 + }, + { + "epoch": 0.06079459603590792, + "grad_norm": 1.8632655143737793, + "learning_rate": 8.104265402843603e-06, + "loss": 2.232, + "step": 1710 + }, + { + "epoch": 0.061150119989334284, + "grad_norm": 1.8680291175842285, + "learning_rate": 8.151658767772512e-06, + "loss": 2.2177, + "step": 1720 + }, + { + "epoch": 0.061505643942760646, + "grad_norm": 1.8234357833862305, + "learning_rate": 8.199052132701422e-06, + "loss": 2.1791, + "step": 1730 + }, + { + "epoch": 0.06186116789618701, + "grad_norm": 1.8661161661148071, + "learning_rate": 8.246445497630333e-06, + "loss": 2.1528, + "step": 1740 + }, + { + "epoch": 0.06221669184961337, + "grad_norm": 1.90630042552948, + "learning_rate": 8.293838862559243e-06, + "loss": 2.1641, + "step": 1750 + }, + { + "epoch": 0.06257221580303973, + "grad_norm": 1.873467206954956, + "learning_rate": 8.341232227488152e-06, + "loss": 2.1503, + "step": 1760 + }, + { + "epoch": 0.0629277397564661, + "grad_norm": 1.8596463203430176, + "learning_rate": 8.388625592417062e-06, + "loss": 2.1719, + "step": 1770 + }, + { + "epoch": 0.06328326370989246, + "grad_norm": 1.8411222696304321, + "learning_rate": 8.436018957345973e-06, + "loss": 2.21, + "step": 1780 + }, + { + "epoch": 0.06363878766331882, + "grad_norm": 1.8672677278518677, + "learning_rate": 8.483412322274883e-06, + "loss": 2.1558, + "step": 1790 + }, + { + "epoch": 0.06399431161674518, + "grad_norm": 1.9354828596115112, + "learning_rate": 8.530805687203793e-06, + "loss": 2.2281, + "step": 1800 + }, + { + "epoch": 0.06434983557017154, + "grad_norm": 1.8663649559020996, + "learning_rate": 8.578199052132702e-06, + "loss": 2.2147, + "step": 1810 + }, + { + "epoch": 0.0647053595235979, + "grad_norm": 1.8387905359268188, + "learning_rate": 8.625592417061612e-06, + "loss": 2.2229, + "step": 1820 + }, + { + "epoch": 0.06506088347702427, + "grad_norm": 1.741399884223938, + "learning_rate": 8.672985781990521e-06, + "loss": 2.1799, + "step": 1830 + }, + { + "epoch": 0.06541640743045063, + "grad_norm": 1.9359266757965088, + "learning_rate": 8.720379146919431e-06, + "loss": 2.2001, + "step": 1840 + }, + { + "epoch": 0.06577193138387699, + "grad_norm": 1.9217205047607422, + "learning_rate": 8.767772511848342e-06, + "loss": 2.1804, + "step": 1850 + }, + { + "epoch": 0.06612745533730335, + "grad_norm": 1.98513662815094, + "learning_rate": 8.815165876777252e-06, + "loss": 2.1211, + "step": 1860 + }, + { + "epoch": 0.06648297929072972, + "grad_norm": 1.9897305965423584, + "learning_rate": 8.862559241706162e-06, + "loss": 2.2303, + "step": 1870 + }, + { + "epoch": 0.06683850324415608, + "grad_norm": 1.8167400360107422, + "learning_rate": 8.909952606635071e-06, + "loss": 2.2092, + "step": 1880 + }, + { + "epoch": 0.06719402719758244, + "grad_norm": 1.842371940612793, + "learning_rate": 8.957345971563981e-06, + "loss": 2.2234, + "step": 1890 + }, + { + "epoch": 0.0675495511510088, + "grad_norm": 1.8326691389083862, + "learning_rate": 9.004739336492892e-06, + "loss": 2.1621, + "step": 1900 + }, + { + "epoch": 0.06790507510443516, + "grad_norm": 1.914857029914856, + "learning_rate": 9.052132701421802e-06, + "loss": 2.1706, + "step": 1910 + }, + { + "epoch": 0.06826059905786153, + "grad_norm": 1.7782554626464844, + "learning_rate": 9.09952606635071e-06, + "loss": 2.176, + "step": 1920 + }, + { + "epoch": 0.06861612301128789, + "grad_norm": 1.8087050914764404, + "learning_rate": 9.146919431279621e-06, + "loss": 2.1664, + "step": 1930 + }, + { + "epoch": 0.06897164696471425, + "grad_norm": 1.9598517417907715, + "learning_rate": 9.194312796208532e-06, + "loss": 2.1703, + "step": 1940 + }, + { + "epoch": 0.06932717091814061, + "grad_norm": 1.9140313863754272, + "learning_rate": 9.241706161137442e-06, + "loss": 2.183, + "step": 1950 + }, + { + "epoch": 0.06968269487156697, + "grad_norm": 1.920876383781433, + "learning_rate": 9.289099526066352e-06, + "loss": 2.2278, + "step": 1960 + }, + { + "epoch": 0.07003821882499334, + "grad_norm": 1.7880984544754028, + "learning_rate": 9.336492890995261e-06, + "loss": 2.223, + "step": 1970 + }, + { + "epoch": 0.0703937427784197, + "grad_norm": 1.8553565740585327, + "learning_rate": 9.383886255924171e-06, + "loss": 2.1544, + "step": 1980 + }, + { + "epoch": 0.07074926673184606, + "grad_norm": 1.8793102502822876, + "learning_rate": 9.431279620853082e-06, + "loss": 2.1707, + "step": 1990 + }, + { + "epoch": 0.07110479068527242, + "grad_norm": 1.7974066734313965, + "learning_rate": 9.478672985781992e-06, + "loss": 2.1662, + "step": 2000 + }, + { + "epoch": 0.07146031463869879, + "grad_norm": 1.857040524482727, + "learning_rate": 9.5260663507109e-06, + "loss": 2.1379, + "step": 2010 + }, + { + "epoch": 0.07181583859212515, + "grad_norm": 1.8388969898223877, + "learning_rate": 9.573459715639811e-06, + "loss": 2.1987, + "step": 2020 + }, + { + "epoch": 0.07217136254555151, + "grad_norm": 2.041085958480835, + "learning_rate": 9.620853080568721e-06, + "loss": 2.157, + "step": 2030 + }, + { + "epoch": 0.07252688649897787, + "grad_norm": 1.8348536491394043, + "learning_rate": 9.668246445497632e-06, + "loss": 2.1598, + "step": 2040 + }, + { + "epoch": 0.07288241045240423, + "grad_norm": 1.81992769241333, + "learning_rate": 9.715639810426542e-06, + "loss": 2.1749, + "step": 2050 + }, + { + "epoch": 0.0732379344058306, + "grad_norm": 1.9568251371383667, + "learning_rate": 9.76303317535545e-06, + "loss": 2.1719, + "step": 2060 + }, + { + "epoch": 0.07359345835925696, + "grad_norm": 1.852169394493103, + "learning_rate": 9.810426540284361e-06, + "loss": 2.0978, + "step": 2070 + }, + { + "epoch": 0.07394898231268332, + "grad_norm": 1.8278820514678955, + "learning_rate": 9.85781990521327e-06, + "loss": 2.1978, + "step": 2080 + }, + { + "epoch": 0.07430450626610968, + "grad_norm": 2.0013930797576904, + "learning_rate": 9.905213270142182e-06, + "loss": 2.2025, + "step": 2090 + }, + { + "epoch": 0.07466003021953604, + "grad_norm": 1.822750210762024, + "learning_rate": 9.95260663507109e-06, + "loss": 2.1063, + "step": 2100 + }, + { + "epoch": 0.0750155541729624, + "grad_norm": 1.9742094278335571, + "learning_rate": 1e-05, + "loss": 2.173, + "step": 2110 + }, + { + "epoch": 0.07537107812638877, + "grad_norm": 1.7482434511184692, + "learning_rate": 1.004739336492891e-05, + "loss": 2.1546, + "step": 2120 + }, + { + "epoch": 0.07572660207981513, + "grad_norm": 1.820383071899414, + "learning_rate": 1.0094786729857822e-05, + "loss": 2.1565, + "step": 2130 + }, + { + "epoch": 0.07608212603324149, + "grad_norm": 1.7688885927200317, + "learning_rate": 1.0142180094786732e-05, + "loss": 2.1533, + "step": 2140 + }, + { + "epoch": 0.07643764998666785, + "grad_norm": 1.9063228368759155, + "learning_rate": 1.018957345971564e-05, + "loss": 2.1278, + "step": 2150 + }, + { + "epoch": 0.07679317394009422, + "grad_norm": 1.819233775138855, + "learning_rate": 1.023696682464455e-05, + "loss": 2.1686, + "step": 2160 + }, + { + "epoch": 0.07714869789352058, + "grad_norm": 1.8698222637176514, + "learning_rate": 1.0284360189573461e-05, + "loss": 2.1222, + "step": 2170 + }, + { + "epoch": 0.07750422184694694, + "grad_norm": 1.7308324575424194, + "learning_rate": 1.033175355450237e-05, + "loss": 2.1716, + "step": 2180 + }, + { + "epoch": 0.0778597458003733, + "grad_norm": 1.8845312595367432, + "learning_rate": 1.037914691943128e-05, + "loss": 2.157, + "step": 2190 + }, + { + "epoch": 0.07821526975379967, + "grad_norm": 1.7712396383285522, + "learning_rate": 1.0426540284360192e-05, + "loss": 2.1132, + "step": 2200 + }, + { + "epoch": 0.07857079370722603, + "grad_norm": 1.8825665712356567, + "learning_rate": 1.0473933649289101e-05, + "loss": 2.2037, + "step": 2210 + }, + { + "epoch": 0.07892631766065239, + "grad_norm": 1.7894903421401978, + "learning_rate": 1.052132701421801e-05, + "loss": 2.1479, + "step": 2220 + }, + { + "epoch": 0.07928184161407875, + "grad_norm": 1.774488925933838, + "learning_rate": 1.056872037914692e-05, + "loss": 2.1836, + "step": 2230 + }, + { + "epoch": 0.07963736556750511, + "grad_norm": 1.8073627948760986, + "learning_rate": 1.061611374407583e-05, + "loss": 2.1312, + "step": 2240 + }, + { + "epoch": 0.07999288952093148, + "grad_norm": 1.863756537437439, + "learning_rate": 1.066350710900474e-05, + "loss": 2.1381, + "step": 2250 + }, + { + "epoch": 0.08034841347435784, + "grad_norm": 1.8275458812713623, + "learning_rate": 1.071090047393365e-05, + "loss": 2.1564, + "step": 2260 + }, + { + "epoch": 0.0807039374277842, + "grad_norm": 1.8978358507156372, + "learning_rate": 1.075829383886256e-05, + "loss": 2.1592, + "step": 2270 + }, + { + "epoch": 0.08105946138121056, + "grad_norm": 1.9047290086746216, + "learning_rate": 1.080568720379147e-05, + "loss": 2.1967, + "step": 2280 + }, + { + "epoch": 0.08141498533463692, + "grad_norm": 1.7772575616836548, + "learning_rate": 1.085308056872038e-05, + "loss": 2.1896, + "step": 2290 + }, + { + "epoch": 0.08177050928806329, + "grad_norm": 1.8947906494140625, + "learning_rate": 1.0900473933649289e-05, + "loss": 2.1464, + "step": 2300 + }, + { + "epoch": 0.08212603324148965, + "grad_norm": 1.8788243532180786, + "learning_rate": 1.0947867298578201e-05, + "loss": 2.196, + "step": 2310 + }, + { + "epoch": 0.08248155719491601, + "grad_norm": 1.8443186283111572, + "learning_rate": 1.099526066350711e-05, + "loss": 2.1347, + "step": 2320 + }, + { + "epoch": 0.08283708114834237, + "grad_norm": 1.757403016090393, + "learning_rate": 1.104265402843602e-05, + "loss": 2.1551, + "step": 2330 + }, + { + "epoch": 0.08319260510176874, + "grad_norm": 1.8618059158325195, + "learning_rate": 1.1090047393364929e-05, + "loss": 2.1738, + "step": 2340 + }, + { + "epoch": 0.0835481290551951, + "grad_norm": 1.759847640991211, + "learning_rate": 1.1137440758293841e-05, + "loss": 2.1513, + "step": 2350 + }, + { + "epoch": 0.08390365300862146, + "grad_norm": 1.8621257543563843, + "learning_rate": 1.118483412322275e-05, + "loss": 2.1029, + "step": 2360 + }, + { + "epoch": 0.08425917696204782, + "grad_norm": 1.7522389888763428, + "learning_rate": 1.1232227488151658e-05, + "loss": 2.1761, + "step": 2370 + }, + { + "epoch": 0.08461470091547418, + "grad_norm": 1.8118172883987427, + "learning_rate": 1.127962085308057e-05, + "loss": 2.1581, + "step": 2380 + }, + { + "epoch": 0.08497022486890055, + "grad_norm": 1.8402385711669922, + "learning_rate": 1.132701421800948e-05, + "loss": 2.1761, + "step": 2390 + }, + { + "epoch": 0.08532574882232691, + "grad_norm": 1.7629215717315674, + "learning_rate": 1.137440758293839e-05, + "loss": 2.1369, + "step": 2400 + }, + { + "epoch": 0.08568127277575327, + "grad_norm": 1.7961244583129883, + "learning_rate": 1.1421800947867298e-05, + "loss": 2.1293, + "step": 2410 + }, + { + "epoch": 0.08603679672917963, + "grad_norm": 1.9245836734771729, + "learning_rate": 1.146919431279621e-05, + "loss": 2.1529, + "step": 2420 + }, + { + "epoch": 0.086392320682606, + "grad_norm": 1.871652364730835, + "learning_rate": 1.1516587677725119e-05, + "loss": 2.1264, + "step": 2430 + }, + { + "epoch": 0.08674784463603236, + "grad_norm": 1.7594822645187378, + "learning_rate": 1.1563981042654029e-05, + "loss": 2.1433, + "step": 2440 + }, + { + "epoch": 0.08710336858945872, + "grad_norm": 1.9148240089416504, + "learning_rate": 1.1611374407582941e-05, + "loss": 2.1479, + "step": 2450 + }, + { + "epoch": 0.08745889254288508, + "grad_norm": 1.8528403043746948, + "learning_rate": 1.165876777251185e-05, + "loss": 2.1668, + "step": 2460 + }, + { + "epoch": 0.08781441649631144, + "grad_norm": 1.8765084743499756, + "learning_rate": 1.1706161137440758e-05, + "loss": 2.1776, + "step": 2470 + }, + { + "epoch": 0.0881699404497378, + "grad_norm": 1.8227301836013794, + "learning_rate": 1.1753554502369669e-05, + "loss": 2.1451, + "step": 2480 + }, + { + "epoch": 0.08852546440316417, + "grad_norm": 1.866655945777893, + "learning_rate": 1.180094786729858e-05, + "loss": 2.1236, + "step": 2490 + }, + { + "epoch": 0.08888098835659053, + "grad_norm": 1.8193955421447754, + "learning_rate": 1.184834123222749e-05, + "loss": 2.11, + "step": 2500 + }, + { + "epoch": 0.08923651231001689, + "grad_norm": 1.866882085800171, + "learning_rate": 1.1895734597156398e-05, + "loss": 2.1669, + "step": 2510 + }, + { + "epoch": 0.08959203626344325, + "grad_norm": 1.7875139713287354, + "learning_rate": 1.1943127962085309e-05, + "loss": 2.1876, + "step": 2520 + }, + { + "epoch": 0.08994756021686962, + "grad_norm": 1.8214070796966553, + "learning_rate": 1.1990521327014219e-05, + "loss": 2.173, + "step": 2530 + }, + { + "epoch": 0.09030308417029598, + "grad_norm": 1.7724765539169312, + "learning_rate": 1.203791469194313e-05, + "loss": 2.1776, + "step": 2540 + }, + { + "epoch": 0.09065860812372234, + "grad_norm": 1.9167289733886719, + "learning_rate": 1.2085308056872038e-05, + "loss": 2.1499, + "step": 2550 + }, + { + "epoch": 0.0910141320771487, + "grad_norm": 1.8983221054077148, + "learning_rate": 1.213270142180095e-05, + "loss": 2.142, + "step": 2560 + }, + { + "epoch": 0.09136965603057506, + "grad_norm": 1.8105605840682983, + "learning_rate": 1.2180094786729859e-05, + "loss": 2.1533, + "step": 2570 + }, + { + "epoch": 0.09172517998400143, + "grad_norm": 1.761996865272522, + "learning_rate": 1.2227488151658769e-05, + "loss": 2.1532, + "step": 2580 + }, + { + "epoch": 0.09208070393742779, + "grad_norm": 1.8852143287658691, + "learning_rate": 1.2274881516587678e-05, + "loss": 2.1707, + "step": 2590 + }, + { + "epoch": 0.09243622789085415, + "grad_norm": 1.7447150945663452, + "learning_rate": 1.232227488151659e-05, + "loss": 2.1333, + "step": 2600 + }, + { + "epoch": 0.09279175184428051, + "grad_norm": 2.3988707065582275, + "learning_rate": 1.2369668246445498e-05, + "loss": 2.153, + "step": 2610 + }, + { + "epoch": 0.09314727579770687, + "grad_norm": 1.8021430969238281, + "learning_rate": 1.2417061611374409e-05, + "loss": 2.0996, + "step": 2620 + }, + { + "epoch": 0.09350279975113324, + "grad_norm": 1.8730841875076294, + "learning_rate": 1.2464454976303319e-05, + "loss": 2.1489, + "step": 2630 + }, + { + "epoch": 0.0938583237045596, + "grad_norm": 1.8202563524246216, + "learning_rate": 1.251184834123223e-05, + "loss": 2.1429, + "step": 2640 + }, + { + "epoch": 0.09421384765798596, + "grad_norm": 1.9044020175933838, + "learning_rate": 1.2559241706161138e-05, + "loss": 2.1005, + "step": 2650 + }, + { + "epoch": 0.09456937161141232, + "grad_norm": 1.7460588216781616, + "learning_rate": 1.2606635071090047e-05, + "loss": 2.1891, + "step": 2660 + }, + { + "epoch": 0.09492489556483868, + "grad_norm": 1.8756009340286255, + "learning_rate": 1.2654028436018959e-05, + "loss": 2.1536, + "step": 2670 + }, + { + "epoch": 0.09528041951826505, + "grad_norm": 1.7713373899459839, + "learning_rate": 1.270142180094787e-05, + "loss": 2.1297, + "step": 2680 + }, + { + "epoch": 0.09563594347169141, + "grad_norm": 1.802303433418274, + "learning_rate": 1.2748815165876778e-05, + "loss": 2.1254, + "step": 2690 + }, + { + "epoch": 0.09599146742511777, + "grad_norm": 1.7659459114074707, + "learning_rate": 1.2796208530805687e-05, + "loss": 2.1714, + "step": 2700 + }, + { + "epoch": 0.09634699137854413, + "grad_norm": 1.8053789138793945, + "learning_rate": 1.2843601895734599e-05, + "loss": 2.0936, + "step": 2710 + }, + { + "epoch": 0.0967025153319705, + "grad_norm": 1.755724310874939, + "learning_rate": 1.2890995260663507e-05, + "loss": 2.1576, + "step": 2720 + }, + { + "epoch": 0.09705803928539686, + "grad_norm": 1.7820082902908325, + "learning_rate": 1.2938388625592418e-05, + "loss": 2.1376, + "step": 2730 + }, + { + "epoch": 0.09741356323882322, + "grad_norm": 1.8214658498764038, + "learning_rate": 1.298578199052133e-05, + "loss": 2.1255, + "step": 2740 + }, + { + "epoch": 0.09776908719224958, + "grad_norm": 1.6887913942337036, + "learning_rate": 1.3033175355450238e-05, + "loss": 2.1164, + "step": 2750 + }, + { + "epoch": 0.09812461114567594, + "grad_norm": 1.791401743888855, + "learning_rate": 1.3080568720379147e-05, + "loss": 2.0981, + "step": 2760 + }, + { + "epoch": 0.0984801350991023, + "grad_norm": 2.0565669536590576, + "learning_rate": 1.3127962085308057e-05, + "loss": 2.1544, + "step": 2770 + }, + { + "epoch": 0.09883565905252867, + "grad_norm": 1.7514798641204834, + "learning_rate": 1.3175355450236968e-05, + "loss": 2.1674, + "step": 2780 + }, + { + "epoch": 0.09919118300595503, + "grad_norm": 1.9238693714141846, + "learning_rate": 1.3222748815165878e-05, + "loss": 2.1556, + "step": 2790 + }, + { + "epoch": 0.09954670695938139, + "grad_norm": 1.700263500213623, + "learning_rate": 1.3270142180094787e-05, + "loss": 2.1248, + "step": 2800 + }, + { + "epoch": 0.09990223091280775, + "grad_norm": 1.8183735609054565, + "learning_rate": 1.3317535545023699e-05, + "loss": 2.0735, + "step": 2810 + }, + { + "epoch": 0.10025775486623412, + "grad_norm": 1.8340328931808472, + "learning_rate": 1.3364928909952607e-05, + "loss": 2.1051, + "step": 2820 + }, + { + "epoch": 0.10061327881966048, + "grad_norm": 1.866182565689087, + "learning_rate": 1.3412322274881518e-05, + "loss": 2.1201, + "step": 2830 + }, + { + "epoch": 0.10096880277308684, + "grad_norm": 2.0255119800567627, + "learning_rate": 1.3459715639810426e-05, + "loss": 2.1717, + "step": 2840 + }, + { + "epoch": 0.1013243267265132, + "grad_norm": 1.772624135017395, + "learning_rate": 1.3507109004739339e-05, + "loss": 2.1187, + "step": 2850 + }, + { + "epoch": 0.10167985067993957, + "grad_norm": 1.7913185358047485, + "learning_rate": 1.3554502369668247e-05, + "loss": 2.1251, + "step": 2860 + }, + { + "epoch": 0.10203537463336593, + "grad_norm": 1.756658911705017, + "learning_rate": 1.3601895734597158e-05, + "loss": 2.1102, + "step": 2870 + }, + { + "epoch": 0.10239089858679229, + "grad_norm": 1.7615718841552734, + "learning_rate": 1.3649289099526068e-05, + "loss": 2.1116, + "step": 2880 + }, + { + "epoch": 0.10274642254021865, + "grad_norm": 1.9718692302703857, + "learning_rate": 1.3696682464454978e-05, + "loss": 2.1488, + "step": 2890 + }, + { + "epoch": 0.10310194649364501, + "grad_norm": 1.7149349451065063, + "learning_rate": 1.3744075829383887e-05, + "loss": 2.1533, + "step": 2900 + }, + { + "epoch": 0.10345747044707138, + "grad_norm": 1.900169849395752, + "learning_rate": 1.3791469194312797e-05, + "loss": 2.1513, + "step": 2910 + }, + { + "epoch": 0.10381299440049774, + "grad_norm": 1.909778118133545, + "learning_rate": 1.3838862559241708e-05, + "loss": 2.1273, + "step": 2920 + }, + { + "epoch": 0.1041685183539241, + "grad_norm": 1.7948297262191772, + "learning_rate": 1.3886255924170618e-05, + "loss": 2.0679, + "step": 2930 + }, + { + "epoch": 0.10452404230735046, + "grad_norm": 1.755152940750122, + "learning_rate": 1.3933649289099527e-05, + "loss": 2.0874, + "step": 2940 + }, + { + "epoch": 0.10487956626077682, + "grad_norm": 1.7649750709533691, + "learning_rate": 1.3981042654028435e-05, + "loss": 2.1234, + "step": 2950 + }, + { + "epoch": 0.10523509021420319, + "grad_norm": 1.7477363348007202, + "learning_rate": 1.4028436018957347e-05, + "loss": 2.175, + "step": 2960 + }, + { + "epoch": 0.10559061416762955, + "grad_norm": 1.8121856451034546, + "learning_rate": 1.4075829383886258e-05, + "loss": 2.0937, + "step": 2970 + }, + { + "epoch": 0.10594613812105591, + "grad_norm": 1.7684760093688965, + "learning_rate": 1.4123222748815166e-05, + "loss": 2.1086, + "step": 2980 + }, + { + "epoch": 0.10630166207448227, + "grad_norm": 1.7173081636428833, + "learning_rate": 1.4170616113744078e-05, + "loss": 2.1222, + "step": 2990 + }, + { + "epoch": 0.10665718602790863, + "grad_norm": 1.7345879077911377, + "learning_rate": 1.4218009478672987e-05, + "loss": 2.1364, + "step": 3000 + }, + { + "epoch": 0.107012709981335, + "grad_norm": 1.7358227968215942, + "learning_rate": 1.4265402843601896e-05, + "loss": 2.1226, + "step": 3010 + }, + { + "epoch": 0.10736823393476136, + "grad_norm": 1.8484522104263306, + "learning_rate": 1.4312796208530806e-05, + "loss": 2.1451, + "step": 3020 + }, + { + "epoch": 0.10772375788818772, + "grad_norm": 1.7787588834762573, + "learning_rate": 1.4360189573459718e-05, + "loss": 2.1015, + "step": 3030 + }, + { + "epoch": 0.10807928184161408, + "grad_norm": 1.7221862077713013, + "learning_rate": 1.4407582938388627e-05, + "loss": 2.1076, + "step": 3040 + }, + { + "epoch": 0.10843480579504045, + "grad_norm": 1.825199842453003, + "learning_rate": 1.4454976303317535e-05, + "loss": 2.1341, + "step": 3050 + }, + { + "epoch": 0.10879032974846681, + "grad_norm": 1.7941467761993408, + "learning_rate": 1.4502369668246448e-05, + "loss": 2.0934, + "step": 3060 + }, + { + "epoch": 0.10914585370189317, + "grad_norm": 1.7535361051559448, + "learning_rate": 1.4549763033175356e-05, + "loss": 2.1409, + "step": 3070 + }, + { + "epoch": 0.10950137765531953, + "grad_norm": 1.778220295906067, + "learning_rate": 1.4597156398104267e-05, + "loss": 2.0991, + "step": 3080 + }, + { + "epoch": 0.1098569016087459, + "grad_norm": 1.8794920444488525, + "learning_rate": 1.4644549763033175e-05, + "loss": 2.1184, + "step": 3090 + }, + { + "epoch": 0.11021242556217226, + "grad_norm": 1.7211918830871582, + "learning_rate": 1.4691943127962087e-05, + "loss": 2.1462, + "step": 3100 + }, + { + "epoch": 0.11056794951559862, + "grad_norm": 1.821576476097107, + "learning_rate": 1.4739336492890996e-05, + "loss": 2.1064, + "step": 3110 + }, + { + "epoch": 0.11092347346902498, + "grad_norm": 1.7056803703308105, + "learning_rate": 1.4786729857819906e-05, + "loss": 2.1186, + "step": 3120 + }, + { + "epoch": 0.11127899742245134, + "grad_norm": 1.8238040208816528, + "learning_rate": 1.4834123222748817e-05, + "loss": 2.0878, + "step": 3130 + }, + { + "epoch": 0.1116345213758777, + "grad_norm": 1.8366920948028564, + "learning_rate": 1.4881516587677727e-05, + "loss": 2.0733, + "step": 3140 + }, + { + "epoch": 0.11199004532930407, + "grad_norm": 1.7596580982208252, + "learning_rate": 1.4928909952606636e-05, + "loss": 2.1166, + "step": 3150 + }, + { + "epoch": 0.11234556928273043, + "grad_norm": 1.6376357078552246, + "learning_rate": 1.4976303317535546e-05, + "loss": 2.1283, + "step": 3160 + }, + { + "epoch": 0.11270109323615679, + "grad_norm": 1.7873460054397583, + "learning_rate": 1.5023696682464456e-05, + "loss": 2.1222, + "step": 3170 + }, + { + "epoch": 0.11305661718958315, + "grad_norm": 1.809497356414795, + "learning_rate": 1.5071090047393367e-05, + "loss": 2.1203, + "step": 3180 + }, + { + "epoch": 0.11341214114300951, + "grad_norm": 1.7785967588424683, + "learning_rate": 1.5118483412322275e-05, + "loss": 2.1825, + "step": 3190 + }, + { + "epoch": 0.11376766509643588, + "grad_norm": 1.6852540969848633, + "learning_rate": 1.5165876777251186e-05, + "loss": 2.0729, + "step": 3200 + }, + { + "epoch": 0.11412318904986224, + "grad_norm": 1.8209391832351685, + "learning_rate": 1.5213270142180096e-05, + "loss": 2.1747, + "step": 3210 + }, + { + "epoch": 0.1144787130032886, + "grad_norm": 1.730448603630066, + "learning_rate": 1.5260663507109007e-05, + "loss": 2.0646, + "step": 3220 + }, + { + "epoch": 0.11483423695671496, + "grad_norm": 1.8416671752929688, + "learning_rate": 1.5308056872037915e-05, + "loss": 2.1426, + "step": 3230 + }, + { + "epoch": 0.11518976091014133, + "grad_norm": 1.871046781539917, + "learning_rate": 1.5355450236966827e-05, + "loss": 2.1861, + "step": 3240 + }, + { + "epoch": 0.11554528486356769, + "grad_norm": 1.7909883260726929, + "learning_rate": 1.5402843601895736e-05, + "loss": 2.0659, + "step": 3250 + }, + { + "epoch": 0.11590080881699405, + "grad_norm": 1.7126774787902832, + "learning_rate": 1.5450236966824645e-05, + "loss": 2.1062, + "step": 3260 + }, + { + "epoch": 0.11625633277042041, + "grad_norm": 1.8236619234085083, + "learning_rate": 1.5497630331753553e-05, + "loss": 2.0713, + "step": 3270 + }, + { + "epoch": 0.11661185672384677, + "grad_norm": 1.7866365909576416, + "learning_rate": 1.5545023696682465e-05, + "loss": 2.0975, + "step": 3280 + }, + { + "epoch": 0.11696738067727314, + "grad_norm": 1.8846007585525513, + "learning_rate": 1.5592417061611377e-05, + "loss": 2.0856, + "step": 3290 + }, + { + "epoch": 0.1173229046306995, + "grad_norm": 1.8732142448425293, + "learning_rate": 1.5639810426540286e-05, + "loss": 2.0821, + "step": 3300 + }, + { + "epoch": 0.11767842858412586, + "grad_norm": 1.805627465248108, + "learning_rate": 1.5687203791469198e-05, + "loss": 2.0877, + "step": 3310 + }, + { + "epoch": 0.11803395253755222, + "grad_norm": 1.6648415327072144, + "learning_rate": 1.5734597156398107e-05, + "loss": 2.1029, + "step": 3320 + }, + { + "epoch": 0.11838947649097858, + "grad_norm": 1.9431143999099731, + "learning_rate": 1.5781990521327015e-05, + "loss": 2.1317, + "step": 3330 + }, + { + "epoch": 0.11874500044440495, + "grad_norm": 1.7122180461883545, + "learning_rate": 1.5829383886255924e-05, + "loss": 2.1403, + "step": 3340 + }, + { + "epoch": 0.11910052439783131, + "grad_norm": 1.8386825323104858, + "learning_rate": 1.5876777251184836e-05, + "loss": 2.1307, + "step": 3350 + }, + { + "epoch": 0.11945604835125767, + "grad_norm": 1.8558971881866455, + "learning_rate": 1.5924170616113745e-05, + "loss": 2.0973, + "step": 3360 + }, + { + "epoch": 0.11981157230468403, + "grad_norm": 1.8701891899108887, + "learning_rate": 1.5971563981042653e-05, + "loss": 2.0487, + "step": 3370 + }, + { + "epoch": 0.1201670962581104, + "grad_norm": 1.7447658777236938, + "learning_rate": 1.6018957345971565e-05, + "loss": 2.0547, + "step": 3380 + }, + { + "epoch": 0.12052262021153676, + "grad_norm": 1.8656021356582642, + "learning_rate": 1.6066350710900474e-05, + "loss": 2.1156, + "step": 3390 + }, + { + "epoch": 0.12087814416496312, + "grad_norm": 1.718123197555542, + "learning_rate": 1.6113744075829386e-05, + "loss": 2.0908, + "step": 3400 + }, + { + "epoch": 0.12123366811838948, + "grad_norm": 1.6136643886566162, + "learning_rate": 1.6161137440758295e-05, + "loss": 2.0786, + "step": 3410 + }, + { + "epoch": 0.12158919207181584, + "grad_norm": 1.748200535774231, + "learning_rate": 1.6208530805687207e-05, + "loss": 2.0935, + "step": 3420 + }, + { + "epoch": 0.1219447160252422, + "grad_norm": 1.918379306793213, + "learning_rate": 1.6255924170616116e-05, + "loss": 2.1453, + "step": 3430 + }, + { + "epoch": 0.12230023997866857, + "grad_norm": 1.996518611907959, + "learning_rate": 1.6303317535545024e-05, + "loss": 2.1181, + "step": 3440 + }, + { + "epoch": 0.12265576393209493, + "grad_norm": 1.7767255306243896, + "learning_rate": 1.6350710900473933e-05, + "loss": 2.1108, + "step": 3450 + }, + { + "epoch": 0.12301128788552129, + "grad_norm": 1.7845118045806885, + "learning_rate": 1.6398104265402845e-05, + "loss": 2.0712, + "step": 3460 + }, + { + "epoch": 0.12336681183894765, + "grad_norm": 1.7948838472366333, + "learning_rate": 1.6445497630331754e-05, + "loss": 2.0738, + "step": 3470 + }, + { + "epoch": 0.12372233579237402, + "grad_norm": 1.703697919845581, + "learning_rate": 1.6492890995260666e-05, + "loss": 2.112, + "step": 3480 + }, + { + "epoch": 0.12407785974580038, + "grad_norm": 1.8490139245986938, + "learning_rate": 1.6540284360189574e-05, + "loss": 2.0903, + "step": 3490 + }, + { + "epoch": 0.12443338369922674, + "grad_norm": 1.7895227670669556, + "learning_rate": 1.6587677725118486e-05, + "loss": 2.0949, + "step": 3500 + }, + { + "epoch": 0.1247889076526531, + "grad_norm": 1.7390356063842773, + "learning_rate": 1.6635071090047395e-05, + "loss": 2.0782, + "step": 3510 + }, + { + "epoch": 0.12514443160607946, + "grad_norm": 1.760340690612793, + "learning_rate": 1.6682464454976304e-05, + "loss": 2.0841, + "step": 3520 + }, + { + "epoch": 0.12549995555950583, + "grad_norm": 1.7745305299758911, + "learning_rate": 1.6729857819905216e-05, + "loss": 2.0654, + "step": 3530 + }, + { + "epoch": 0.1258554795129322, + "grad_norm": 1.7406567335128784, + "learning_rate": 1.6777251184834124e-05, + "loss": 2.0701, + "step": 3540 + }, + { + "epoch": 0.12621100346635855, + "grad_norm": 1.71855628490448, + "learning_rate": 1.6824644549763033e-05, + "loss": 2.158, + "step": 3550 + }, + { + "epoch": 0.1265665274197849, + "grad_norm": 1.7532312870025635, + "learning_rate": 1.6872037914691945e-05, + "loss": 2.104, + "step": 3560 + }, + { + "epoch": 0.12692205137321128, + "grad_norm": 1.8171640634536743, + "learning_rate": 1.6919431279620854e-05, + "loss": 2.1028, + "step": 3570 + }, + { + "epoch": 0.12727757532663764, + "grad_norm": 1.8336002826690674, + "learning_rate": 1.6966824644549766e-05, + "loss": 2.06, + "step": 3580 + }, + { + "epoch": 0.127633099280064, + "grad_norm": 1.8022377490997314, + "learning_rate": 1.7014218009478674e-05, + "loss": 2.0928, + "step": 3590 + }, + { + "epoch": 0.12798862323349036, + "grad_norm": 1.7392102479934692, + "learning_rate": 1.7061611374407587e-05, + "loss": 2.118, + "step": 3600 + }, + { + "epoch": 0.12834414718691672, + "grad_norm": 1.7536499500274658, + "learning_rate": 1.7109004739336495e-05, + "loss": 2.0753, + "step": 3610 + }, + { + "epoch": 0.12869967114034309, + "grad_norm": 1.6055700778961182, + "learning_rate": 1.7156398104265404e-05, + "loss": 2.0485, + "step": 3620 + }, + { + "epoch": 0.12905519509376945, + "grad_norm": 1.7383241653442383, + "learning_rate": 1.7203791469194316e-05, + "loss": 2.1041, + "step": 3630 + }, + { + "epoch": 0.1294107190471958, + "grad_norm": 1.7801284790039062, + "learning_rate": 1.7251184834123225e-05, + "loss": 2.1046, + "step": 3640 + }, + { + "epoch": 0.12976624300062217, + "grad_norm": 1.816885232925415, + "learning_rate": 1.7298578199052133e-05, + "loss": 2.1035, + "step": 3650 + }, + { + "epoch": 0.13012176695404853, + "grad_norm": 1.8502243757247925, + "learning_rate": 1.7345971563981042e-05, + "loss": 2.0305, + "step": 3660 + }, + { + "epoch": 0.1304772909074749, + "grad_norm": 1.803113341331482, + "learning_rate": 1.7393364928909954e-05, + "loss": 2.0559, + "step": 3670 + }, + { + "epoch": 0.13083281486090126, + "grad_norm": 1.7713572978973389, + "learning_rate": 1.7440758293838863e-05, + "loss": 2.0995, + "step": 3680 + }, + { + "epoch": 0.13118833881432762, + "grad_norm": 1.7642992734909058, + "learning_rate": 1.7488151658767775e-05, + "loss": 2.1226, + "step": 3690 + }, + { + "epoch": 0.13154386276775398, + "grad_norm": 1.766026258468628, + "learning_rate": 1.7535545023696683e-05, + "loss": 2.0462, + "step": 3700 + }, + { + "epoch": 0.13189938672118035, + "grad_norm": 1.7639282941818237, + "learning_rate": 1.7582938388625595e-05, + "loss": 2.0673, + "step": 3710 + }, + { + "epoch": 0.1322549106746067, + "grad_norm": 1.7410707473754883, + "learning_rate": 1.7630331753554504e-05, + "loss": 2.0288, + "step": 3720 + }, + { + "epoch": 0.13261043462803307, + "grad_norm": 1.8906439542770386, + "learning_rate": 1.7677725118483413e-05, + "loss": 2.1113, + "step": 3730 + }, + { + "epoch": 0.13296595858145943, + "grad_norm": 1.8331823348999023, + "learning_rate": 1.7725118483412325e-05, + "loss": 2.1004, + "step": 3740 + }, + { + "epoch": 0.1333214825348858, + "grad_norm": 1.6593939065933228, + "learning_rate": 1.7772511848341233e-05, + "loss": 2.1035, + "step": 3750 + }, + { + "epoch": 0.13367700648831216, + "grad_norm": 1.7040388584136963, + "learning_rate": 1.7819905213270142e-05, + "loss": 2.091, + "step": 3760 + }, + { + "epoch": 0.13403253044173852, + "grad_norm": 1.8393237590789795, + "learning_rate": 1.7867298578199054e-05, + "loss": 2.1337, + "step": 3770 + }, + { + "epoch": 0.13438805439516488, + "grad_norm": 1.8253166675567627, + "learning_rate": 1.7914691943127963e-05, + "loss": 2.1284, + "step": 3780 + }, + { + "epoch": 0.13474357834859124, + "grad_norm": 1.8016252517700195, + "learning_rate": 1.7962085308056875e-05, + "loss": 2.0431, + "step": 3790 + }, + { + "epoch": 0.1350991023020176, + "grad_norm": 1.87069833278656, + "learning_rate": 1.8009478672985784e-05, + "loss": 2.1197, + "step": 3800 + }, + { + "epoch": 0.13545462625544397, + "grad_norm": 1.6724790334701538, + "learning_rate": 1.8056872037914696e-05, + "loss": 2.0782, + "step": 3810 + }, + { + "epoch": 0.13581015020887033, + "grad_norm": 1.9466593265533447, + "learning_rate": 1.8104265402843604e-05, + "loss": 2.0586, + "step": 3820 + }, + { + "epoch": 0.1361656741622967, + "grad_norm": 1.7319644689559937, + "learning_rate": 1.8151658767772513e-05, + "loss": 2.0381, + "step": 3830 + }, + { + "epoch": 0.13652119811572305, + "grad_norm": 1.7674616575241089, + "learning_rate": 1.819905213270142e-05, + "loss": 2.0909, + "step": 3840 + }, + { + "epoch": 0.13687672206914941, + "grad_norm": 1.7459237575531006, + "learning_rate": 1.8246445497630334e-05, + "loss": 2.1042, + "step": 3850 + }, + { + "epoch": 0.13723224602257578, + "grad_norm": 1.760031819343567, + "learning_rate": 1.8293838862559242e-05, + "loss": 2.0488, + "step": 3860 + }, + { + "epoch": 0.13758776997600214, + "grad_norm": 1.712561011314392, + "learning_rate": 1.8341232227488154e-05, + "loss": 2.0586, + "step": 3870 + }, + { + "epoch": 0.1379432939294285, + "grad_norm": 1.7915306091308594, + "learning_rate": 1.8388625592417063e-05, + "loss": 2.0766, + "step": 3880 + }, + { + "epoch": 0.13829881788285486, + "grad_norm": 1.7054728269577026, + "learning_rate": 1.8436018957345975e-05, + "loss": 2.0715, + "step": 3890 + }, + { + "epoch": 0.13865434183628123, + "grad_norm": 1.7007368803024292, + "learning_rate": 1.8483412322274884e-05, + "loss": 2.095, + "step": 3900 + }, + { + "epoch": 0.1390098657897076, + "grad_norm": 1.7768644094467163, + "learning_rate": 1.8530805687203792e-05, + "loss": 2.0839, + "step": 3910 + }, + { + "epoch": 0.13936538974313395, + "grad_norm": 1.7239347696304321, + "learning_rate": 1.8578199052132704e-05, + "loss": 2.0959, + "step": 3920 + }, + { + "epoch": 0.1397209136965603, + "grad_norm": 1.720320463180542, + "learning_rate": 1.8625592417061613e-05, + "loss": 2.1016, + "step": 3930 + }, + { + "epoch": 0.14007643764998667, + "grad_norm": 1.6920055150985718, + "learning_rate": 1.8672985781990522e-05, + "loss": 2.0696, + "step": 3940 + }, + { + "epoch": 0.14043196160341304, + "grad_norm": 1.734519124031067, + "learning_rate": 1.872037914691943e-05, + "loss": 2.0701, + "step": 3950 + }, + { + "epoch": 0.1407874855568394, + "grad_norm": 1.675017237663269, + "learning_rate": 1.8767772511848342e-05, + "loss": 2.0882, + "step": 3960 + }, + { + "epoch": 0.14114300951026576, + "grad_norm": 1.674024224281311, + "learning_rate": 1.881516587677725e-05, + "loss": 2.0911, + "step": 3970 + }, + { + "epoch": 0.14149853346369212, + "grad_norm": 1.8348963260650635, + "learning_rate": 1.8862559241706163e-05, + "loss": 2.0811, + "step": 3980 + }, + { + "epoch": 0.14185405741711848, + "grad_norm": 1.7325314283370972, + "learning_rate": 1.8909952606635075e-05, + "loss": 2.1263, + "step": 3990 + }, + { + "epoch": 0.14220958137054485, + "grad_norm": 1.7979702949523926, + "learning_rate": 1.8957345971563984e-05, + "loss": 2.0669, + "step": 4000 + }, + { + "epoch": 0.1425651053239712, + "grad_norm": 1.8183528184890747, + "learning_rate": 1.9004739336492893e-05, + "loss": 2.0812, + "step": 4010 + }, + { + "epoch": 0.14292062927739757, + "grad_norm": 1.834951400756836, + "learning_rate": 1.90521327014218e-05, + "loss": 2.1108, + "step": 4020 + }, + { + "epoch": 0.14327615323082393, + "grad_norm": 1.8123762607574463, + "learning_rate": 1.9099526066350713e-05, + "loss": 2.1003, + "step": 4030 + }, + { + "epoch": 0.1436316771842503, + "grad_norm": 1.7275183200836182, + "learning_rate": 1.9146919431279622e-05, + "loss": 2.0628, + "step": 4040 + }, + { + "epoch": 0.14398720113767666, + "grad_norm": 1.761306643486023, + "learning_rate": 1.919431279620853e-05, + "loss": 2.0798, + "step": 4050 + }, + { + "epoch": 0.14434272509110302, + "grad_norm": 1.7983522415161133, + "learning_rate": 1.9241706161137443e-05, + "loss": 2.0592, + "step": 4060 + }, + { + "epoch": 0.14469824904452938, + "grad_norm": 1.7820279598236084, + "learning_rate": 1.928909952606635e-05, + "loss": 2.1256, + "step": 4070 + }, + { + "epoch": 0.14505377299795574, + "grad_norm": 1.7048124074935913, + "learning_rate": 1.9336492890995263e-05, + "loss": 2.0455, + "step": 4080 + }, + { + "epoch": 0.1454092969513821, + "grad_norm": 1.6913201808929443, + "learning_rate": 1.9383886255924172e-05, + "loss": 2.0553, + "step": 4090 + }, + { + "epoch": 0.14576482090480847, + "grad_norm": 1.7552233934402466, + "learning_rate": 1.9431279620853084e-05, + "loss": 2.0603, + "step": 4100 + }, + { + "epoch": 0.14612034485823483, + "grad_norm": 1.876004934310913, + "learning_rate": 1.9478672985781993e-05, + "loss": 2.055, + "step": 4110 + }, + { + "epoch": 0.1464758688116612, + "grad_norm": 1.7505298852920532, + "learning_rate": 1.95260663507109e-05, + "loss": 2.0663, + "step": 4120 + }, + { + "epoch": 0.14683139276508755, + "grad_norm": 1.703034520149231, + "learning_rate": 1.957345971563981e-05, + "loss": 2.057, + "step": 4130 + }, + { + "epoch": 0.14718691671851392, + "grad_norm": 1.696067214012146, + "learning_rate": 1.9620853080568722e-05, + "loss": 2.056, + "step": 4140 + }, + { + "epoch": 0.14754244067194028, + "grad_norm": 1.7719448804855347, + "learning_rate": 1.966824644549763e-05, + "loss": 2.0528, + "step": 4150 + }, + { + "epoch": 0.14789796462536664, + "grad_norm": 1.7029516696929932, + "learning_rate": 1.971563981042654e-05, + "loss": 2.0723, + "step": 4160 + }, + { + "epoch": 0.148253488578793, + "grad_norm": 1.654297113418579, + "learning_rate": 1.976303317535545e-05, + "loss": 2.1012, + "step": 4170 + }, + { + "epoch": 0.14860901253221936, + "grad_norm": 1.712504506111145, + "learning_rate": 1.9810426540284364e-05, + "loss": 2.0764, + "step": 4180 + }, + { + "epoch": 0.14896453648564573, + "grad_norm": 1.783565878868103, + "learning_rate": 1.9857819905213272e-05, + "loss": 2.0711, + "step": 4190 + }, + { + "epoch": 0.1493200604390721, + "grad_norm": 1.6996036767959595, + "learning_rate": 1.990521327014218e-05, + "loss": 2.0745, + "step": 4200 + }, + { + "epoch": 0.14967558439249845, + "grad_norm": 1.9073193073272705, + "learning_rate": 1.9952606635071093e-05, + "loss": 2.0869, + "step": 4210 + }, + { + "epoch": 0.1500311083459248, + "grad_norm": 1.7565745115280151, + "learning_rate": 2e-05, + "loss": 2.0942, + "step": 4220 + }, + { + "epoch": 0.15038663229935118, + "grad_norm": 1.7577910423278809, + "learning_rate": 1.9999999232031353e-05, + "loss": 2.0911, + "step": 4230 + }, + { + "epoch": 0.15074215625277754, + "grad_norm": 1.8030418157577515, + "learning_rate": 1.999999692812552e-05, + "loss": 2.0636, + "step": 4240 + }, + { + "epoch": 0.1510976802062039, + "grad_norm": 1.677435040473938, + "learning_rate": 1.9999993088282862e-05, + "loss": 2.0763, + "step": 4250 + }, + { + "epoch": 0.15145320415963026, + "grad_norm": 1.6775643825531006, + "learning_rate": 1.9999987712503962e-05, + "loss": 2.0756, + "step": 4260 + }, + { + "epoch": 0.15180872811305662, + "grad_norm": 1.6850625276565552, + "learning_rate": 1.9999980800789655e-05, + "loss": 2.0389, + "step": 4270 + }, + { + "epoch": 0.15216425206648299, + "grad_norm": 1.7442346811294556, + "learning_rate": 1.9999972353140994e-05, + "loss": 2.0205, + "step": 4280 + }, + { + "epoch": 0.15251977601990935, + "grad_norm": 1.7059242725372314, + "learning_rate": 1.9999962369559283e-05, + "loss": 2.1182, + "step": 4290 + }, + { + "epoch": 0.1528752999733357, + "grad_norm": 1.754390835762024, + "learning_rate": 1.999995085004605e-05, + "loss": 2.1102, + "step": 4300 + }, + { + "epoch": 0.15323082392676207, + "grad_norm": 1.7949542999267578, + "learning_rate": 1.9999937794603067e-05, + "loss": 2.0428, + "step": 4310 + }, + { + "epoch": 0.15358634788018843, + "grad_norm": 1.7033107280731201, + "learning_rate": 1.999992320323234e-05, + "loss": 2.0979, + "step": 4320 + }, + { + "epoch": 0.1539418718336148, + "grad_norm": 1.6337509155273438, + "learning_rate": 1.9999907075936108e-05, + "loss": 2.0603, + "step": 4330 + }, + { + "epoch": 0.15429739578704116, + "grad_norm": 1.660717248916626, + "learning_rate": 1.999988941271685e-05, + "loss": 1.9921, + "step": 4340 + }, + { + "epoch": 0.15465291974046752, + "grad_norm": 1.606544017791748, + "learning_rate": 1.9999870213577273e-05, + "loss": 2.0588, + "step": 4350 + }, + { + "epoch": 0.15500844369389388, + "grad_norm": 1.9054453372955322, + "learning_rate": 1.9999849478520335e-05, + "loss": 2.0657, + "step": 4360 + }, + { + "epoch": 0.15536396764732024, + "grad_norm": 1.735126256942749, + "learning_rate": 1.999982720754922e-05, + "loss": 2.0433, + "step": 4370 + }, + { + "epoch": 0.1557194916007466, + "grad_norm": 1.8115259408950806, + "learning_rate": 1.999980340066734e-05, + "loss": 2.0617, + "step": 4380 + }, + { + "epoch": 0.15607501555417297, + "grad_norm": 1.7404751777648926, + "learning_rate": 1.9999778057878355e-05, + "loss": 2.1696, + "step": 4390 + }, + { + "epoch": 0.15643053950759933, + "grad_norm": 1.8304661512374878, + "learning_rate": 1.9999751179186165e-05, + "loss": 2.0745, + "step": 4400 + }, + { + "epoch": 0.1567860634610257, + "grad_norm": 1.7348082065582275, + "learning_rate": 1.999972276459489e-05, + "loss": 2.0759, + "step": 4410 + }, + { + "epoch": 0.15714158741445206, + "grad_norm": 1.7998183965682983, + "learning_rate": 1.99996928141089e-05, + "loss": 2.0714, + "step": 4420 + }, + { + "epoch": 0.15749711136787842, + "grad_norm": 1.7887214422225952, + "learning_rate": 1.999966132773279e-05, + "loss": 2.0458, + "step": 4430 + }, + { + "epoch": 0.15785263532130478, + "grad_norm": 1.6847667694091797, + "learning_rate": 1.9999628305471398e-05, + "loss": 2.0631, + "step": 4440 + }, + { + "epoch": 0.15820815927473114, + "grad_norm": 1.6815139055252075, + "learning_rate": 1.99995937473298e-05, + "loss": 2.0624, + "step": 4450 + }, + { + "epoch": 0.1585636832281575, + "grad_norm": 1.7544565200805664, + "learning_rate": 1.9999557653313297e-05, + "loss": 2.0384, + "step": 4460 + }, + { + "epoch": 0.15891920718158387, + "grad_norm": 1.6978838443756104, + "learning_rate": 1.9999520023427436e-05, + "loss": 2.1006, + "step": 4470 + }, + { + "epoch": 0.15927473113501023, + "grad_norm": 1.7811368703842163, + "learning_rate": 1.9999480857677996e-05, + "loss": 1.9965, + "step": 4480 + }, + { + "epoch": 0.1596302550884366, + "grad_norm": 1.7523053884506226, + "learning_rate": 1.9999440156070994e-05, + "loss": 2.0728, + "step": 4490 + }, + { + "epoch": 0.15998577904186295, + "grad_norm": 1.7489559650421143, + "learning_rate": 1.999939791861268e-05, + "loss": 2.0528, + "step": 4500 + }, + { + "epoch": 0.16034130299528931, + "grad_norm": 1.6864522695541382, + "learning_rate": 1.9999354145309547e-05, + "loss": 2.0547, + "step": 4510 + }, + { + "epoch": 0.16069682694871568, + "grad_norm": 1.8162442445755005, + "learning_rate": 1.999930883616831e-05, + "loss": 2.0844, + "step": 4520 + }, + { + "epoch": 0.16105235090214204, + "grad_norm": 1.7836850881576538, + "learning_rate": 1.9999261991195932e-05, + "loss": 2.0555, + "step": 4530 + }, + { + "epoch": 0.1614078748555684, + "grad_norm": 1.7825000286102295, + "learning_rate": 1.999921361039961e-05, + "loss": 2.1001, + "step": 4540 + }, + { + "epoch": 0.16176339880899476, + "grad_norm": 1.6806426048278809, + "learning_rate": 1.9999163693786773e-05, + "loss": 2.0538, + "step": 4550 + }, + { + "epoch": 0.16211892276242112, + "grad_norm": 1.753866195678711, + "learning_rate": 1.9999112241365087e-05, + "loss": 2.066, + "step": 4560 + }, + { + "epoch": 0.1624744467158475, + "grad_norm": 1.8063266277313232, + "learning_rate": 1.9999059253142455e-05, + "loss": 2.0863, + "step": 4570 + }, + { + "epoch": 0.16282997066927385, + "grad_norm": 1.6922051906585693, + "learning_rate": 1.9999004729127015e-05, + "loss": 2.0699, + "step": 4580 + }, + { + "epoch": 0.1631854946227002, + "grad_norm": 1.783348798751831, + "learning_rate": 1.9998948669327146e-05, + "loss": 2.0548, + "step": 4590 + }, + { + "epoch": 0.16354101857612657, + "grad_norm": 1.8088620901107788, + "learning_rate": 1.9998891073751455e-05, + "loss": 2.036, + "step": 4600 + }, + { + "epoch": 0.16389654252955294, + "grad_norm": 1.6807172298431396, + "learning_rate": 1.9998831942408786e-05, + "loss": 2.0789, + "step": 4610 + }, + { + "epoch": 0.1642520664829793, + "grad_norm": 1.806488275527954, + "learning_rate": 1.9998771275308225e-05, + "loss": 2.0311, + "step": 4620 + }, + { + "epoch": 0.16460759043640566, + "grad_norm": 1.6912070512771606, + "learning_rate": 1.999870907245909e-05, + "loss": 2.0759, + "step": 4630 + }, + { + "epoch": 0.16496311438983202, + "grad_norm": 1.8495771884918213, + "learning_rate": 1.9998645333870933e-05, + "loss": 2.0675, + "step": 4640 + }, + { + "epoch": 0.16531863834325838, + "grad_norm": 1.8063349723815918, + "learning_rate": 1.9998580059553545e-05, + "loss": 2.0519, + "step": 4650 + }, + { + "epoch": 0.16567416229668475, + "grad_norm": 1.629701018333435, + "learning_rate": 1.999851324951695e-05, + "loss": 2.05, + "step": 4660 + }, + { + "epoch": 0.1660296862501111, + "grad_norm": 1.757324457168579, + "learning_rate": 1.9998444903771414e-05, + "loss": 2.0642, + "step": 4670 + }, + { + "epoch": 0.16638521020353747, + "grad_norm": 1.7033988237380981, + "learning_rate": 1.999837502232743e-05, + "loss": 2.0087, + "step": 4680 + }, + { + "epoch": 0.16674073415696383, + "grad_norm": 1.8296514749526978, + "learning_rate": 1.9998303605195733e-05, + "loss": 2.0361, + "step": 4690 + }, + { + "epoch": 0.1670962581103902, + "grad_norm": 1.7206568717956543, + "learning_rate": 1.999823065238729e-05, + "loss": 2.031, + "step": 4700 + }, + { + "epoch": 0.16745178206381656, + "grad_norm": 1.7555227279663086, + "learning_rate": 1.999815616391331e-05, + "loss": 2.0715, + "step": 4710 + }, + { + "epoch": 0.16780730601724292, + "grad_norm": 1.6362791061401367, + "learning_rate": 1.9998080139785233e-05, + "loss": 2.0709, + "step": 4720 + }, + { + "epoch": 0.16816282997066928, + "grad_norm": 1.6888320446014404, + "learning_rate": 1.999800258001473e-05, + "loss": 2.1093, + "step": 4730 + }, + { + "epoch": 0.16851835392409564, + "grad_norm": 1.8050616979599, + "learning_rate": 1.9997923484613726e-05, + "loss": 1.9949, + "step": 4740 + }, + { + "epoch": 0.168873877877522, + "grad_norm": 1.8730134963989258, + "learning_rate": 1.9997842853594358e-05, + "loss": 2.0634, + "step": 4750 + }, + { + "epoch": 0.16922940183094837, + "grad_norm": 1.7403571605682373, + "learning_rate": 1.999776068696902e-05, + "loss": 2.0523, + "step": 4760 + }, + { + "epoch": 0.16958492578437473, + "grad_norm": 1.7689752578735352, + "learning_rate": 1.999767698475032e-05, + "loss": 2.0308, + "step": 4770 + }, + { + "epoch": 0.1699404497378011, + "grad_norm": 1.739279866218567, + "learning_rate": 1.9997591746951124e-05, + "loss": 2.0456, + "step": 4780 + }, + { + "epoch": 0.17029597369122745, + "grad_norm": 1.750306248664856, + "learning_rate": 1.9997504973584522e-05, + "loss": 2.0603, + "step": 4790 + }, + { + "epoch": 0.17065149764465382, + "grad_norm": 1.6228183507919312, + "learning_rate": 1.9997416664663836e-05, + "loss": 2.0419, + "step": 4800 + }, + { + "epoch": 0.17100702159808018, + "grad_norm": 1.744736671447754, + "learning_rate": 1.9997326820202637e-05, + "loss": 2.085, + "step": 4810 + }, + { + "epoch": 0.17136254555150654, + "grad_norm": 1.699845314025879, + "learning_rate": 1.9997235440214727e-05, + "loss": 2.0595, + "step": 4820 + }, + { + "epoch": 0.1717180695049329, + "grad_norm": 1.7860050201416016, + "learning_rate": 1.9997142524714133e-05, + "loss": 2.0442, + "step": 4830 + }, + { + "epoch": 0.17207359345835926, + "grad_norm": 1.6302872896194458, + "learning_rate": 1.999704807371513e-05, + "loss": 1.9743, + "step": 4840 + }, + { + "epoch": 0.17242911741178563, + "grad_norm": 1.6316944360733032, + "learning_rate": 1.9996952087232224e-05, + "loss": 2.0304, + "step": 4850 + }, + { + "epoch": 0.172784641365212, + "grad_norm": 1.5872083902359009, + "learning_rate": 1.999685456528016e-05, + "loss": 2.0263, + "step": 4860 + }, + { + "epoch": 0.17314016531863835, + "grad_norm": 1.7513489723205566, + "learning_rate": 1.9996755507873913e-05, + "loss": 2.0782, + "step": 4870 + }, + { + "epoch": 0.1734956892720647, + "grad_norm": 1.7436871528625488, + "learning_rate": 1.9996654915028702e-05, + "loss": 2.028, + "step": 4880 + }, + { + "epoch": 0.17385121322549107, + "grad_norm": 1.938206672668457, + "learning_rate": 1.9996552786759976e-05, + "loss": 2.0195, + "step": 4890 + }, + { + "epoch": 0.17420673717891744, + "grad_norm": 1.8229395151138306, + "learning_rate": 1.999644912308342e-05, + "loss": 2.0548, + "step": 4900 + }, + { + "epoch": 0.1745622611323438, + "grad_norm": 1.8051745891571045, + "learning_rate": 1.9996343924014955e-05, + "loss": 2.0011, + "step": 4910 + }, + { + "epoch": 0.17491778508577016, + "grad_norm": 1.7209690809249878, + "learning_rate": 1.9996237189570745e-05, + "loss": 2.0255, + "step": 4920 + }, + { + "epoch": 0.17527330903919652, + "grad_norm": 1.8421382904052734, + "learning_rate": 1.999612891976718e-05, + "loss": 2.0211, + "step": 4930 + }, + { + "epoch": 0.17562883299262289, + "grad_norm": 1.680746078491211, + "learning_rate": 1.9996019114620883e-05, + "loss": 2.0211, + "step": 4940 + }, + { + "epoch": 0.17598435694604925, + "grad_norm": 1.7081247568130493, + "learning_rate": 1.9995907774148732e-05, + "loss": 2.0696, + "step": 4950 + }, + { + "epoch": 0.1763398808994756, + "grad_norm": 1.6198205947875977, + "learning_rate": 1.999579489836782e-05, + "loss": 2.0525, + "step": 4960 + }, + { + "epoch": 0.17669540485290197, + "grad_norm": 1.6535507440567017, + "learning_rate": 1.999568048729548e-05, + "loss": 2.0414, + "step": 4970 + }, + { + "epoch": 0.17705092880632833, + "grad_norm": 1.5443240404129028, + "learning_rate": 1.9995564540949298e-05, + "loss": 2.0826, + "step": 4980 + }, + { + "epoch": 0.1774064527597547, + "grad_norm": 1.733921766281128, + "learning_rate": 1.9995447059347072e-05, + "loss": 2.0257, + "step": 4990 + }, + { + "epoch": 0.17776197671318106, + "grad_norm": 1.6675341129302979, + "learning_rate": 1.999532804250685e-05, + "loss": 2.0282, + "step": 5000 + }, + { + "epoch": 0.17811750066660742, + "grad_norm": 1.6987435817718506, + "learning_rate": 1.9995207490446913e-05, + "loss": 2.0103, + "step": 5010 + }, + { + "epoch": 0.17847302462003378, + "grad_norm": 1.7035290002822876, + "learning_rate": 1.9995085403185772e-05, + "loss": 2.0162, + "step": 5020 + }, + { + "epoch": 0.17882854857346014, + "grad_norm": 1.6742795705795288, + "learning_rate": 1.9994961780742188e-05, + "loss": 1.9878, + "step": 5030 + }, + { + "epoch": 0.1791840725268865, + "grad_norm": 1.6395647525787354, + "learning_rate": 1.999483662313514e-05, + "loss": 2.0434, + "step": 5040 + }, + { + "epoch": 0.17953959648031287, + "grad_norm": 1.7217377424240112, + "learning_rate": 1.9994709930383857e-05, + "loss": 2.0372, + "step": 5050 + }, + { + "epoch": 0.17989512043373923, + "grad_norm": 1.750941514968872, + "learning_rate": 1.9994581702507793e-05, + "loss": 2.0878, + "step": 5060 + }, + { + "epoch": 0.1802506443871656, + "grad_norm": 1.6906191110610962, + "learning_rate": 1.999445193952665e-05, + "loss": 2.025, + "step": 5070 + }, + { + "epoch": 0.18060616834059195, + "grad_norm": 1.736352801322937, + "learning_rate": 1.999432064146035e-05, + "loss": 2.0176, + "step": 5080 + }, + { + "epoch": 0.18096169229401832, + "grad_norm": 1.7025351524353027, + "learning_rate": 1.9994187808329068e-05, + "loss": 2.0428, + "step": 5090 + }, + { + "epoch": 0.18131721624744468, + "grad_norm": 1.745902419090271, + "learning_rate": 1.99940534401532e-05, + "loss": 2.0525, + "step": 5100 + }, + { + "epoch": 0.18167274020087104, + "grad_norm": 1.695774793624878, + "learning_rate": 1.9993917536953387e-05, + "loss": 2.0147, + "step": 5110 + }, + { + "epoch": 0.1820282641542974, + "grad_norm": 1.876430630683899, + "learning_rate": 1.9993780098750506e-05, + "loss": 2.0695, + "step": 5120 + }, + { + "epoch": 0.18238378810772377, + "grad_norm": 1.6322757005691528, + "learning_rate": 1.999364112556566e-05, + "loss": 2.1169, + "step": 5130 + }, + { + "epoch": 0.18273931206115013, + "grad_norm": 1.739761471748352, + "learning_rate": 1.9993500617420202e-05, + "loss": 2.0632, + "step": 5140 + }, + { + "epoch": 0.1830948360145765, + "grad_norm": 1.6902257204055786, + "learning_rate": 1.9993358574335703e-05, + "loss": 1.982, + "step": 5150 + }, + { + "epoch": 0.18345035996800285, + "grad_norm": 1.7146416902542114, + "learning_rate": 1.999321499633399e-05, + "loss": 2.0088, + "step": 5160 + }, + { + "epoch": 0.18380588392142921, + "grad_norm": 1.7331721782684326, + "learning_rate": 1.999306988343711e-05, + "loss": 2.0219, + "step": 5170 + }, + { + "epoch": 0.18416140787485558, + "grad_norm": 1.7011710405349731, + "learning_rate": 1.9992923235667354e-05, + "loss": 1.9641, + "step": 5180 + }, + { + "epoch": 0.18451693182828194, + "grad_norm": 1.6844632625579834, + "learning_rate": 1.9992775053047245e-05, + "loss": 2.008, + "step": 5190 + }, + { + "epoch": 0.1848724557817083, + "grad_norm": 1.6663172245025635, + "learning_rate": 1.999262533559954e-05, + "loss": 2.0401, + "step": 5200 + }, + { + "epoch": 0.18522797973513466, + "grad_norm": 1.7183222770690918, + "learning_rate": 1.9992474083347243e-05, + "loss": 2.0093, + "step": 5210 + }, + { + "epoch": 0.18558350368856102, + "grad_norm": 1.7464849948883057, + "learning_rate": 1.9992321296313574e-05, + "loss": 2.0489, + "step": 5220 + }, + { + "epoch": 0.1859390276419874, + "grad_norm": 1.7614524364471436, + "learning_rate": 1.9992166974522012e-05, + "loss": 1.9959, + "step": 5230 + }, + { + "epoch": 0.18629455159541375, + "grad_norm": 1.7039707899093628, + "learning_rate": 1.999201111799625e-05, + "loss": 2.0562, + "step": 5240 + }, + { + "epoch": 0.1866500755488401, + "grad_norm": 1.6510025262832642, + "learning_rate": 1.9991853726760234e-05, + "loss": 2.0293, + "step": 5250 + }, + { + "epoch": 0.18700559950226647, + "grad_norm": 1.7329883575439453, + "learning_rate": 1.999169480083813e-05, + "loss": 2.0111, + "step": 5260 + }, + { + "epoch": 0.18736112345569284, + "grad_norm": 1.5895287990570068, + "learning_rate": 1.999153434025436e-05, + "loss": 2.0381, + "step": 5270 + }, + { + "epoch": 0.1877166474091192, + "grad_norm": 1.7224647998809814, + "learning_rate": 1.9991372345033558e-05, + "loss": 2.0457, + "step": 5280 + }, + { + "epoch": 0.18807217136254556, + "grad_norm": 1.6291033029556274, + "learning_rate": 1.999120881520061e-05, + "loss": 1.9931, + "step": 5290 + }, + { + "epoch": 0.18842769531597192, + "grad_norm": 1.7096868753433228, + "learning_rate": 1.9991043750780636e-05, + "loss": 2.0432, + "step": 5300 + }, + { + "epoch": 0.18878321926939828, + "grad_norm": 1.709851861000061, + "learning_rate": 1.9990877151798983e-05, + "loss": 2.0747, + "step": 5310 + }, + { + "epoch": 0.18913874322282465, + "grad_norm": 1.6630563735961914, + "learning_rate": 1.9990709018281244e-05, + "loss": 2.0401, + "step": 5320 + }, + { + "epoch": 0.189494267176251, + "grad_norm": 1.7594939470291138, + "learning_rate": 1.9990539350253244e-05, + "loss": 2.0068, + "step": 5330 + }, + { + "epoch": 0.18984979112967737, + "grad_norm": 1.6987512111663818, + "learning_rate": 1.999036814774104e-05, + "loss": 2.0458, + "step": 5340 + }, + { + "epoch": 0.19020531508310373, + "grad_norm": 1.7822080850601196, + "learning_rate": 1.999019541077093e-05, + "loss": 2.0003, + "step": 5350 + }, + { + "epoch": 0.1905608390365301, + "grad_norm": 1.6965094804763794, + "learning_rate": 1.9990021139369437e-05, + "loss": 2.0074, + "step": 5360 + }, + { + "epoch": 0.19091636298995646, + "grad_norm": 1.7385228872299194, + "learning_rate": 1.998984533356334e-05, + "loss": 2.007, + "step": 5370 + }, + { + "epoch": 0.19127188694338282, + "grad_norm": 1.8442659378051758, + "learning_rate": 1.9989667993379636e-05, + "loss": 2.0201, + "step": 5380 + }, + { + "epoch": 0.19162741089680918, + "grad_norm": 1.6964536905288696, + "learning_rate": 1.9989489118845565e-05, + "loss": 2.0059, + "step": 5390 + }, + { + "epoch": 0.19198293485023554, + "grad_norm": 1.6384342908859253, + "learning_rate": 1.99893087099886e-05, + "loss": 2.035, + "step": 5400 + }, + { + "epoch": 0.1923384588036619, + "grad_norm": 1.866665005683899, + "learning_rate": 1.998912676683645e-05, + "loss": 2.039, + "step": 5410 + }, + { + "epoch": 0.19269398275708827, + "grad_norm": 1.658677101135254, + "learning_rate": 1.998894328941706e-05, + "loss": 2.0323, + "step": 5420 + }, + { + "epoch": 0.19304950671051463, + "grad_norm": 1.708578109741211, + "learning_rate": 1.9988758277758613e-05, + "loss": 1.9997, + "step": 5430 + }, + { + "epoch": 0.193405030663941, + "grad_norm": 1.7182897329330444, + "learning_rate": 1.9988571731889522e-05, + "loss": 2.0486, + "step": 5440 + }, + { + "epoch": 0.19376055461736735, + "grad_norm": 1.7253814935684204, + "learning_rate": 1.9988383651838447e-05, + "loss": 2.068, + "step": 5450 + }, + { + "epoch": 0.19411607857079372, + "grad_norm": 1.7060468196868896, + "learning_rate": 1.9988194037634267e-05, + "loss": 2.0753, + "step": 5460 + }, + { + "epoch": 0.19447160252422008, + "grad_norm": 1.7148746252059937, + "learning_rate": 1.998800288930611e-05, + "loss": 2.1006, + "step": 5470 + }, + { + "epoch": 0.19482712647764644, + "grad_norm": 1.7313423156738281, + "learning_rate": 1.9987810206883334e-05, + "loss": 2.0498, + "step": 5480 + }, + { + "epoch": 0.1951826504310728, + "grad_norm": 1.7278350591659546, + "learning_rate": 1.9987615990395536e-05, + "loss": 2.0793, + "step": 5490 + }, + { + "epoch": 0.19553817438449916, + "grad_norm": 1.7059909105300903, + "learning_rate": 1.998742023987254e-05, + "loss": 2.0506, + "step": 5500 + }, + { + "epoch": 0.19589369833792553, + "grad_norm": 1.6768639087677002, + "learning_rate": 1.9987222955344423e-05, + "loss": 2.0102, + "step": 5510 + }, + { + "epoch": 0.1962492222913519, + "grad_norm": 1.6817662715911865, + "learning_rate": 1.998702413684148e-05, + "loss": 2.0643, + "step": 5520 + }, + { + "epoch": 0.19660474624477825, + "grad_norm": 1.687806248664856, + "learning_rate": 1.9986823784394244e-05, + "loss": 2.0429, + "step": 5530 + }, + { + "epoch": 0.1969602701982046, + "grad_norm": 1.6257810592651367, + "learning_rate": 1.9986621898033496e-05, + "loss": 2.0082, + "step": 5540 + }, + { + "epoch": 0.19731579415163097, + "grad_norm": 1.6978223323822021, + "learning_rate": 1.9986418477790237e-05, + "loss": 2.0416, + "step": 5550 + }, + { + "epoch": 0.19767131810505734, + "grad_norm": 1.6773324012756348, + "learning_rate": 1.998621352369572e-05, + "loss": 2.0248, + "step": 5560 + }, + { + "epoch": 0.1980268420584837, + "grad_norm": 1.616140365600586, + "learning_rate": 1.998600703578142e-05, + "loss": 1.9969, + "step": 5570 + }, + { + "epoch": 0.19838236601191006, + "grad_norm": 1.7479193210601807, + "learning_rate": 1.9985799014079048e-05, + "loss": 2.0435, + "step": 5580 + }, + { + "epoch": 0.19873788996533642, + "grad_norm": 1.7679039239883423, + "learning_rate": 1.998558945862056e-05, + "loss": 2.0311, + "step": 5590 + }, + { + "epoch": 0.19909341391876278, + "grad_norm": 1.6383432149887085, + "learning_rate": 1.9985378369438143e-05, + "loss": 2.0143, + "step": 5600 + }, + { + "epoch": 0.19944893787218915, + "grad_norm": 1.709038257598877, + "learning_rate": 1.9985165746564215e-05, + "loss": 1.9876, + "step": 5610 + }, + { + "epoch": 0.1998044618256155, + "grad_norm": 1.638545274734497, + "learning_rate": 1.9984951590031437e-05, + "loss": 2.0268, + "step": 5620 + }, + { + "epoch": 0.20015998577904187, + "grad_norm": 1.6110061407089233, + "learning_rate": 1.9984735899872703e-05, + "loss": 2.0289, + "step": 5630 + }, + { + "epoch": 0.20051550973246823, + "grad_norm": 1.7065180540084839, + "learning_rate": 1.9984518676121137e-05, + "loss": 1.9918, + "step": 5640 + }, + { + "epoch": 0.2008710336858946, + "grad_norm": 1.667405366897583, + "learning_rate": 1.9984299918810108e-05, + "loss": 2.0055, + "step": 5650 + }, + { + "epoch": 0.20122655763932096, + "grad_norm": 1.665473461151123, + "learning_rate": 1.998407962797321e-05, + "loss": 2.0696, + "step": 5660 + }, + { + "epoch": 0.20158208159274732, + "grad_norm": 1.6744557619094849, + "learning_rate": 1.9983857803644283e-05, + "loss": 2.0162, + "step": 5670 + }, + { + "epoch": 0.20193760554617368, + "grad_norm": 1.6898198127746582, + "learning_rate": 1.9983634445857397e-05, + "loss": 2.0127, + "step": 5680 + }, + { + "epoch": 0.20229312949960004, + "grad_norm": 1.6509838104248047, + "learning_rate": 1.998340955464686e-05, + "loss": 2.0387, + "step": 5690 + }, + { + "epoch": 0.2026486534530264, + "grad_norm": 1.6415313482284546, + "learning_rate": 1.998318313004721e-05, + "loss": 1.9999, + "step": 5700 + }, + { + "epoch": 0.20300417740645277, + "grad_norm": 1.5902107954025269, + "learning_rate": 1.9982955172093227e-05, + "loss": 2.0019, + "step": 5710 + }, + { + "epoch": 0.20335970135987913, + "grad_norm": 1.6910830736160278, + "learning_rate": 1.9982725680819922e-05, + "loss": 2.0246, + "step": 5720 + }, + { + "epoch": 0.2037152253133055, + "grad_norm": 1.7578151226043701, + "learning_rate": 1.9982494656262544e-05, + "loss": 2.0488, + "step": 5730 + }, + { + "epoch": 0.20407074926673185, + "grad_norm": 1.7221277952194214, + "learning_rate": 1.9982262098456582e-05, + "loss": 2.0274, + "step": 5740 + }, + { + "epoch": 0.20442627322015822, + "grad_norm": 1.6714609861373901, + "learning_rate": 1.9982028007437745e-05, + "loss": 1.9624, + "step": 5750 + }, + { + "epoch": 0.20478179717358458, + "grad_norm": 1.7402448654174805, + "learning_rate": 1.9981792383242e-05, + "loss": 2.0711, + "step": 5760 + }, + { + "epoch": 0.20513732112701094, + "grad_norm": 1.6645225286483765, + "learning_rate": 1.9981555225905526e-05, + "loss": 2.0254, + "step": 5770 + }, + { + "epoch": 0.2054928450804373, + "grad_norm": 1.6826421022415161, + "learning_rate": 1.9981316535464758e-05, + "loss": 1.9941, + "step": 5780 + }, + { + "epoch": 0.20584836903386367, + "grad_norm": 1.8125617504119873, + "learning_rate": 1.998107631195635e-05, + "loss": 1.9914, + "step": 5790 + }, + { + "epoch": 0.20620389298729003, + "grad_norm": 1.7178610563278198, + "learning_rate": 1.9980834555417203e-05, + "loss": 2.0078, + "step": 5800 + }, + { + "epoch": 0.2065594169407164, + "grad_norm": 1.7131744623184204, + "learning_rate": 1.998059126588445e-05, + "loss": 2.0586, + "step": 5810 + }, + { + "epoch": 0.20691494089414275, + "grad_norm": 1.646399974822998, + "learning_rate": 1.9980346443395454e-05, + "loss": 1.9898, + "step": 5820 + }, + { + "epoch": 0.2072704648475691, + "grad_norm": 1.6671161651611328, + "learning_rate": 1.9980100087987826e-05, + "loss": 1.9859, + "step": 5830 + }, + { + "epoch": 0.20762598880099548, + "grad_norm": 1.600407361984253, + "learning_rate": 1.9979852199699402e-05, + "loss": 2.0468, + "step": 5840 + }, + { + "epoch": 0.20798151275442184, + "grad_norm": 1.7410165071487427, + "learning_rate": 1.9979602778568246e-05, + "loss": 1.9843, + "step": 5850 + }, + { + "epoch": 0.2083370367078482, + "grad_norm": 1.652017593383789, + "learning_rate": 1.9979351824632683e-05, + "loss": 2.0221, + "step": 5860 + }, + { + "epoch": 0.20869256066127456, + "grad_norm": 1.7493135929107666, + "learning_rate": 1.997909933793125e-05, + "loss": 2.0233, + "step": 5870 + }, + { + "epoch": 0.20904808461470092, + "grad_norm": 1.6278657913208008, + "learning_rate": 1.9978845318502724e-05, + "loss": 2.0155, + "step": 5880 + }, + { + "epoch": 0.2094036085681273, + "grad_norm": 1.7754439115524292, + "learning_rate": 1.9978589766386126e-05, + "loss": 2.0214, + "step": 5890 + }, + { + "epoch": 0.20975913252155365, + "grad_norm": 1.7136434316635132, + "learning_rate": 1.997833268162071e-05, + "loss": 2.0478, + "step": 5900 + }, + { + "epoch": 0.21011465647498, + "grad_norm": 1.8517942428588867, + "learning_rate": 1.9978074064245955e-05, + "loss": 1.9675, + "step": 5910 + }, + { + "epoch": 0.21047018042840637, + "grad_norm": 1.7563371658325195, + "learning_rate": 1.997781391430159e-05, + "loss": 1.9882, + "step": 5920 + }, + { + "epoch": 0.21082570438183273, + "grad_norm": 1.78915274143219, + "learning_rate": 1.9977552231827566e-05, + "loss": 2.0131, + "step": 5930 + }, + { + "epoch": 0.2111812283352591, + "grad_norm": 1.6726717948913574, + "learning_rate": 1.997728901686408e-05, + "loss": 2.0216, + "step": 5940 + }, + { + "epoch": 0.21153675228868546, + "grad_norm": 1.6768577098846436, + "learning_rate": 1.9977024269451563e-05, + "loss": 2.013, + "step": 5950 + }, + { + "epoch": 0.21189227624211182, + "grad_norm": 1.6979024410247803, + "learning_rate": 1.9976757989630667e-05, + "loss": 1.9895, + "step": 5960 + }, + { + "epoch": 0.21224780019553818, + "grad_norm": 1.6715058088302612, + "learning_rate": 1.9976490177442303e-05, + "loss": 2.0148, + "step": 5970 + }, + { + "epoch": 0.21260332414896455, + "grad_norm": 1.655503273010254, + "learning_rate": 1.99762208329276e-05, + "loss": 1.9948, + "step": 5980 + }, + { + "epoch": 0.2129588481023909, + "grad_norm": 1.6729159355163574, + "learning_rate": 1.9975949956127928e-05, + "loss": 2.0333, + "step": 5990 + }, + { + "epoch": 0.21331437205581727, + "grad_norm": 1.8006285429000854, + "learning_rate": 1.9975677547084892e-05, + "loss": 2.0658, + "step": 6000 + }, + { + "epoch": 0.21366989600924363, + "grad_norm": 1.688770055770874, + "learning_rate": 1.9975403605840336e-05, + "loss": 2.0038, + "step": 6010 + }, + { + "epoch": 0.21402541996267, + "grad_norm": 1.6850205659866333, + "learning_rate": 1.997512813243633e-05, + "loss": 2.0117, + "step": 6020 + }, + { + "epoch": 0.21438094391609636, + "grad_norm": 1.7397541999816895, + "learning_rate": 1.9974851126915185e-05, + "loss": 1.9922, + "step": 6030 + }, + { + "epoch": 0.21473646786952272, + "grad_norm": 1.6360909938812256, + "learning_rate": 1.9974572589319456e-05, + "loss": 2.0097, + "step": 6040 + }, + { + "epoch": 0.21509199182294908, + "grad_norm": 1.600882649421692, + "learning_rate": 1.9974292519691912e-05, + "loss": 2.0519, + "step": 6050 + }, + { + "epoch": 0.21544751577637544, + "grad_norm": 1.6638786792755127, + "learning_rate": 1.9974010918075582e-05, + "loss": 2.0346, + "step": 6060 + }, + { + "epoch": 0.2158030397298018, + "grad_norm": 1.6250038146972656, + "learning_rate": 1.997372778451371e-05, + "loss": 2.0147, + "step": 6070 + }, + { + "epoch": 0.21615856368322817, + "grad_norm": 1.7188122272491455, + "learning_rate": 1.9973443119049785e-05, + "loss": 2.0257, + "step": 6080 + }, + { + "epoch": 0.21651408763665453, + "grad_norm": 1.798150897026062, + "learning_rate": 1.997315692172753e-05, + "loss": 1.9708, + "step": 6090 + }, + { + "epoch": 0.2168696115900809, + "grad_norm": 1.6894288063049316, + "learning_rate": 1.9972869192590906e-05, + "loss": 1.9706, + "step": 6100 + }, + { + "epoch": 0.21722513554350725, + "grad_norm": 1.6506953239440918, + "learning_rate": 1.99725799316841e-05, + "loss": 1.9368, + "step": 6110 + }, + { + "epoch": 0.21758065949693361, + "grad_norm": 1.7530012130737305, + "learning_rate": 1.9972289139051553e-05, + "loss": 2.0346, + "step": 6120 + }, + { + "epoch": 0.21793618345035998, + "grad_norm": 1.7064405679702759, + "learning_rate": 1.9971996814737916e-05, + "loss": 2.0102, + "step": 6130 + }, + { + "epoch": 0.21829170740378634, + "grad_norm": 1.6821569204330444, + "learning_rate": 1.9971702958788092e-05, + "loss": 2.0345, + "step": 6140 + }, + { + "epoch": 0.2186472313572127, + "grad_norm": 1.6987396478652954, + "learning_rate": 1.997140757124722e-05, + "loss": 2.0189, + "step": 6150 + }, + { + "epoch": 0.21900275531063906, + "grad_norm": 1.6398649215698242, + "learning_rate": 1.9971110652160665e-05, + "loss": 2.0298, + "step": 6160 + }, + { + "epoch": 0.21935827926406543, + "grad_norm": 1.791056752204895, + "learning_rate": 1.9970812201574033e-05, + "loss": 1.9631, + "step": 6170 + }, + { + "epoch": 0.2197138032174918, + "grad_norm": 1.6377999782562256, + "learning_rate": 1.9970512219533163e-05, + "loss": 2.0032, + "step": 6180 + }, + { + "epoch": 0.22006932717091815, + "grad_norm": 1.733130931854248, + "learning_rate": 1.9970210706084135e-05, + "loss": 2.0358, + "step": 6190 + }, + { + "epoch": 0.2204248511243445, + "grad_norm": 1.6794812679290771, + "learning_rate": 1.9969907661273254e-05, + "loss": 1.966, + "step": 6200 + }, + { + "epoch": 0.22078037507777087, + "grad_norm": 1.684327483177185, + "learning_rate": 1.996960308514707e-05, + "loss": 2.0665, + "step": 6210 + }, + { + "epoch": 0.22113589903119724, + "grad_norm": 1.670153021812439, + "learning_rate": 1.9969296977752358e-05, + "loss": 1.9962, + "step": 6220 + }, + { + "epoch": 0.2214914229846236, + "grad_norm": 1.7059471607208252, + "learning_rate": 1.996898933913614e-05, + "loss": 2.0477, + "step": 6230 + }, + { + "epoch": 0.22184694693804996, + "grad_norm": 1.7042587995529175, + "learning_rate": 1.996868016934567e-05, + "loss": 2.0678, + "step": 6240 + }, + { + "epoch": 0.22220247089147632, + "grad_norm": 1.6933459043502808, + "learning_rate": 1.9968369468428422e-05, + "loss": 1.9882, + "step": 6250 + }, + { + "epoch": 0.22255799484490268, + "grad_norm": 1.7479889392852783, + "learning_rate": 1.9968057236432132e-05, + "loss": 1.985, + "step": 6260 + }, + { + "epoch": 0.22291351879832905, + "grad_norm": 1.6197444200515747, + "learning_rate": 1.996774347340475e-05, + "loss": 1.9593, + "step": 6270 + }, + { + "epoch": 0.2232690427517554, + "grad_norm": 1.6699209213256836, + "learning_rate": 1.9967428179394464e-05, + "loss": 2.0094, + "step": 6280 + }, + { + "epoch": 0.22362456670518177, + "grad_norm": 1.7393150329589844, + "learning_rate": 1.996711135444971e-05, + "loss": 1.9976, + "step": 6290 + }, + { + "epoch": 0.22398009065860813, + "grad_norm": 1.6761125326156616, + "learning_rate": 1.9966792998619147e-05, + "loss": 1.9891, + "step": 6300 + }, + { + "epoch": 0.2243356146120345, + "grad_norm": 1.7271678447723389, + "learning_rate": 1.996647311195167e-05, + "loss": 2.0581, + "step": 6310 + }, + { + "epoch": 0.22469113856546086, + "grad_norm": 1.7058919668197632, + "learning_rate": 1.9966151694496413e-05, + "loss": 2.0233, + "step": 6320 + }, + { + "epoch": 0.22504666251888722, + "grad_norm": 1.668379306793213, + "learning_rate": 1.9965828746302743e-05, + "loss": 2.0138, + "step": 6330 + }, + { + "epoch": 0.22540218647231358, + "grad_norm": 1.7820411920547485, + "learning_rate": 1.9965504267420266e-05, + "loss": 2.0, + "step": 6340 + }, + { + "epoch": 0.22575771042573994, + "grad_norm": 1.8417869806289673, + "learning_rate": 1.9965178257898818e-05, + "loss": 1.9905, + "step": 6350 + }, + { + "epoch": 0.2261132343791663, + "grad_norm": 1.6538697481155396, + "learning_rate": 1.9964850717788468e-05, + "loss": 1.978, + "step": 6360 + }, + { + "epoch": 0.22646875833259267, + "grad_norm": 1.6233092546463013, + "learning_rate": 1.9964521647139534e-05, + "loss": 1.9075, + "step": 6370 + }, + { + "epoch": 0.22682428228601903, + "grad_norm": 1.6171590089797974, + "learning_rate": 1.996419104600255e-05, + "loss": 1.9938, + "step": 6380 + }, + { + "epoch": 0.2271798062394454, + "grad_norm": 1.6618120670318604, + "learning_rate": 1.9963858914428295e-05, + "loss": 2.0056, + "step": 6390 + }, + { + "epoch": 0.22753533019287175, + "grad_norm": 1.6301599740982056, + "learning_rate": 1.9963525252467787e-05, + "loss": 2.0055, + "step": 6400 + }, + { + "epoch": 0.22789085414629812, + "grad_norm": 1.8050549030303955, + "learning_rate": 1.996319006017227e-05, + "loss": 2.0112, + "step": 6410 + }, + { + "epoch": 0.22824637809972448, + "grad_norm": 1.8067948818206787, + "learning_rate": 1.9962853337593234e-05, + "loss": 1.9908, + "step": 6420 + }, + { + "epoch": 0.22860190205315084, + "grad_norm": 1.656425952911377, + "learning_rate": 1.996251508478239e-05, + "loss": 2.0401, + "step": 6430 + }, + { + "epoch": 0.2289574260065772, + "grad_norm": 1.7688844203948975, + "learning_rate": 1.9962175301791695e-05, + "loss": 1.9961, + "step": 6440 + }, + { + "epoch": 0.22931294996000356, + "grad_norm": 1.6689438819885254, + "learning_rate": 1.9961833988673332e-05, + "loss": 1.9763, + "step": 6450 + }, + { + "epoch": 0.22966847391342993, + "grad_norm": 1.6061345338821411, + "learning_rate": 1.9961491145479736e-05, + "loss": 2.0064, + "step": 6460 + }, + { + "epoch": 0.2300239978668563, + "grad_norm": 1.6575069427490234, + "learning_rate": 1.9961146772263557e-05, + "loss": 2.0424, + "step": 6470 + }, + { + "epoch": 0.23037952182028265, + "grad_norm": 1.7679375410079956, + "learning_rate": 1.996080086907769e-05, + "loss": 2.0052, + "step": 6480 + }, + { + "epoch": 0.230735045773709, + "grad_norm": 1.7035895586013794, + "learning_rate": 1.9960453435975266e-05, + "loss": 2.0053, + "step": 6490 + }, + { + "epoch": 0.23109056972713538, + "grad_norm": 1.6635199785232544, + "learning_rate": 1.9960104473009645e-05, + "loss": 2.0175, + "step": 6500 + }, + { + "epoch": 0.23144609368056174, + "grad_norm": 1.6349929571151733, + "learning_rate": 1.9959753980234428e-05, + "loss": 2.0434, + "step": 6510 + }, + { + "epoch": 0.2318016176339881, + "grad_norm": 1.7579482793807983, + "learning_rate": 1.9959401957703447e-05, + "loss": 1.9801, + "step": 6520 + }, + { + "epoch": 0.23215714158741446, + "grad_norm": 1.7130075693130493, + "learning_rate": 1.995904840547077e-05, + "loss": 1.9774, + "step": 6530 + }, + { + "epoch": 0.23251266554084082, + "grad_norm": 1.6815969944000244, + "learning_rate": 1.9958693323590706e-05, + "loss": 1.9948, + "step": 6540 + }, + { + "epoch": 0.23286818949426719, + "grad_norm": 1.673559308052063, + "learning_rate": 1.9958336712117783e-05, + "loss": 1.9878, + "step": 6550 + }, + { + "epoch": 0.23322371344769355, + "grad_norm": 1.74596107006073, + "learning_rate": 1.9957978571106785e-05, + "loss": 2.0221, + "step": 6560 + }, + { + "epoch": 0.2335792374011199, + "grad_norm": 1.663725733757019, + "learning_rate": 1.9957618900612714e-05, + "loss": 1.9824, + "step": 6570 + }, + { + "epoch": 0.23393476135454627, + "grad_norm": 1.6240813732147217, + "learning_rate": 1.9957257700690816e-05, + "loss": 2.0462, + "step": 6580 + }, + { + "epoch": 0.23429028530797263, + "grad_norm": 1.791911005973816, + "learning_rate": 1.9956894971396566e-05, + "loss": 1.9706, + "step": 6590 + }, + { + "epoch": 0.234645809261399, + "grad_norm": 1.6861087083816528, + "learning_rate": 1.995653071278568e-05, + "loss": 2.0081, + "step": 6600 + }, + { + "epoch": 0.23500133321482536, + "grad_norm": 1.6701043844223022, + "learning_rate": 1.9956164924914102e-05, + "loss": 2.0197, + "step": 6610 + }, + { + "epoch": 0.23535685716825172, + "grad_norm": 1.657975673675537, + "learning_rate": 1.995579760783802e-05, + "loss": 2.0568, + "step": 6620 + }, + { + "epoch": 0.23571238112167808, + "grad_norm": 1.7297064065933228, + "learning_rate": 1.9955428761613847e-05, + "loss": 1.9838, + "step": 6630 + }, + { + "epoch": 0.23606790507510444, + "grad_norm": 1.5905978679656982, + "learning_rate": 1.9955058386298235e-05, + "loss": 1.9611, + "step": 6640 + }, + { + "epoch": 0.2364234290285308, + "grad_norm": 1.620131492614746, + "learning_rate": 1.9954686481948078e-05, + "loss": 2.028, + "step": 6650 + }, + { + "epoch": 0.23677895298195717, + "grad_norm": 1.6750433444976807, + "learning_rate": 1.9954313048620493e-05, + "loss": 2.0173, + "step": 6660 + }, + { + "epoch": 0.23713447693538353, + "grad_norm": 1.7565199136734009, + "learning_rate": 1.9953938086372833e-05, + "loss": 2.0216, + "step": 6670 + }, + { + "epoch": 0.2374900008888099, + "grad_norm": 1.6780565977096558, + "learning_rate": 1.9953561595262702e-05, + "loss": 1.9907, + "step": 6680 + }, + { + "epoch": 0.23784552484223626, + "grad_norm": 1.7479665279388428, + "learning_rate": 1.9953183575347913e-05, + "loss": 2.011, + "step": 6690 + }, + { + "epoch": 0.23820104879566262, + "grad_norm": 1.497279405593872, + "learning_rate": 1.995280402668654e-05, + "loss": 1.9641, + "step": 6700 + }, + { + "epoch": 0.23855657274908898, + "grad_norm": 1.6642096042633057, + "learning_rate": 1.9952422949336867e-05, + "loss": 2.0186, + "step": 6710 + }, + { + "epoch": 0.23891209670251534, + "grad_norm": 1.632526159286499, + "learning_rate": 1.9952040343357434e-05, + "loss": 1.9739, + "step": 6720 + }, + { + "epoch": 0.2392676206559417, + "grad_norm": 1.690468192100525, + "learning_rate": 1.9951656208807005e-05, + "loss": 2.0481, + "step": 6730 + }, + { + "epoch": 0.23962314460936807, + "grad_norm": 1.7386364936828613, + "learning_rate": 1.9951270545744576e-05, + "loss": 2.0415, + "step": 6740 + }, + { + "epoch": 0.23997866856279443, + "grad_norm": 1.662619709968567, + "learning_rate": 1.9950883354229388e-05, + "loss": 2.0204, + "step": 6750 + }, + { + "epoch": 0.2403341925162208, + "grad_norm": 1.5891426801681519, + "learning_rate": 1.9950494634320912e-05, + "loss": 1.9883, + "step": 6760 + }, + { + "epoch": 0.24068971646964715, + "grad_norm": 1.4959187507629395, + "learning_rate": 1.995010438607885e-05, + "loss": 1.9959, + "step": 6770 + }, + { + "epoch": 0.24104524042307351, + "grad_norm": 1.819345235824585, + "learning_rate": 1.9949712609563136e-05, + "loss": 1.9796, + "step": 6780 + }, + { + "epoch": 0.24140076437649988, + "grad_norm": 1.6453137397766113, + "learning_rate": 1.9949319304833955e-05, + "loss": 1.9728, + "step": 6790 + }, + { + "epoch": 0.24175628832992624, + "grad_norm": 1.695168137550354, + "learning_rate": 1.9948924471951712e-05, + "loss": 1.9577, + "step": 6800 + }, + { + "epoch": 0.2421118122833526, + "grad_norm": 1.682152509689331, + "learning_rate": 1.994852811097705e-05, + "loss": 2.0488, + "step": 6810 + }, + { + "epoch": 0.24246733623677896, + "grad_norm": 1.5780915021896362, + "learning_rate": 1.9948130221970844e-05, + "loss": 1.9847, + "step": 6820 + }, + { + "epoch": 0.24282286019020533, + "grad_norm": 1.7200599908828735, + "learning_rate": 1.9947730804994215e-05, + "loss": 2.0024, + "step": 6830 + }, + { + "epoch": 0.2431783841436317, + "grad_norm": 1.6747233867645264, + "learning_rate": 1.9947329860108507e-05, + "loss": 2.0127, + "step": 6840 + }, + { + "epoch": 0.24353390809705805, + "grad_norm": 1.659424066543579, + "learning_rate": 1.99469273873753e-05, + "loss": 1.9576, + "step": 6850 + }, + { + "epoch": 0.2438894320504844, + "grad_norm": 1.7689529657363892, + "learning_rate": 1.994652338685642e-05, + "loss": 2.0254, + "step": 6860 + }, + { + "epoch": 0.24424495600391077, + "grad_norm": 1.6394208669662476, + "learning_rate": 1.9946117858613905e-05, + "loss": 1.9943, + "step": 6870 + }, + { + "epoch": 0.24460047995733714, + "grad_norm": 1.563705325126648, + "learning_rate": 1.9945710802710056e-05, + "loss": 1.9709, + "step": 6880 + }, + { + "epoch": 0.2449560039107635, + "grad_norm": 1.7137606143951416, + "learning_rate": 1.9945302219207386e-05, + "loss": 1.9595, + "step": 6890 + }, + { + "epoch": 0.24531152786418986, + "grad_norm": 1.663386344909668, + "learning_rate": 1.9944892108168653e-05, + "loss": 2.0311, + "step": 6900 + }, + { + "epoch": 0.24566705181761622, + "grad_norm": 1.6972986459732056, + "learning_rate": 1.9944480469656846e-05, + "loss": 1.9545, + "step": 6910 + }, + { + "epoch": 0.24602257577104258, + "grad_norm": 1.664995789527893, + "learning_rate": 1.994406730373519e-05, + "loss": 1.9906, + "step": 6920 + }, + { + "epoch": 0.24637809972446895, + "grad_norm": 1.7193104028701782, + "learning_rate": 1.9943652610467147e-05, + "loss": 2.0138, + "step": 6930 + }, + { + "epoch": 0.2467336236778953, + "grad_norm": 1.629887342453003, + "learning_rate": 1.9943236389916412e-05, + "loss": 2.0099, + "step": 6940 + }, + { + "epoch": 0.24708914763132167, + "grad_norm": 1.5323138236999512, + "learning_rate": 1.994281864214691e-05, + "loss": 2.0121, + "step": 6950 + }, + { + "epoch": 0.24744467158474803, + "grad_norm": 1.743226170539856, + "learning_rate": 1.9942399367222808e-05, + "loss": 1.938, + "step": 6960 + }, + { + "epoch": 0.2478001955381744, + "grad_norm": 1.6816143989562988, + "learning_rate": 1.99419785652085e-05, + "loss": 2.0286, + "step": 6970 + }, + { + "epoch": 0.24815571949160076, + "grad_norm": 1.6175199747085571, + "learning_rate": 1.9941556236168624e-05, + "loss": 2.0356, + "step": 6980 + }, + { + "epoch": 0.24851124344502712, + "grad_norm": 1.6289336681365967, + "learning_rate": 1.9941132380168043e-05, + "loss": 2.0156, + "step": 6990 + }, + { + "epoch": 0.24886676739845348, + "grad_norm": 1.7291028499603271, + "learning_rate": 1.994070699727186e-05, + "loss": 1.9689, + "step": 7000 + }, + { + "epoch": 0.24922229135187984, + "grad_norm": 1.6669102907180786, + "learning_rate": 1.994028008754541e-05, + "loss": 2.0108, + "step": 7010 + }, + { + "epoch": 0.2495778153053062, + "grad_norm": 1.6643351316452026, + "learning_rate": 1.9939851651054265e-05, + "loss": 2.0142, + "step": 7020 + }, + { + "epoch": 0.24993333925873257, + "grad_norm": 1.6312966346740723, + "learning_rate": 1.993942168786423e-05, + "loss": 1.9666, + "step": 7030 + }, + { + "epoch": 0.25028886321215893, + "grad_norm": 1.6095354557037354, + "learning_rate": 1.9938990198041344e-05, + "loss": 2.0038, + "step": 7040 + }, + { + "epoch": 0.2506443871655853, + "grad_norm": 1.5703375339508057, + "learning_rate": 1.9938557181651882e-05, + "loss": 1.9245, + "step": 7050 + }, + { + "epoch": 0.25099991111901165, + "grad_norm": 1.5972967147827148, + "learning_rate": 1.9938122638762353e-05, + "loss": 1.9794, + "step": 7060 + }, + { + "epoch": 0.251355435072438, + "grad_norm": 1.6072685718536377, + "learning_rate": 1.9937686569439497e-05, + "loss": 1.972, + "step": 7070 + }, + { + "epoch": 0.2517109590258644, + "grad_norm": 1.6625516414642334, + "learning_rate": 1.9937248973750297e-05, + "loss": 2.0329, + "step": 7080 + }, + { + "epoch": 0.25206648297929074, + "grad_norm": 1.6364060640335083, + "learning_rate": 1.993680985176196e-05, + "loss": 1.9573, + "step": 7090 + }, + { + "epoch": 0.2524220069327171, + "grad_norm": 1.6397068500518799, + "learning_rate": 1.9936369203541932e-05, + "loss": 1.9711, + "step": 7100 + }, + { + "epoch": 0.25277753088614346, + "grad_norm": 1.6169209480285645, + "learning_rate": 1.9935927029157897e-05, + "loss": 1.9561, + "step": 7110 + }, + { + "epoch": 0.2531330548395698, + "grad_norm": 1.6281391382217407, + "learning_rate": 1.993548332867777e-05, + "loss": 1.9774, + "step": 7120 + }, + { + "epoch": 0.2534885787929962, + "grad_norm": 1.7074105739593506, + "learning_rate": 1.99350381021697e-05, + "loss": 2.0044, + "step": 7130 + }, + { + "epoch": 0.25384410274642255, + "grad_norm": 1.6432868242263794, + "learning_rate": 1.9934591349702068e-05, + "loss": 2.0351, + "step": 7140 + }, + { + "epoch": 0.2541996266998489, + "grad_norm": 1.6380431652069092, + "learning_rate": 1.99341430713435e-05, + "loss": 1.945, + "step": 7150 + }, + { + "epoch": 0.2545551506532753, + "grad_norm": 1.728100299835205, + "learning_rate": 1.9933693267162838e-05, + "loss": 1.9846, + "step": 7160 + }, + { + "epoch": 0.25491067460670164, + "grad_norm": 1.668286919593811, + "learning_rate": 1.9933241937229176e-05, + "loss": 1.9898, + "step": 7170 + }, + { + "epoch": 0.255266198560128, + "grad_norm": 1.5646265745162964, + "learning_rate": 1.9932789081611838e-05, + "loss": 2.0117, + "step": 7180 + }, + { + "epoch": 0.25562172251355436, + "grad_norm": 1.7030260562896729, + "learning_rate": 1.9932334700380375e-05, + "loss": 2.0039, + "step": 7190 + }, + { + "epoch": 0.2559772464669807, + "grad_norm": 1.6385523080825806, + "learning_rate": 1.9931878793604577e-05, + "loss": 1.9967, + "step": 7200 + }, + { + "epoch": 0.2563327704204071, + "grad_norm": 1.6005942821502686, + "learning_rate": 1.993142136135447e-05, + "loss": 1.9734, + "step": 7210 + }, + { + "epoch": 0.25668829437383345, + "grad_norm": 1.6573022603988647, + "learning_rate": 1.9930962403700313e-05, + "loss": 2.0014, + "step": 7220 + }, + { + "epoch": 0.2570438183272598, + "grad_norm": 1.624893069267273, + "learning_rate": 1.9930501920712597e-05, + "loss": 1.9705, + "step": 7230 + }, + { + "epoch": 0.25739934228068617, + "grad_norm": 1.571858525276184, + "learning_rate": 1.9930039912462052e-05, + "loss": 2.033, + "step": 7240 + }, + { + "epoch": 0.25775486623411253, + "grad_norm": 1.672079086303711, + "learning_rate": 1.9929576379019638e-05, + "loss": 1.9694, + "step": 7250 + }, + { + "epoch": 0.2581103901875389, + "grad_norm": 1.7197091579437256, + "learning_rate": 1.992911132045655e-05, + "loss": 2.0, + "step": 7260 + }, + { + "epoch": 0.25846591414096526, + "grad_norm": 1.6931955814361572, + "learning_rate": 1.992864473684422e-05, + "loss": 1.9984, + "step": 7270 + }, + { + "epoch": 0.2588214380943916, + "grad_norm": 1.7102130651474, + "learning_rate": 1.9928176628254313e-05, + "loss": 1.9194, + "step": 7280 + }, + { + "epoch": 0.259176962047818, + "grad_norm": 1.6537212133407593, + "learning_rate": 1.9927706994758728e-05, + "loss": 2.0039, + "step": 7290 + }, + { + "epoch": 0.25953248600124434, + "grad_norm": 1.5721324682235718, + "learning_rate": 1.992723583642959e-05, + "loss": 1.9303, + "step": 7300 + }, + { + "epoch": 0.2598880099546707, + "grad_norm": 1.559722661972046, + "learning_rate": 1.9926763153339275e-05, + "loss": 1.9264, + "step": 7310 + }, + { + "epoch": 0.26024353390809707, + "grad_norm": 1.630336880683899, + "learning_rate": 1.992628894556038e-05, + "loss": 1.9928, + "step": 7320 + }, + { + "epoch": 0.26059905786152343, + "grad_norm": 1.6448107957839966, + "learning_rate": 1.9925813213165742e-05, + "loss": 1.9774, + "step": 7330 + }, + { + "epoch": 0.2609545818149498, + "grad_norm": 1.5473449230194092, + "learning_rate": 1.9925335956228426e-05, + "loss": 1.9798, + "step": 7340 + }, + { + "epoch": 0.26131010576837616, + "grad_norm": 1.6154736280441284, + "learning_rate": 1.9924857174821744e-05, + "loss": 1.9911, + "step": 7350 + }, + { + "epoch": 0.2616656297218025, + "grad_norm": 1.7111319303512573, + "learning_rate": 1.992437686901923e-05, + "loss": 1.9627, + "step": 7360 + }, + { + "epoch": 0.2620211536752289, + "grad_norm": 1.7432341575622559, + "learning_rate": 1.9923895038894652e-05, + "loss": 2.0037, + "step": 7370 + }, + { + "epoch": 0.26237667762865524, + "grad_norm": 1.6998772621154785, + "learning_rate": 1.992341168452202e-05, + "loss": 1.9835, + "step": 7380 + }, + { + "epoch": 0.2627322015820816, + "grad_norm": 1.6501591205596924, + "learning_rate": 1.9922926805975576e-05, + "loss": 2.0224, + "step": 7390 + }, + { + "epoch": 0.26308772553550797, + "grad_norm": 1.7000082731246948, + "learning_rate": 1.9922440403329788e-05, + "loss": 1.9953, + "step": 7400 + }, + { + "epoch": 0.26344324948893433, + "grad_norm": 1.7175724506378174, + "learning_rate": 1.992195247665937e-05, + "loss": 1.9985, + "step": 7410 + }, + { + "epoch": 0.2637987734423607, + "grad_norm": 1.661278486251831, + "learning_rate": 1.9921463026039264e-05, + "loss": 1.9272, + "step": 7420 + }, + { + "epoch": 0.26415429739578705, + "grad_norm": 1.6486077308654785, + "learning_rate": 1.9920972051544646e-05, + "loss": 1.9973, + "step": 7430 + }, + { + "epoch": 0.2645098213492134, + "grad_norm": 1.7884916067123413, + "learning_rate": 1.9920479553250922e-05, + "loss": 1.9849, + "step": 7440 + }, + { + "epoch": 0.2648653453026398, + "grad_norm": 1.6757444143295288, + "learning_rate": 1.9919985531233743e-05, + "loss": 1.9792, + "step": 7450 + }, + { + "epoch": 0.26522086925606614, + "grad_norm": 1.592671513557434, + "learning_rate": 1.9919489985568985e-05, + "loss": 1.9994, + "step": 7460 + }, + { + "epoch": 0.2655763932094925, + "grad_norm": 1.6154669523239136, + "learning_rate": 1.991899291633276e-05, + "loss": 1.9741, + "step": 7470 + }, + { + "epoch": 0.26593191716291886, + "grad_norm": 1.6644245386123657, + "learning_rate": 1.9918494323601415e-05, + "loss": 1.9716, + "step": 7480 + }, + { + "epoch": 0.2662874411163452, + "grad_norm": 1.726770281791687, + "learning_rate": 1.9917994207451533e-05, + "loss": 2.0, + "step": 7490 + }, + { + "epoch": 0.2666429650697716, + "grad_norm": 1.72080397605896, + "learning_rate": 1.9917492567959926e-05, + "loss": 1.9882, + "step": 7500 + }, + { + "epoch": 0.26699848902319795, + "grad_norm": 1.7543189525604248, + "learning_rate": 1.9916989405203648e-05, + "loss": 1.9709, + "step": 7510 + }, + { + "epoch": 0.2673540129766243, + "grad_norm": 1.7151052951812744, + "learning_rate": 1.9916484719259973e-05, + "loss": 1.9461, + "step": 7520 + }, + { + "epoch": 0.2677095369300507, + "grad_norm": 1.6305655241012573, + "learning_rate": 1.9915978510206423e-05, + "loss": 1.984, + "step": 7530 + }, + { + "epoch": 0.26806506088347704, + "grad_norm": 1.655402660369873, + "learning_rate": 1.9915470778120746e-05, + "loss": 1.9903, + "step": 7540 + }, + { + "epoch": 0.2684205848369034, + "grad_norm": 1.6377874612808228, + "learning_rate": 1.991496152308093e-05, + "loss": 2.0417, + "step": 7550 + }, + { + "epoch": 0.26877610879032976, + "grad_norm": 1.6017521619796753, + "learning_rate": 1.991445074516519e-05, + "loss": 1.9541, + "step": 7560 + }, + { + "epoch": 0.2691316327437561, + "grad_norm": 1.6277523040771484, + "learning_rate": 1.991393844445198e-05, + "loss": 1.9803, + "step": 7570 + }, + { + "epoch": 0.2694871566971825, + "grad_norm": 1.6906298398971558, + "learning_rate": 1.9913424621019987e-05, + "loss": 2.0192, + "step": 7580 + }, + { + "epoch": 0.26984268065060885, + "grad_norm": 1.611890435218811, + "learning_rate": 1.9912909274948126e-05, + "loss": 1.9576, + "step": 7590 + }, + { + "epoch": 0.2701982046040352, + "grad_norm": 1.5919207334518433, + "learning_rate": 1.991239240631556e-05, + "loss": 2.0154, + "step": 7600 + }, + { + "epoch": 0.27055372855746157, + "grad_norm": 1.6700984239578247, + "learning_rate": 1.9911874015201667e-05, + "loss": 1.9696, + "step": 7610 + }, + { + "epoch": 0.27090925251088793, + "grad_norm": 1.6968141794204712, + "learning_rate": 1.9911354101686076e-05, + "loss": 1.9666, + "step": 7620 + }, + { + "epoch": 0.2712647764643143, + "grad_norm": 1.6996207237243652, + "learning_rate": 1.991083266584864e-05, + "loss": 2.0248, + "step": 7630 + }, + { + "epoch": 0.27162030041774066, + "grad_norm": 1.5757447481155396, + "learning_rate": 1.9910309707769448e-05, + "loss": 2.0071, + "step": 7640 + }, + { + "epoch": 0.271975824371167, + "grad_norm": 1.7433087825775146, + "learning_rate": 1.990978522752882e-05, + "loss": 2.0058, + "step": 7650 + }, + { + "epoch": 0.2723313483245934, + "grad_norm": 1.6996994018554688, + "learning_rate": 1.9909259225207318e-05, + "loss": 1.9921, + "step": 7660 + }, + { + "epoch": 0.27268687227801974, + "grad_norm": 1.8098211288452148, + "learning_rate": 1.990873170088573e-05, + "loss": 1.959, + "step": 7670 + }, + { + "epoch": 0.2730423962314461, + "grad_norm": 1.6081329584121704, + "learning_rate": 1.9908202654645082e-05, + "loss": 1.9493, + "step": 7680 + }, + { + "epoch": 0.27339792018487247, + "grad_norm": 1.668576717376709, + "learning_rate": 1.990767208656663e-05, + "loss": 1.9601, + "step": 7690 + }, + { + "epoch": 0.27375344413829883, + "grad_norm": 1.720616102218628, + "learning_rate": 1.9907139996731866e-05, + "loss": 1.9961, + "step": 7700 + }, + { + "epoch": 0.2741089680917252, + "grad_norm": 1.6592384576797485, + "learning_rate": 1.990660638522252e-05, + "loss": 1.9922, + "step": 7710 + }, + { + "epoch": 0.27446449204515155, + "grad_norm": 1.6151938438415527, + "learning_rate": 1.9906071252120546e-05, + "loss": 1.9841, + "step": 7720 + }, + { + "epoch": 0.2748200159985779, + "grad_norm": 1.651526689529419, + "learning_rate": 1.990553459750814e-05, + "loss": 1.9546, + "step": 7730 + }, + { + "epoch": 0.2751755399520043, + "grad_norm": 1.7546813488006592, + "learning_rate": 1.9904996421467727e-05, + "loss": 1.9433, + "step": 7740 + }, + { + "epoch": 0.27553106390543064, + "grad_norm": 1.8067822456359863, + "learning_rate": 1.990445672408197e-05, + "loss": 1.9861, + "step": 7750 + }, + { + "epoch": 0.275886587858857, + "grad_norm": 1.693223237991333, + "learning_rate": 1.990391550543376e-05, + "loss": 1.9519, + "step": 7760 + }, + { + "epoch": 0.27624211181228336, + "grad_norm": 1.5922542810440063, + "learning_rate": 1.9903372765606227e-05, + "loss": 1.957, + "step": 7770 + }, + { + "epoch": 0.2765976357657097, + "grad_norm": 1.715636134147644, + "learning_rate": 1.9902828504682733e-05, + "loss": 1.9941, + "step": 7780 + }, + { + "epoch": 0.2769531597191361, + "grad_norm": 1.71389639377594, + "learning_rate": 1.990228272274687e-05, + "loss": 1.9925, + "step": 7790 + }, + { + "epoch": 0.27730868367256245, + "grad_norm": 1.6671241521835327, + "learning_rate": 1.9901735419882467e-05, + "loss": 1.9709, + "step": 7800 + }, + { + "epoch": 0.2776642076259888, + "grad_norm": 1.6671831607818604, + "learning_rate": 1.9901186596173593e-05, + "loss": 1.9648, + "step": 7810 + }, + { + "epoch": 0.2780197315794152, + "grad_norm": 1.6454179286956787, + "learning_rate": 1.9900636251704537e-05, + "loss": 1.9815, + "step": 7820 + }, + { + "epoch": 0.27837525553284154, + "grad_norm": 1.6738249063491821, + "learning_rate": 1.990008438655983e-05, + "loss": 1.9861, + "step": 7830 + }, + { + "epoch": 0.2787307794862679, + "grad_norm": 1.6551613807678223, + "learning_rate": 1.9899531000824234e-05, + "loss": 1.916, + "step": 7840 + }, + { + "epoch": 0.27908630343969426, + "grad_norm": 1.6680654287338257, + "learning_rate": 1.9898976094582746e-05, + "loss": 2.0078, + "step": 7850 + }, + { + "epoch": 0.2794418273931206, + "grad_norm": 1.69269597530365, + "learning_rate": 1.9898419667920598e-05, + "loss": 1.9147, + "step": 7860 + }, + { + "epoch": 0.279797351346547, + "grad_norm": 1.6375586986541748, + "learning_rate": 1.9897861720923255e-05, + "loss": 1.9521, + "step": 7870 + }, + { + "epoch": 0.28015287529997335, + "grad_norm": 1.6596815586090088, + "learning_rate": 1.989730225367641e-05, + "loss": 1.9801, + "step": 7880 + }, + { + "epoch": 0.2805083992533997, + "grad_norm": 1.649316668510437, + "learning_rate": 1.9896741266265994e-05, + "loss": 1.924, + "step": 7890 + }, + { + "epoch": 0.28086392320682607, + "grad_norm": 1.636256217956543, + "learning_rate": 1.9896178758778173e-05, + "loss": 2.0577, + "step": 7900 + }, + { + "epoch": 0.28121944716025243, + "grad_norm": 1.656315565109253, + "learning_rate": 1.9895614731299344e-05, + "loss": 1.9554, + "step": 7910 + }, + { + "epoch": 0.2815749711136788, + "grad_norm": 1.6406137943267822, + "learning_rate": 1.989504918391614e-05, + "loss": 1.969, + "step": 7920 + }, + { + "epoch": 0.28193049506710516, + "grad_norm": 1.7492715120315552, + "learning_rate": 1.9894482116715422e-05, + "loss": 2.0121, + "step": 7930 + }, + { + "epoch": 0.2822860190205315, + "grad_norm": 1.7002264261245728, + "learning_rate": 1.9893913529784294e-05, + "loss": 1.9895, + "step": 7940 + }, + { + "epoch": 0.2826415429739579, + "grad_norm": 1.7042263746261597, + "learning_rate": 1.9893343423210077e-05, + "loss": 1.9873, + "step": 7950 + }, + { + "epoch": 0.28299706692738424, + "grad_norm": 1.6575679779052734, + "learning_rate": 1.9892771797080342e-05, + "loss": 2.017, + "step": 7960 + }, + { + "epoch": 0.2833525908808106, + "grad_norm": 1.537338137626648, + "learning_rate": 1.989219865148289e-05, + "loss": 1.9702, + "step": 7970 + }, + { + "epoch": 0.28370811483423697, + "grad_norm": 1.6482051610946655, + "learning_rate": 1.9891623986505752e-05, + "loss": 1.9639, + "step": 7980 + }, + { + "epoch": 0.28406363878766333, + "grad_norm": 1.6478086709976196, + "learning_rate": 1.9891047802237188e-05, + "loss": 2.0201, + "step": 7990 + }, + { + "epoch": 0.2844191627410897, + "grad_norm": 1.7550305128097534, + "learning_rate": 1.98904700987657e-05, + "loss": 1.9951, + "step": 8000 + }, + { + "epoch": 0.28477468669451605, + "grad_norm": 1.7667663097381592, + "learning_rate": 1.9889890876180015e-05, + "loss": 1.9867, + "step": 8010 + }, + { + "epoch": 0.2851302106479424, + "grad_norm": 1.6240298748016357, + "learning_rate": 1.9889310134569104e-05, + "loss": 1.9708, + "step": 8020 + }, + { + "epoch": 0.2854857346013688, + "grad_norm": 1.6603147983551025, + "learning_rate": 1.9888727874022163e-05, + "loss": 1.9523, + "step": 8030 + }, + { + "epoch": 0.28584125855479514, + "grad_norm": 1.6123783588409424, + "learning_rate": 1.9888144094628624e-05, + "loss": 1.9703, + "step": 8040 + }, + { + "epoch": 0.2861967825082215, + "grad_norm": 1.8504589796066284, + "learning_rate": 1.988755879647815e-05, + "loss": 1.9828, + "step": 8050 + }, + { + "epoch": 0.28655230646164787, + "grad_norm": 1.5599009990692139, + "learning_rate": 1.988697197966064e-05, + "loss": 1.9852, + "step": 8060 + }, + { + "epoch": 0.2869078304150742, + "grad_norm": 1.671850562095642, + "learning_rate": 1.9886383644266226e-05, + "loss": 1.9513, + "step": 8070 + }, + { + "epoch": 0.2872633543685006, + "grad_norm": 1.6670169830322266, + "learning_rate": 1.9885793790385274e-05, + "loss": 1.9426, + "step": 8080 + }, + { + "epoch": 0.28761887832192695, + "grad_norm": 1.6796684265136719, + "learning_rate": 1.988520241810838e-05, + "loss": 1.9954, + "step": 8090 + }, + { + "epoch": 0.2879744022753533, + "grad_norm": 1.6433721780776978, + "learning_rate": 1.9884609527526374e-05, + "loss": 2.0219, + "step": 8100 + }, + { + "epoch": 0.2883299262287797, + "grad_norm": 1.6410399675369263, + "learning_rate": 1.988401511873032e-05, + "loss": 1.9721, + "step": 8110 + }, + { + "epoch": 0.28868545018220604, + "grad_norm": 1.6350502967834473, + "learning_rate": 1.988341919181152e-05, + "loss": 1.9548, + "step": 8120 + }, + { + "epoch": 0.2890409741356324, + "grad_norm": 1.6690397262573242, + "learning_rate": 1.98828217468615e-05, + "loss": 1.9407, + "step": 8130 + }, + { + "epoch": 0.28939649808905876, + "grad_norm": 1.7214834690093994, + "learning_rate": 1.9882222783972026e-05, + "loss": 1.953, + "step": 8140 + }, + { + "epoch": 0.2897520220424851, + "grad_norm": 1.700016975402832, + "learning_rate": 1.9881622303235094e-05, + "loss": 2.031, + "step": 8150 + }, + { + "epoch": 0.2901075459959115, + "grad_norm": 1.659131407737732, + "learning_rate": 1.9881020304742935e-05, + "loss": 1.9669, + "step": 8160 + }, + { + "epoch": 0.29046306994933785, + "grad_norm": 1.6631243228912354, + "learning_rate": 1.9880416788588013e-05, + "loss": 1.989, + "step": 8170 + }, + { + "epoch": 0.2908185939027642, + "grad_norm": 1.6930702924728394, + "learning_rate": 1.9879811754863022e-05, + "loss": 1.9386, + "step": 8180 + }, + { + "epoch": 0.2911741178561906, + "grad_norm": 1.6611827611923218, + "learning_rate": 1.987920520366089e-05, + "loss": 1.9319, + "step": 8190 + }, + { + "epoch": 0.29152964180961694, + "grad_norm": 1.7308286428451538, + "learning_rate": 1.9878597135074784e-05, + "loss": 1.9844, + "step": 8200 + }, + { + "epoch": 0.2918851657630433, + "grad_norm": 1.7405385971069336, + "learning_rate": 1.9877987549198097e-05, + "loss": 2.0176, + "step": 8210 + }, + { + "epoch": 0.29224068971646966, + "grad_norm": 1.5844897031784058, + "learning_rate": 1.987737644612446e-05, + "loss": 1.9362, + "step": 8220 + }, + { + "epoch": 0.292596213669896, + "grad_norm": 1.6568342447280884, + "learning_rate": 1.987676382594773e-05, + "loss": 1.9755, + "step": 8230 + }, + { + "epoch": 0.2929517376233224, + "grad_norm": 1.615202784538269, + "learning_rate": 1.9876149688762e-05, + "loss": 1.9943, + "step": 8240 + }, + { + "epoch": 0.29330726157674875, + "grad_norm": 1.6444897651672363, + "learning_rate": 1.9875534034661607e-05, + "loss": 2.0033, + "step": 8250 + }, + { + "epoch": 0.2936627855301751, + "grad_norm": 1.6145836114883423, + "learning_rate": 1.9874916863741108e-05, + "loss": 1.9547, + "step": 8260 + }, + { + "epoch": 0.29401830948360147, + "grad_norm": 1.6955664157867432, + "learning_rate": 1.9874298176095292e-05, + "loss": 1.9754, + "step": 8270 + }, + { + "epoch": 0.29437383343702783, + "grad_norm": 1.7114002704620361, + "learning_rate": 1.987367797181919e-05, + "loss": 1.9632, + "step": 8280 + }, + { + "epoch": 0.2947293573904542, + "grad_norm": 1.7069765329360962, + "learning_rate": 1.9873056251008057e-05, + "loss": 1.945, + "step": 8290 + }, + { + "epoch": 0.29508488134388056, + "grad_norm": 1.728232979774475, + "learning_rate": 1.987243301375739e-05, + "loss": 1.9799, + "step": 8300 + }, + { + "epoch": 0.2954404052973069, + "grad_norm": 1.684863567352295, + "learning_rate": 1.9871808260162914e-05, + "loss": 1.984, + "step": 8310 + }, + { + "epoch": 0.2957959292507333, + "grad_norm": 1.7075393199920654, + "learning_rate": 1.9871181990320586e-05, + "loss": 1.9602, + "step": 8320 + }, + { + "epoch": 0.29615145320415964, + "grad_norm": 1.620227575302124, + "learning_rate": 1.9870554204326597e-05, + "loss": 1.9698, + "step": 8330 + }, + { + "epoch": 0.296506977157586, + "grad_norm": 1.690664529800415, + "learning_rate": 1.9869924902277368e-05, + "loss": 1.996, + "step": 8340 + }, + { + "epoch": 0.29686250111101237, + "grad_norm": 1.6752780675888062, + "learning_rate": 1.9869294084269563e-05, + "loss": 1.9374, + "step": 8350 + }, + { + "epoch": 0.29721802506443873, + "grad_norm": 1.6255035400390625, + "learning_rate": 1.9868661750400066e-05, + "loss": 1.9483, + "step": 8360 + }, + { + "epoch": 0.2975735490178651, + "grad_norm": 1.593761920928955, + "learning_rate": 1.9868027900765997e-05, + "loss": 1.9777, + "step": 8370 + }, + { + "epoch": 0.29792907297129145, + "grad_norm": 1.7208243608474731, + "learning_rate": 1.986739253546472e-05, + "loss": 1.9921, + "step": 8380 + }, + { + "epoch": 0.2982845969247178, + "grad_norm": 1.6406222581863403, + "learning_rate": 1.986675565459382e-05, + "loss": 1.9242, + "step": 8390 + }, + { + "epoch": 0.2986401208781442, + "grad_norm": 1.610335111618042, + "learning_rate": 1.9866117258251112e-05, + "loss": 1.9952, + "step": 8400 + }, + { + "epoch": 0.29899564483157054, + "grad_norm": 1.560603141784668, + "learning_rate": 1.986547734653466e-05, + "loss": 1.9441, + "step": 8410 + }, + { + "epoch": 0.2993511687849969, + "grad_norm": 1.7074131965637207, + "learning_rate": 1.986483591954274e-05, + "loss": 1.9832, + "step": 8420 + }, + { + "epoch": 0.29970669273842326, + "grad_norm": 1.6053872108459473, + "learning_rate": 1.9864192977373876e-05, + "loss": 1.9541, + "step": 8430 + }, + { + "epoch": 0.3000622166918496, + "grad_norm": 1.589003324508667, + "learning_rate": 1.986354852012682e-05, + "loss": 1.9578, + "step": 8440 + }, + { + "epoch": 0.300417740645276, + "grad_norm": 1.634819746017456, + "learning_rate": 1.9862902547900556e-05, + "loss": 1.9853, + "step": 8450 + }, + { + "epoch": 0.30077326459870235, + "grad_norm": 1.6535834074020386, + "learning_rate": 1.98622550607943e-05, + "loss": 1.9676, + "step": 8460 + }, + { + "epoch": 0.3011287885521287, + "grad_norm": 1.7705475091934204, + "learning_rate": 1.986160605890751e-05, + "loss": 1.9961, + "step": 8470 + }, + { + "epoch": 0.3014843125055551, + "grad_norm": 1.552779197692871, + "learning_rate": 1.9860955542339857e-05, + "loss": 1.967, + "step": 8480 + }, + { + "epoch": 0.30183983645898144, + "grad_norm": 1.7160229682922363, + "learning_rate": 1.9860303511191263e-05, + "loss": 1.9741, + "step": 8490 + }, + { + "epoch": 0.3021953604124078, + "grad_norm": 1.616591453552246, + "learning_rate": 1.9859649965561872e-05, + "loss": 1.9501, + "step": 8500 + }, + { + "epoch": 0.30255088436583416, + "grad_norm": 1.6419483423233032, + "learning_rate": 1.985899490555207e-05, + "loss": 1.9324, + "step": 8510 + }, + { + "epoch": 0.3029064083192605, + "grad_norm": 1.7029931545257568, + "learning_rate": 1.9858338331262463e-05, + "loss": 1.9497, + "step": 8520 + }, + { + "epoch": 0.3032619322726869, + "grad_norm": 1.6598072052001953, + "learning_rate": 1.9857680242793903e-05, + "loss": 1.9854, + "step": 8530 + }, + { + "epoch": 0.30361745622611325, + "grad_norm": 1.6088287830352783, + "learning_rate": 1.9857020640247466e-05, + "loss": 1.9934, + "step": 8540 + }, + { + "epoch": 0.3039729801795396, + "grad_norm": 1.561793565750122, + "learning_rate": 1.9856359523724462e-05, + "loss": 1.9441, + "step": 8550 + }, + { + "epoch": 0.30432850413296597, + "grad_norm": 1.61014986038208, + "learning_rate": 1.9855696893326435e-05, + "loss": 1.9602, + "step": 8560 + }, + { + "epoch": 0.30468402808639233, + "grad_norm": 1.710195541381836, + "learning_rate": 1.9855032749155163e-05, + "loss": 1.9865, + "step": 8570 + }, + { + "epoch": 0.3050395520398187, + "grad_norm": 1.6399401426315308, + "learning_rate": 1.985436709131265e-05, + "loss": 1.968, + "step": 8580 + }, + { + "epoch": 0.30539507599324506, + "grad_norm": 1.6504136323928833, + "learning_rate": 1.985369991990114e-05, + "loss": 1.9412, + "step": 8590 + }, + { + "epoch": 0.3057505999466714, + "grad_norm": 1.6371665000915527, + "learning_rate": 1.9853031235023104e-05, + "loss": 2.036, + "step": 8600 + }, + { + "epoch": 0.3061061239000978, + "grad_norm": 1.5760153532028198, + "learning_rate": 1.985236103678125e-05, + "loss": 1.9256, + "step": 8610 + }, + { + "epoch": 0.30646164785352414, + "grad_norm": 1.683082103729248, + "learning_rate": 1.9851689325278516e-05, + "loss": 1.959, + "step": 8620 + }, + { + "epoch": 0.3068171718069505, + "grad_norm": 1.6927025318145752, + "learning_rate": 1.9851016100618072e-05, + "loss": 1.9749, + "step": 8630 + }, + { + "epoch": 0.30717269576037687, + "grad_norm": 1.7396620512008667, + "learning_rate": 1.9850341362903322e-05, + "loss": 1.9904, + "step": 8640 + }, + { + "epoch": 0.30752821971380323, + "grad_norm": 1.6537268161773682, + "learning_rate": 1.9849665112237898e-05, + "loss": 1.9988, + "step": 8650 + }, + { + "epoch": 0.3078837436672296, + "grad_norm": 1.6655324697494507, + "learning_rate": 1.9848987348725674e-05, + "loss": 2.0425, + "step": 8660 + }, + { + "epoch": 0.30823926762065595, + "grad_norm": 1.8228856325149536, + "learning_rate": 1.9848308072470746e-05, + "loss": 1.9687, + "step": 8670 + }, + { + "epoch": 0.3085947915740823, + "grad_norm": 1.6344400644302368, + "learning_rate": 1.9847627283577446e-05, + "loss": 1.9623, + "step": 8680 + }, + { + "epoch": 0.3089503155275087, + "grad_norm": 1.5347328186035156, + "learning_rate": 1.984694498215034e-05, + "loss": 1.9687, + "step": 8690 + }, + { + "epoch": 0.30930583948093504, + "grad_norm": 1.6126179695129395, + "learning_rate": 1.984626116829423e-05, + "loss": 1.9642, + "step": 8700 + }, + { + "epoch": 0.3096613634343614, + "grad_norm": 1.7136037349700928, + "learning_rate": 1.984557584211414e-05, + "loss": 1.9823, + "step": 8710 + }, + { + "epoch": 0.31001688738778777, + "grad_norm": 1.6425186395645142, + "learning_rate": 1.9844889003715327e-05, + "loss": 1.9147, + "step": 8720 + }, + { + "epoch": 0.3103724113412141, + "grad_norm": 1.6839289665222168, + "learning_rate": 1.9844200653203293e-05, + "loss": 1.9372, + "step": 8730 + }, + { + "epoch": 0.3107279352946405, + "grad_norm": 1.6468852758407593, + "learning_rate": 1.9843510790683767e-05, + "loss": 1.9748, + "step": 8740 + }, + { + "epoch": 0.31108345924806685, + "grad_norm": 1.6727285385131836, + "learning_rate": 1.9842819416262698e-05, + "loss": 1.9462, + "step": 8750 + }, + { + "epoch": 0.3114389832014932, + "grad_norm": 1.7718051671981812, + "learning_rate": 1.9842126530046285e-05, + "loss": 1.8897, + "step": 8760 + }, + { + "epoch": 0.3117945071549196, + "grad_norm": 1.708163857460022, + "learning_rate": 1.9841432132140945e-05, + "loss": 1.9546, + "step": 8770 + }, + { + "epoch": 0.31215003110834594, + "grad_norm": 1.61192786693573, + "learning_rate": 1.9840736222653337e-05, + "loss": 1.9602, + "step": 8780 + }, + { + "epoch": 0.3125055550617723, + "grad_norm": 1.5967910289764404, + "learning_rate": 1.9840038801690346e-05, + "loss": 1.995, + "step": 8790 + }, + { + "epoch": 0.31286107901519866, + "grad_norm": 1.7376841306686401, + "learning_rate": 1.9839339869359094e-05, + "loss": 1.929, + "step": 8800 + }, + { + "epoch": 0.313216602968625, + "grad_norm": 1.6677132844924927, + "learning_rate": 1.9838639425766932e-05, + "loss": 1.9763, + "step": 8810 + }, + { + "epoch": 0.3135721269220514, + "grad_norm": 1.5603753328323364, + "learning_rate": 1.9837937471021442e-05, + "loss": 1.9691, + "step": 8820 + }, + { + "epoch": 0.31392765087547775, + "grad_norm": 1.7010904550552368, + "learning_rate": 1.9837234005230442e-05, + "loss": 1.9976, + "step": 8830 + }, + { + "epoch": 0.3142831748289041, + "grad_norm": 1.6840176582336426, + "learning_rate": 1.9836529028501976e-05, + "loss": 1.9502, + "step": 8840 + }, + { + "epoch": 0.3146386987823305, + "grad_norm": 1.6860710382461548, + "learning_rate": 1.9835822540944328e-05, + "loss": 1.951, + "step": 8850 + }, + { + "epoch": 0.31499422273575683, + "grad_norm": 1.7188667058944702, + "learning_rate": 1.983511454266601e-05, + "loss": 1.9385, + "step": 8860 + }, + { + "epoch": 0.3153497466891832, + "grad_norm": 1.562816858291626, + "learning_rate": 1.9834405033775762e-05, + "loss": 1.9437, + "step": 8870 + }, + { + "epoch": 0.31570527064260956, + "grad_norm": 1.6143077611923218, + "learning_rate": 1.9833694014382565e-05, + "loss": 1.9291, + "step": 8880 + }, + { + "epoch": 0.3160607945960359, + "grad_norm": 1.7434130907058716, + "learning_rate": 1.9832981484595626e-05, + "loss": 1.9641, + "step": 8890 + }, + { + "epoch": 0.3164163185494623, + "grad_norm": 1.656266450881958, + "learning_rate": 1.983226744452438e-05, + "loss": 1.9055, + "step": 8900 + }, + { + "epoch": 0.31677184250288865, + "grad_norm": 1.6229407787322998, + "learning_rate": 1.9831551894278508e-05, + "loss": 2.0194, + "step": 8910 + }, + { + "epoch": 0.317127366456315, + "grad_norm": 1.568166732788086, + "learning_rate": 1.9830834833967907e-05, + "loss": 1.944, + "step": 8920 + }, + { + "epoch": 0.31748289040974137, + "grad_norm": 1.6000345945358276, + "learning_rate": 1.9830116263702717e-05, + "loss": 1.9643, + "step": 8930 + }, + { + "epoch": 0.31783841436316773, + "grad_norm": 1.6643948554992676, + "learning_rate": 1.9829396183593303e-05, + "loss": 1.9434, + "step": 8940 + }, + { + "epoch": 0.3181939383165941, + "grad_norm": 1.6422382593154907, + "learning_rate": 1.9828674593750266e-05, + "loss": 1.9447, + "step": 8950 + }, + { + "epoch": 0.31854946227002046, + "grad_norm": 1.6364524364471436, + "learning_rate": 1.982795149428444e-05, + "loss": 1.9909, + "step": 8960 + }, + { + "epoch": 0.3189049862234468, + "grad_norm": 1.5950331687927246, + "learning_rate": 1.9827226885306883e-05, + "loss": 1.9634, + "step": 8970 + }, + { + "epoch": 0.3192605101768732, + "grad_norm": 1.6469675302505493, + "learning_rate": 1.9826500766928896e-05, + "loss": 1.9507, + "step": 8980 + }, + { + "epoch": 0.31961603413029954, + "grad_norm": 1.7085630893707275, + "learning_rate": 1.9825773139262e-05, + "loss": 1.9463, + "step": 8990 + }, + { + "epoch": 0.3199715580837259, + "grad_norm": 1.7127751111984253, + "learning_rate": 1.9825044002417962e-05, + "loss": 1.9602, + "step": 9000 + }, + { + "epoch": 0.32032708203715227, + "grad_norm": 1.6652568578720093, + "learning_rate": 1.9824313356508766e-05, + "loss": 1.9677, + "step": 9010 + }, + { + "epoch": 0.32068260599057863, + "grad_norm": 1.617638349533081, + "learning_rate": 1.9823581201646638e-05, + "loss": 1.9952, + "step": 9020 + }, + { + "epoch": 0.321038129944005, + "grad_norm": 1.6622306108474731, + "learning_rate": 1.9822847537944033e-05, + "loss": 1.917, + "step": 9030 + }, + { + "epoch": 0.32139365389743135, + "grad_norm": 3.469229221343994, + "learning_rate": 1.9822112365513633e-05, + "loss": 1.9613, + "step": 9040 + }, + { + "epoch": 0.3217491778508577, + "grad_norm": 1.6523138284683228, + "learning_rate": 1.9821375684468363e-05, + "loss": 1.9621, + "step": 9050 + }, + { + "epoch": 0.3221047018042841, + "grad_norm": 1.6972028017044067, + "learning_rate": 1.9820637494921367e-05, + "loss": 1.9565, + "step": 9060 + }, + { + "epoch": 0.32246022575771044, + "grad_norm": 1.6572128534317017, + "learning_rate": 1.9819897796986027e-05, + "loss": 1.9616, + "step": 9070 + }, + { + "epoch": 0.3228157497111368, + "grad_norm": 1.641320824623108, + "learning_rate": 1.9819156590775958e-05, + "loss": 1.9568, + "step": 9080 + }, + { + "epoch": 0.32317127366456316, + "grad_norm": 1.7216322422027588, + "learning_rate": 1.9818413876405e-05, + "loss": 1.9556, + "step": 9090 + }, + { + "epoch": 0.3235267976179895, + "grad_norm": 1.5993764400482178, + "learning_rate": 1.9817669653987234e-05, + "loss": 1.9693, + "step": 9100 + }, + { + "epoch": 0.3238823215714159, + "grad_norm": 1.6592811346054077, + "learning_rate": 1.9816923923636967e-05, + "loss": 1.9442, + "step": 9110 + }, + { + "epoch": 0.32423784552484225, + "grad_norm": 1.6523818969726562, + "learning_rate": 1.981617668546874e-05, + "loss": 1.9406, + "step": 9120 + }, + { + "epoch": 0.3245933694782686, + "grad_norm": 1.6113784313201904, + "learning_rate": 1.9815427939597318e-05, + "loss": 1.9877, + "step": 9130 + }, + { + "epoch": 0.324948893431695, + "grad_norm": 1.7487437725067139, + "learning_rate": 1.981467768613771e-05, + "loss": 1.9455, + "step": 9140 + }, + { + "epoch": 0.32530441738512134, + "grad_norm": 1.6923714876174927, + "learning_rate": 1.9813925925205146e-05, + "loss": 1.9662, + "step": 9150 + }, + { + "epoch": 0.3256599413385477, + "grad_norm": 1.700963020324707, + "learning_rate": 1.9813172656915096e-05, + "loss": 1.911, + "step": 9160 + }, + { + "epoch": 0.32601546529197406, + "grad_norm": 1.6489852666854858, + "learning_rate": 1.9812417881383254e-05, + "loss": 1.9495, + "step": 9170 + }, + { + "epoch": 0.3263709892454004, + "grad_norm": 1.7082645893096924, + "learning_rate": 1.981166159872555e-05, + "loss": 1.9882, + "step": 9180 + }, + { + "epoch": 0.3267265131988268, + "grad_norm": 1.6140416860580444, + "learning_rate": 1.981090380905814e-05, + "loss": 1.9462, + "step": 9190 + }, + { + "epoch": 0.32708203715225315, + "grad_norm": 1.6398372650146484, + "learning_rate": 1.9810144512497426e-05, + "loss": 1.9147, + "step": 9200 + }, + { + "epoch": 0.3274375611056795, + "grad_norm": 1.6099936962127686, + "learning_rate": 1.9809383709160023e-05, + "loss": 1.978, + "step": 9210 + }, + { + "epoch": 0.32779308505910587, + "grad_norm": 1.6432914733886719, + "learning_rate": 1.980862139916279e-05, + "loss": 1.9342, + "step": 9220 + }, + { + "epoch": 0.32814860901253223, + "grad_norm": 1.7044553756713867, + "learning_rate": 1.9807857582622803e-05, + "loss": 1.9318, + "step": 9230 + }, + { + "epoch": 0.3285041329659586, + "grad_norm": 1.5483976602554321, + "learning_rate": 1.9807092259657395e-05, + "loss": 1.9708, + "step": 9240 + }, + { + "epoch": 0.32885965691938496, + "grad_norm": 1.7499970197677612, + "learning_rate": 1.9806325430384104e-05, + "loss": 1.9595, + "step": 9250 + }, + { + "epoch": 0.3292151808728113, + "grad_norm": 1.6411001682281494, + "learning_rate": 1.9805557094920712e-05, + "loss": 1.926, + "step": 9260 + }, + { + "epoch": 0.3295707048262377, + "grad_norm": 1.6664100885391235, + "learning_rate": 1.9804787253385232e-05, + "loss": 1.9795, + "step": 9270 + }, + { + "epoch": 0.32992622877966404, + "grad_norm": 1.6018718481063843, + "learning_rate": 1.9804015905895906e-05, + "loss": 1.9423, + "step": 9280 + }, + { + "epoch": 0.3302817527330904, + "grad_norm": 1.580801010131836, + "learning_rate": 1.980324305257121e-05, + "loss": 1.9469, + "step": 9290 + }, + { + "epoch": 0.33063727668651677, + "grad_norm": 1.6953452825546265, + "learning_rate": 1.9802468693529847e-05, + "loss": 1.9624, + "step": 9300 + }, + { + "epoch": 0.33099280063994313, + "grad_norm": 1.7434360980987549, + "learning_rate": 1.9801692828890754e-05, + "loss": 1.9296, + "step": 9310 + }, + { + "epoch": 0.3313483245933695, + "grad_norm": 1.6928406953811646, + "learning_rate": 1.98009154587731e-05, + "loss": 1.9109, + "step": 9320 + }, + { + "epoch": 0.33170384854679585, + "grad_norm": 1.6208775043487549, + "learning_rate": 1.9800136583296282e-05, + "loss": 1.9515, + "step": 9330 + }, + { + "epoch": 0.3320593725002222, + "grad_norm": 1.5621469020843506, + "learning_rate": 1.9799356202579935e-05, + "loss": 1.974, + "step": 9340 + }, + { + "epoch": 0.3324148964536486, + "grad_norm": 1.7981388568878174, + "learning_rate": 1.9798574316743918e-05, + "loss": 1.9863, + "step": 9350 + }, + { + "epoch": 0.33277042040707494, + "grad_norm": 1.6251649856567383, + "learning_rate": 1.979779092590832e-05, + "loss": 1.9512, + "step": 9360 + }, + { + "epoch": 0.3331259443605013, + "grad_norm": 1.5817049741744995, + "learning_rate": 1.9797006030193472e-05, + "loss": 1.9169, + "step": 9370 + }, + { + "epoch": 0.33348146831392766, + "grad_norm": 1.6933389902114868, + "learning_rate": 1.9796219629719923e-05, + "loss": 1.9411, + "step": 9380 + }, + { + "epoch": 0.333836992267354, + "grad_norm": 1.6590707302093506, + "learning_rate": 1.9795431724608465e-05, + "loss": 1.9705, + "step": 9390 + }, + { + "epoch": 0.3341925162207804, + "grad_norm": 1.6322592496871948, + "learning_rate": 1.9794642314980108e-05, + "loss": 1.9743, + "step": 9400 + }, + { + "epoch": 0.33454804017420675, + "grad_norm": 1.6067534685134888, + "learning_rate": 1.9793851400956106e-05, + "loss": 1.9444, + "step": 9410 + }, + { + "epoch": 0.3349035641276331, + "grad_norm": 1.6392176151275635, + "learning_rate": 1.979305898265794e-05, + "loss": 1.9448, + "step": 9420 + }, + { + "epoch": 0.3352590880810595, + "grad_norm": 1.462730884552002, + "learning_rate": 1.9792265060207313e-05, + "loss": 1.9329, + "step": 9430 + }, + { + "epoch": 0.33561461203448584, + "grad_norm": 1.507010817527771, + "learning_rate": 1.979146963372617e-05, + "loss": 1.9374, + "step": 9440 + }, + { + "epoch": 0.3359701359879122, + "grad_norm": 1.7325148582458496, + "learning_rate": 1.9790672703336688e-05, + "loss": 1.9483, + "step": 9450 + }, + { + "epoch": 0.33632565994133856, + "grad_norm": 1.5408869981765747, + "learning_rate": 1.9789874269161264e-05, + "loss": 1.9503, + "step": 9460 + }, + { + "epoch": 0.3366811838947649, + "grad_norm": 1.561712622642517, + "learning_rate": 1.9789074331322537e-05, + "loss": 1.9847, + "step": 9470 + }, + { + "epoch": 0.3370367078481913, + "grad_norm": 1.731758713722229, + "learning_rate": 1.978827288994337e-05, + "loss": 1.9415, + "step": 9480 + }, + { + "epoch": 0.33739223180161765, + "grad_norm": 1.676243543624878, + "learning_rate": 1.978746994514686e-05, + "loss": 1.9787, + "step": 9490 + }, + { + "epoch": 0.337747755755044, + "grad_norm": 1.5879807472229004, + "learning_rate": 1.978666549705633e-05, + "loss": 1.9606, + "step": 9500 + }, + { + "epoch": 0.3381032797084704, + "grad_norm": 1.629159927368164, + "learning_rate": 1.978585954579535e-05, + "loss": 1.9389, + "step": 9510 + }, + { + "epoch": 0.33845880366189673, + "grad_norm": 1.5797783136367798, + "learning_rate": 1.9785052091487697e-05, + "loss": 1.9321, + "step": 9520 + }, + { + "epoch": 0.3388143276153231, + "grad_norm": 1.6011348962783813, + "learning_rate": 1.9784243134257396e-05, + "loss": 1.9485, + "step": 9530 + }, + { + "epoch": 0.33916985156874946, + "grad_norm": 1.6593711376190186, + "learning_rate": 1.9783432674228697e-05, + "loss": 1.9338, + "step": 9540 + }, + { + "epoch": 0.3395253755221758, + "grad_norm": 1.489780306816101, + "learning_rate": 1.9782620711526084e-05, + "loss": 1.9475, + "step": 9550 + }, + { + "epoch": 0.3398808994756022, + "grad_norm": 1.6868865489959717, + "learning_rate": 1.9781807246274264e-05, + "loss": 1.8677, + "step": 9560 + }, + { + "epoch": 0.34023642342902854, + "grad_norm": 1.5744456052780151, + "learning_rate": 1.9780992278598187e-05, + "loss": 1.9541, + "step": 9570 + }, + { + "epoch": 0.3405919473824549, + "grad_norm": 1.7402019500732422, + "learning_rate": 1.978017580862302e-05, + "loss": 1.916, + "step": 9580 + }, + { + "epoch": 0.34094747133588127, + "grad_norm": 1.706446647644043, + "learning_rate": 1.9779357836474175e-05, + "loss": 1.9769, + "step": 9590 + }, + { + "epoch": 0.34130299528930763, + "grad_norm": 1.6508241891860962, + "learning_rate": 1.977853836227728e-05, + "loss": 1.936, + "step": 9600 + }, + { + "epoch": 0.341658519242734, + "grad_norm": 1.656411051750183, + "learning_rate": 1.9777717386158203e-05, + "loss": 1.932, + "step": 9610 + }, + { + "epoch": 0.34201404319616036, + "grad_norm": 1.6659698486328125, + "learning_rate": 1.977689490824305e-05, + "loss": 1.9417, + "step": 9620 + }, + { + "epoch": 0.3423695671495867, + "grad_norm": 1.6094539165496826, + "learning_rate": 1.9776070928658134e-05, + "loss": 1.9385, + "step": 9630 + }, + { + "epoch": 0.3427250911030131, + "grad_norm": 1.5917366743087769, + "learning_rate": 1.9775245447530017e-05, + "loss": 1.9526, + "step": 9640 + }, + { + "epoch": 0.34308061505643944, + "grad_norm": 1.7756634950637817, + "learning_rate": 1.97744184649855e-05, + "loss": 1.9502, + "step": 9650 + }, + { + "epoch": 0.3434361390098658, + "grad_norm": 1.629005789756775, + "learning_rate": 1.9773589981151585e-05, + "loss": 1.9, + "step": 9660 + }, + { + "epoch": 0.34379166296329217, + "grad_norm": 1.6309583187103271, + "learning_rate": 1.9772759996155533e-05, + "loss": 1.9751, + "step": 9670 + }, + { + "epoch": 0.34414718691671853, + "grad_norm": 1.6217846870422363, + "learning_rate": 1.977192851012482e-05, + "loss": 1.9265, + "step": 9680 + }, + { + "epoch": 0.3445027108701449, + "grad_norm": 1.7747803926467896, + "learning_rate": 1.977109552318716e-05, + "loss": 1.9351, + "step": 9690 + }, + { + "epoch": 0.34485823482357125, + "grad_norm": 1.629852294921875, + "learning_rate": 1.977026103547049e-05, + "loss": 1.9203, + "step": 9700 + }, + { + "epoch": 0.3452137587769976, + "grad_norm": 1.723342776298523, + "learning_rate": 1.9769425047102986e-05, + "loss": 1.9823, + "step": 9710 + }, + { + "epoch": 0.345569282730424, + "grad_norm": 1.6114434003829956, + "learning_rate": 1.9768587558213052e-05, + "loss": 1.9757, + "step": 9720 + }, + { + "epoch": 0.34592480668385034, + "grad_norm": 1.5842297077178955, + "learning_rate": 1.976774856892932e-05, + "loss": 1.9348, + "step": 9730 + }, + { + "epoch": 0.3462803306372767, + "grad_norm": 1.6690183877944946, + "learning_rate": 1.9766908079380645e-05, + "loss": 1.9714, + "step": 9740 + }, + { + "epoch": 0.34663585459070306, + "grad_norm": 1.626365303993225, + "learning_rate": 1.976606608969613e-05, + "loss": 1.9595, + "step": 9750 + }, + { + "epoch": 0.3469913785441294, + "grad_norm": 1.6194849014282227, + "learning_rate": 1.9765222600005104e-05, + "loss": 1.918, + "step": 9760 + }, + { + "epoch": 0.3473469024975558, + "grad_norm": 1.6636567115783691, + "learning_rate": 1.9764377610437106e-05, + "loss": 1.9131, + "step": 9770 + }, + { + "epoch": 0.34770242645098215, + "grad_norm": 1.5718607902526855, + "learning_rate": 1.9763531121121937e-05, + "loss": 1.9218, + "step": 9780 + }, + { + "epoch": 0.3480579504044085, + "grad_norm": 1.650025486946106, + "learning_rate": 1.97626831321896e-05, + "loss": 1.9203, + "step": 9790 + }, + { + "epoch": 0.3484134743578349, + "grad_norm": 1.69324791431427, + "learning_rate": 1.976183364377035e-05, + "loss": 1.9502, + "step": 9800 + }, + { + "epoch": 0.34876899831126124, + "grad_norm": 1.6470222473144531, + "learning_rate": 1.976098265599466e-05, + "loss": 1.9373, + "step": 9810 + }, + { + "epoch": 0.3491245222646876, + "grad_norm": 1.5626472234725952, + "learning_rate": 1.9760130168993233e-05, + "loss": 1.9187, + "step": 9820 + }, + { + "epoch": 0.34948004621811396, + "grad_norm": 1.632422685623169, + "learning_rate": 1.9759276182897012e-05, + "loss": 1.945, + "step": 9830 + }, + { + "epoch": 0.3498355701715403, + "grad_norm": 1.637856125831604, + "learning_rate": 1.975842069783716e-05, + "loss": 1.9157, + "step": 9840 + }, + { + "epoch": 0.3501910941249667, + "grad_norm": 1.5937124490737915, + "learning_rate": 1.9757563713945075e-05, + "loss": 1.9355, + "step": 9850 + }, + { + "epoch": 0.35054661807839305, + "grad_norm": 1.7395747900009155, + "learning_rate": 1.975670523135238e-05, + "loss": 1.9387, + "step": 9860 + }, + { + "epoch": 0.3509021420318194, + "grad_norm": 1.5780001878738403, + "learning_rate": 1.975584525019094e-05, + "loss": 1.9666, + "step": 9870 + }, + { + "epoch": 0.35125766598524577, + "grad_norm": 1.6015253067016602, + "learning_rate": 1.975498377059284e-05, + "loss": 1.9587, + "step": 9880 + }, + { + "epoch": 0.35161318993867213, + "grad_norm": 1.6278748512268066, + "learning_rate": 1.9754120792690392e-05, + "loss": 1.963, + "step": 9890 + }, + { + "epoch": 0.3519687138920985, + "grad_norm": 1.7236615419387817, + "learning_rate": 1.975325631661615e-05, + "loss": 1.9142, + "step": 9900 + }, + { + "epoch": 0.35232423784552486, + "grad_norm": 1.7599778175354004, + "learning_rate": 1.9752390342502895e-05, + "loss": 1.9628, + "step": 9910 + }, + { + "epoch": 0.3526797617989512, + "grad_norm": 1.6511987447738647, + "learning_rate": 1.975152287048363e-05, + "loss": 1.9447, + "step": 9920 + }, + { + "epoch": 0.3530352857523776, + "grad_norm": 1.7247734069824219, + "learning_rate": 1.9750653900691595e-05, + "loss": 1.9288, + "step": 9930 + }, + { + "epoch": 0.35339080970580394, + "grad_norm": 1.667855143547058, + "learning_rate": 1.9749783433260258e-05, + "loss": 1.9486, + "step": 9940 + }, + { + "epoch": 0.3537463336592303, + "grad_norm": 1.671859622001648, + "learning_rate": 1.9748911468323314e-05, + "loss": 1.9339, + "step": 9950 + }, + { + "epoch": 0.35410185761265667, + "grad_norm": 1.6676883697509766, + "learning_rate": 1.9748038006014698e-05, + "loss": 1.9307, + "step": 9960 + }, + { + "epoch": 0.35445738156608303, + "grad_norm": 1.5199689865112305, + "learning_rate": 1.974716304646856e-05, + "loss": 1.9542, + "step": 9970 + }, + { + "epoch": 0.3548129055195094, + "grad_norm": 1.6532329320907593, + "learning_rate": 1.97462865898193e-05, + "loss": 1.9508, + "step": 9980 + }, + { + "epoch": 0.35516842947293575, + "grad_norm": 1.6774016618728638, + "learning_rate": 1.9745408636201525e-05, + "loss": 1.9041, + "step": 9990 + }, + { + "epoch": 0.3555239534263621, + "grad_norm": 1.5950804948806763, + "learning_rate": 1.9744529185750093e-05, + "loss": 1.9588, + "step": 10000 + }, + { + "epoch": 0.3558794773797885, + "grad_norm": 1.639394998550415, + "learning_rate": 1.9743648238600074e-05, + "loss": 1.9836, + "step": 10010 + }, + { + "epoch": 0.35623500133321484, + "grad_norm": 1.6646896600723267, + "learning_rate": 1.9742765794886774e-05, + "loss": 1.9421, + "step": 10020 + }, + { + "epoch": 0.3565905252866412, + "grad_norm": 1.6006731986999512, + "learning_rate": 1.974188185474574e-05, + "loss": 1.9689, + "step": 10030 + }, + { + "epoch": 0.35694604924006756, + "grad_norm": 1.85233736038208, + "learning_rate": 1.9740996418312735e-05, + "loss": 1.9364, + "step": 10040 + }, + { + "epoch": 0.3573015731934939, + "grad_norm": 1.7664031982421875, + "learning_rate": 1.9740109485723758e-05, + "loss": 1.9545, + "step": 10050 + }, + { + "epoch": 0.3576570971469203, + "grad_norm": 1.6243541240692139, + "learning_rate": 1.973922105711503e-05, + "loss": 1.9743, + "step": 10060 + }, + { + "epoch": 0.35801262110034665, + "grad_norm": 1.7577147483825684, + "learning_rate": 1.973833113262302e-05, + "loss": 1.9407, + "step": 10070 + }, + { + "epoch": 0.358368145053773, + "grad_norm": 1.598021388053894, + "learning_rate": 1.9737439712384404e-05, + "loss": 1.9488, + "step": 10080 + }, + { + "epoch": 0.3587236690071994, + "grad_norm": 1.663016676902771, + "learning_rate": 1.97365467965361e-05, + "loss": 1.9123, + "step": 10090 + }, + { + "epoch": 0.35907919296062574, + "grad_norm": 1.629992127418518, + "learning_rate": 1.973565238521526e-05, + "loss": 1.9211, + "step": 10100 + }, + { + "epoch": 0.3594347169140521, + "grad_norm": 1.6966192722320557, + "learning_rate": 1.9734756478559255e-05, + "loss": 1.9352, + "step": 10110 + }, + { + "epoch": 0.35979024086747846, + "grad_norm": 1.6816771030426025, + "learning_rate": 1.9733859076705696e-05, + "loss": 1.9132, + "step": 10120 + }, + { + "epoch": 0.3601457648209048, + "grad_norm": 1.5039012432098389, + "learning_rate": 1.973296017979241e-05, + "loss": 1.9761, + "step": 10130 + }, + { + "epoch": 0.3605012887743312, + "grad_norm": 1.685896635055542, + "learning_rate": 1.9732059787957466e-05, + "loss": 1.9875, + "step": 10140 + }, + { + "epoch": 0.36085681272775755, + "grad_norm": 1.6027092933654785, + "learning_rate": 1.973115790133916e-05, + "loss": 1.9406, + "step": 10150 + }, + { + "epoch": 0.3612123366811839, + "grad_norm": 1.6120712757110596, + "learning_rate": 1.973025452007602e-05, + "loss": 1.9095, + "step": 10160 + }, + { + "epoch": 0.36156786063461027, + "grad_norm": 1.705488920211792, + "learning_rate": 1.972934964430679e-05, + "loss": 1.942, + "step": 10170 + }, + { + "epoch": 0.36192338458803663, + "grad_norm": 1.63666832447052, + "learning_rate": 1.972844327417046e-05, + "loss": 1.9205, + "step": 10180 + }, + { + "epoch": 0.362278908541463, + "grad_norm": 1.6071900129318237, + "learning_rate": 1.972753540980624e-05, + "loss": 1.9248, + "step": 10190 + }, + { + "epoch": 0.36263443249488936, + "grad_norm": 1.6618894338607788, + "learning_rate": 1.972662605135357e-05, + "loss": 1.9252, + "step": 10200 + }, + { + "epoch": 0.3629899564483157, + "grad_norm": 1.6412007808685303, + "learning_rate": 1.972571519895213e-05, + "loss": 1.8986, + "step": 10210 + }, + { + "epoch": 0.3633454804017421, + "grad_norm": 1.6706079244613647, + "learning_rate": 1.9724802852741817e-05, + "loss": 1.9449, + "step": 10220 + }, + { + "epoch": 0.36370100435516844, + "grad_norm": 1.550260305404663, + "learning_rate": 1.9723889012862757e-05, + "loss": 1.9436, + "step": 10230 + }, + { + "epoch": 0.3640565283085948, + "grad_norm": 1.6799967288970947, + "learning_rate": 1.9722973679455316e-05, + "loss": 1.9889, + "step": 10240 + }, + { + "epoch": 0.36441205226202117, + "grad_norm": 1.598900556564331, + "learning_rate": 1.972205685266008e-05, + "loss": 1.9648, + "step": 10250 + }, + { + "epoch": 0.36476757621544753, + "grad_norm": 1.63681960105896, + "learning_rate": 1.972113853261787e-05, + "loss": 1.8904, + "step": 10260 + }, + { + "epoch": 0.3651231001688739, + "grad_norm": 1.6806844472885132, + "learning_rate": 1.9720218719469732e-05, + "loss": 1.9616, + "step": 10270 + }, + { + "epoch": 0.36547862412230026, + "grad_norm": 1.6543742418289185, + "learning_rate": 1.9719297413356945e-05, + "loss": 1.8776, + "step": 10280 + }, + { + "epoch": 0.3658341480757266, + "grad_norm": 1.6117966175079346, + "learning_rate": 1.971837461442102e-05, + "loss": 1.9284, + "step": 10290 + }, + { + "epoch": 0.366189672029153, + "grad_norm": 1.5849404335021973, + "learning_rate": 1.9717450322803682e-05, + "loss": 1.9351, + "step": 10300 + }, + { + "epoch": 0.36654519598257934, + "grad_norm": 1.7554417848587036, + "learning_rate": 1.9716524538646912e-05, + "loss": 1.9462, + "step": 10310 + }, + { + "epoch": 0.3669007199360057, + "grad_norm": 1.5392756462097168, + "learning_rate": 1.971559726209289e-05, + "loss": 1.9657, + "step": 10320 + }, + { + "epoch": 0.36725624388943207, + "grad_norm": 1.6382545232772827, + "learning_rate": 1.9714668493284045e-05, + "loss": 1.9089, + "step": 10330 + }, + { + "epoch": 0.36761176784285843, + "grad_norm": 1.6288036108016968, + "learning_rate": 1.9713738232363033e-05, + "loss": 1.9634, + "step": 10340 + }, + { + "epoch": 0.3679672917962848, + "grad_norm": 1.622418761253357, + "learning_rate": 1.9712806479472736e-05, + "loss": 1.926, + "step": 10350 + }, + { + "epoch": 0.36832281574971115, + "grad_norm": 1.5704150199890137, + "learning_rate": 1.9711873234756262e-05, + "loss": 1.9506, + "step": 10360 + }, + { + "epoch": 0.3686783397031375, + "grad_norm": 1.7928626537322998, + "learning_rate": 1.971093849835695e-05, + "loss": 1.9115, + "step": 10370 + }, + { + "epoch": 0.3690338636565639, + "grad_norm": 1.653241515159607, + "learning_rate": 1.9710002270418377e-05, + "loss": 1.9287, + "step": 10380 + }, + { + "epoch": 0.36938938760999024, + "grad_norm": 1.6538161039352417, + "learning_rate": 1.970906455108433e-05, + "loss": 1.9478, + "step": 10390 + }, + { + "epoch": 0.3697449115634166, + "grad_norm": 1.6418386697769165, + "learning_rate": 1.9708125340498853e-05, + "loss": 1.9312, + "step": 10400 + }, + { + "epoch": 0.37010043551684296, + "grad_norm": 1.542913556098938, + "learning_rate": 1.970718463880619e-05, + "loss": 1.9444, + "step": 10410 + }, + { + "epoch": 0.3704559594702693, + "grad_norm": 1.6365742683410645, + "learning_rate": 1.9706242446150833e-05, + "loss": 1.9704, + "step": 10420 + }, + { + "epoch": 0.3708114834236957, + "grad_norm": 1.6890367269515991, + "learning_rate": 1.9705298762677492e-05, + "loss": 1.9118, + "step": 10430 + }, + { + "epoch": 0.37116700737712205, + "grad_norm": 1.7141989469528198, + "learning_rate": 1.9704353588531115e-05, + "loss": 1.9375, + "step": 10440 + }, + { + "epoch": 0.3715225313305484, + "grad_norm": 1.6225630044937134, + "learning_rate": 1.9703406923856875e-05, + "loss": 1.969, + "step": 10450 + }, + { + "epoch": 0.3718780552839748, + "grad_norm": 1.5948486328125, + "learning_rate": 1.9702458768800173e-05, + "loss": 1.9047, + "step": 10460 + }, + { + "epoch": 0.37223357923740114, + "grad_norm": 1.6593469381332397, + "learning_rate": 1.9701509123506635e-05, + "loss": 1.9562, + "step": 10470 + }, + { + "epoch": 0.3725891031908275, + "grad_norm": 1.6699244976043701, + "learning_rate": 1.970055798812213e-05, + "loss": 1.9842, + "step": 10480 + }, + { + "epoch": 0.37294462714425386, + "grad_norm": 1.6418598890304565, + "learning_rate": 1.9699605362792736e-05, + "loss": 1.9448, + "step": 10490 + }, + { + "epoch": 0.3733001510976802, + "grad_norm": 1.7763127088546753, + "learning_rate": 1.9698651247664778e-05, + "loss": 1.9569, + "step": 10500 + }, + { + "epoch": 0.3736556750511066, + "grad_norm": 1.5967490673065186, + "learning_rate": 1.96976956428848e-05, + "loss": 1.9609, + "step": 10510 + }, + { + "epoch": 0.37401119900453295, + "grad_norm": 1.666662573814392, + "learning_rate": 1.9696738548599575e-05, + "loss": 1.8398, + "step": 10520 + }, + { + "epoch": 0.3743667229579593, + "grad_norm": 1.682659387588501, + "learning_rate": 1.9695779964956106e-05, + "loss": 1.9632, + "step": 10530 + }, + { + "epoch": 0.37472224691138567, + "grad_norm": 1.6394110918045044, + "learning_rate": 1.9694819892101627e-05, + "loss": 1.9686, + "step": 10540 + }, + { + "epoch": 0.37507777086481203, + "grad_norm": 1.7876205444335938, + "learning_rate": 1.9693858330183604e-05, + "loss": 1.9342, + "step": 10550 + }, + { + "epoch": 0.3754332948182384, + "grad_norm": 1.6362394094467163, + "learning_rate": 1.9692895279349723e-05, + "loss": 1.9146, + "step": 10560 + }, + { + "epoch": 0.37578881877166476, + "grad_norm": 1.6904796361923218, + "learning_rate": 1.96919307397479e-05, + "loss": 1.9252, + "step": 10570 + }, + { + "epoch": 0.3761443427250911, + "grad_norm": 1.64126455783844, + "learning_rate": 1.969096471152628e-05, + "loss": 1.9646, + "step": 10580 + }, + { + "epoch": 0.3764998666785175, + "grad_norm": 1.6415106058120728, + "learning_rate": 1.968999719483325e-05, + "loss": 1.9016, + "step": 10590 + }, + { + "epoch": 0.37685539063194384, + "grad_norm": 1.6585341691970825, + "learning_rate": 1.96890281898174e-05, + "loss": 1.9525, + "step": 10600 + }, + { + "epoch": 0.3772109145853702, + "grad_norm": 1.6419931650161743, + "learning_rate": 1.9688057696627575e-05, + "loss": 1.9364, + "step": 10610 + }, + { + "epoch": 0.37756643853879657, + "grad_norm": 1.625403642654419, + "learning_rate": 1.9687085715412832e-05, + "loss": 1.9232, + "step": 10620 + }, + { + "epoch": 0.37792196249222293, + "grad_norm": 1.5984399318695068, + "learning_rate": 1.9686112246322464e-05, + "loss": 1.9623, + "step": 10630 + }, + { + "epoch": 0.3782774864456493, + "grad_norm": 1.6441439390182495, + "learning_rate": 1.9685137289505985e-05, + "loss": 1.9348, + "step": 10640 + }, + { + "epoch": 0.37863301039907565, + "grad_norm": 1.6237053871154785, + "learning_rate": 1.9684160845113145e-05, + "loss": 1.9182, + "step": 10650 + }, + { + "epoch": 0.378988534352502, + "grad_norm": 1.6530194282531738, + "learning_rate": 1.9683182913293918e-05, + "loss": 1.9084, + "step": 10660 + }, + { + "epoch": 0.3793440583059284, + "grad_norm": 1.6410064697265625, + "learning_rate": 1.968220349419851e-05, + "loss": 1.9048, + "step": 10670 + }, + { + "epoch": 0.37969958225935474, + "grad_norm": 1.5841423273086548, + "learning_rate": 1.9681222587977357e-05, + "loss": 1.9042, + "step": 10680 + }, + { + "epoch": 0.3800551062127811, + "grad_norm": 1.5987772941589355, + "learning_rate": 1.9680240194781113e-05, + "loss": 1.9088, + "step": 10690 + }, + { + "epoch": 0.38041063016620746, + "grad_norm": 1.746238350868225, + "learning_rate": 1.967925631476067e-05, + "loss": 1.948, + "step": 10700 + }, + { + "epoch": 0.3807661541196338, + "grad_norm": 1.6123676300048828, + "learning_rate": 1.9678270948067148e-05, + "loss": 1.9685, + "step": 10710 + }, + { + "epoch": 0.3811216780730602, + "grad_norm": 1.5973690748214722, + "learning_rate": 1.9677284094851893e-05, + "loss": 1.9073, + "step": 10720 + }, + { + "epoch": 0.38147720202648655, + "grad_norm": 1.5998799800872803, + "learning_rate": 1.9676295755266475e-05, + "loss": 1.9561, + "step": 10730 + }, + { + "epoch": 0.3818327259799129, + "grad_norm": 1.565384030342102, + "learning_rate": 1.96753059294627e-05, + "loss": 1.9553, + "step": 10740 + }, + { + "epoch": 0.3821882499333393, + "grad_norm": 1.621324896812439, + "learning_rate": 1.9674314617592597e-05, + "loss": 1.9116, + "step": 10750 + }, + { + "epoch": 0.38254377388676564, + "grad_norm": 1.5708541870117188, + "learning_rate": 1.9673321819808432e-05, + "loss": 1.9764, + "step": 10760 + }, + { + "epoch": 0.382899297840192, + "grad_norm": 1.6714891195297241, + "learning_rate": 1.9672327536262687e-05, + "loss": 1.9354, + "step": 10770 + }, + { + "epoch": 0.38325482179361836, + "grad_norm": 1.6341030597686768, + "learning_rate": 1.9671331767108074e-05, + "loss": 1.9139, + "step": 10780 + }, + { + "epoch": 0.3836103457470447, + "grad_norm": 1.5689213275909424, + "learning_rate": 1.9670334512497546e-05, + "loss": 1.9208, + "step": 10790 + }, + { + "epoch": 0.3839658697004711, + "grad_norm": 1.6299171447753906, + "learning_rate": 1.9669335772584268e-05, + "loss": 1.875, + "step": 10800 + }, + { + "epoch": 0.38432139365389745, + "grad_norm": 1.6344177722930908, + "learning_rate": 1.9668335547521645e-05, + "loss": 1.9297, + "step": 10810 + }, + { + "epoch": 0.3846769176073238, + "grad_norm": 1.5587433576583862, + "learning_rate": 1.9667333837463302e-05, + "loss": 1.9084, + "step": 10820 + }, + { + "epoch": 0.38503244156075017, + "grad_norm": 1.574712872505188, + "learning_rate": 1.966633064256309e-05, + "loss": 1.9071, + "step": 10830 + }, + { + "epoch": 0.38538796551417653, + "grad_norm": 1.5423403978347778, + "learning_rate": 1.966532596297511e-05, + "loss": 1.9419, + "step": 10840 + }, + { + "epoch": 0.3857434894676029, + "grad_norm": 1.6940547227859497, + "learning_rate": 1.966431979885366e-05, + "loss": 1.9032, + "step": 10850 + }, + { + "epoch": 0.38609901342102926, + "grad_norm": 1.6661192178726196, + "learning_rate": 1.9663312150353283e-05, + "loss": 1.971, + "step": 10860 + }, + { + "epoch": 0.3864545373744556, + "grad_norm": 1.6065622568130493, + "learning_rate": 1.966230301762875e-05, + "loss": 1.9547, + "step": 10870 + }, + { + "epoch": 0.386810061327882, + "grad_norm": 1.5964338779449463, + "learning_rate": 1.966129240083505e-05, + "loss": 1.9231, + "step": 10880 + }, + { + "epoch": 0.38716558528130834, + "grad_norm": 1.7134864330291748, + "learning_rate": 1.9660280300127423e-05, + "loss": 1.9143, + "step": 10890 + }, + { + "epoch": 0.3875211092347347, + "grad_norm": 1.6787341833114624, + "learning_rate": 1.965926671566131e-05, + "loss": 1.9589, + "step": 10900 + }, + { + "epoch": 0.38787663318816107, + "grad_norm": 1.7104814052581787, + "learning_rate": 1.9658251647592396e-05, + "loss": 1.9147, + "step": 10910 + }, + { + "epoch": 0.38823215714158743, + "grad_norm": 1.6374329328536987, + "learning_rate": 1.965723509607658e-05, + "loss": 1.9499, + "step": 10920 + }, + { + "epoch": 0.3885876810950138, + "grad_norm": 1.6665334701538086, + "learning_rate": 1.965621706127001e-05, + "loss": 1.8982, + "step": 10930 + }, + { + "epoch": 0.38894320504844015, + "grad_norm": 1.658685564994812, + "learning_rate": 1.9655197543329043e-05, + "loss": 1.9017, + "step": 10940 + }, + { + "epoch": 0.3892987290018665, + "grad_norm": 1.6136287450790405, + "learning_rate": 1.9654176542410274e-05, + "loss": 1.9129, + "step": 10950 + }, + { + "epoch": 0.3896542529552929, + "grad_norm": 1.6693179607391357, + "learning_rate": 1.9653154058670517e-05, + "loss": 1.9192, + "step": 10960 + }, + { + "epoch": 0.39000977690871924, + "grad_norm": 1.6613389253616333, + "learning_rate": 1.965213009226682e-05, + "loss": 1.911, + "step": 10970 + }, + { + "epoch": 0.3903653008621456, + "grad_norm": 1.5833563804626465, + "learning_rate": 1.9651104643356465e-05, + "loss": 1.9032, + "step": 10980 + }, + { + "epoch": 0.39072082481557197, + "grad_norm": 1.7355057001113892, + "learning_rate": 1.965007771209695e-05, + "loss": 1.9614, + "step": 10990 + }, + { + "epoch": 0.3910763487689983, + "grad_norm": 1.660593032836914, + "learning_rate": 1.9649049298646004e-05, + "loss": 1.9462, + "step": 11000 + }, + { + "epoch": 0.3914318727224247, + "grad_norm": 1.7234996557235718, + "learning_rate": 1.964801940316158e-05, + "loss": 1.964, + "step": 11010 + }, + { + "epoch": 0.39178739667585105, + "grad_norm": 1.5677638053894043, + "learning_rate": 1.9646988025801878e-05, + "loss": 1.9563, + "step": 11020 + }, + { + "epoch": 0.3921429206292774, + "grad_norm": 1.5681052207946777, + "learning_rate": 1.9645955166725298e-05, + "loss": 1.9512, + "step": 11030 + }, + { + "epoch": 0.3924984445827038, + "grad_norm": 1.7496832609176636, + "learning_rate": 1.9644920826090485e-05, + "loss": 1.9241, + "step": 11040 + }, + { + "epoch": 0.39285396853613014, + "grad_norm": 1.7510963678359985, + "learning_rate": 1.9643885004056308e-05, + "loss": 1.9386, + "step": 11050 + }, + { + "epoch": 0.3932094924895565, + "grad_norm": 1.56932532787323, + "learning_rate": 1.964284770078186e-05, + "loss": 1.9036, + "step": 11060 + }, + { + "epoch": 0.39356501644298286, + "grad_norm": 1.6665147542953491, + "learning_rate": 1.9641808916426468e-05, + "loss": 1.9061, + "step": 11070 + }, + { + "epoch": 0.3939205403964092, + "grad_norm": 1.6765482425689697, + "learning_rate": 1.9640768651149683e-05, + "loss": 1.9217, + "step": 11080 + }, + { + "epoch": 0.3942760643498356, + "grad_norm": 1.7809001207351685, + "learning_rate": 1.9639726905111275e-05, + "loss": 1.9605, + "step": 11090 + }, + { + "epoch": 0.39463158830326195, + "grad_norm": 1.6571179628372192, + "learning_rate": 1.9638683678471262e-05, + "loss": 1.9131, + "step": 11100 + }, + { + "epoch": 0.3949871122566883, + "grad_norm": 1.6656454801559448, + "learning_rate": 1.963763897138987e-05, + "loss": 1.9423, + "step": 11110 + }, + { + "epoch": 0.3953426362101147, + "grad_norm": 1.6632436513900757, + "learning_rate": 1.963659278402756e-05, + "loss": 1.8889, + "step": 11120 + }, + { + "epoch": 0.39569816016354104, + "grad_norm": 1.6332790851593018, + "learning_rate": 1.963554511654502e-05, + "loss": 1.9505, + "step": 11130 + }, + { + "epoch": 0.3960536841169674, + "grad_norm": 1.6795090436935425, + "learning_rate": 1.963449596910316e-05, + "loss": 1.9273, + "step": 11140 + }, + { + "epoch": 0.39640920807039376, + "grad_norm": 1.7208044528961182, + "learning_rate": 1.963344534186314e-05, + "loss": 1.9377, + "step": 11150 + }, + { + "epoch": 0.3967647320238201, + "grad_norm": 1.6807401180267334, + "learning_rate": 1.963239323498631e-05, + "loss": 1.9144, + "step": 11160 + }, + { + "epoch": 0.3971202559772465, + "grad_norm": 1.535725474357605, + "learning_rate": 1.9631339648634273e-05, + "loss": 1.905, + "step": 11170 + }, + { + "epoch": 0.39747577993067285, + "grad_norm": 1.7075817584991455, + "learning_rate": 1.9630284582968858e-05, + "loss": 1.9189, + "step": 11180 + }, + { + "epoch": 0.3978313038840992, + "grad_norm": 1.557862639427185, + "learning_rate": 1.9629228038152114e-05, + "loss": 1.9081, + "step": 11190 + }, + { + "epoch": 0.39818682783752557, + "grad_norm": 1.5928765535354614, + "learning_rate": 1.9628170014346316e-05, + "loss": 1.9666, + "step": 11200 + }, + { + "epoch": 0.39854235179095193, + "grad_norm": 1.5738533735275269, + "learning_rate": 1.9627110511713977e-05, + "loss": 1.917, + "step": 11210 + }, + { + "epoch": 0.3988978757443783, + "grad_norm": 1.663844108581543, + "learning_rate": 1.962604953041782e-05, + "loss": 1.9212, + "step": 11220 + }, + { + "epoch": 0.39925339969780466, + "grad_norm": 1.6645182371139526, + "learning_rate": 1.9624987070620817e-05, + "loss": 1.9029, + "step": 11230 + }, + { + "epoch": 0.399608923651231, + "grad_norm": 1.6867510080337524, + "learning_rate": 1.9623923132486148e-05, + "loss": 1.95, + "step": 11240 + }, + { + "epoch": 0.3999644476046574, + "grad_norm": 1.7048712968826294, + "learning_rate": 1.9622857716177223e-05, + "loss": 1.8893, + "step": 11250 + }, + { + "epoch": 0.40031997155808374, + "grad_norm": 1.668708324432373, + "learning_rate": 1.9621790821857693e-05, + "loss": 1.9566, + "step": 11260 + }, + { + "epoch": 0.4006754955115101, + "grad_norm": 1.6398382186889648, + "learning_rate": 1.962072244969142e-05, + "loss": 1.894, + "step": 11270 + }, + { + "epoch": 0.40103101946493647, + "grad_norm": 1.7319424152374268, + "learning_rate": 1.9619652599842506e-05, + "loss": 1.9149, + "step": 11280 + }, + { + "epoch": 0.40138654341836283, + "grad_norm": 1.6810721158981323, + "learning_rate": 1.9618581272475263e-05, + "loss": 1.9177, + "step": 11290 + }, + { + "epoch": 0.4017420673717892, + "grad_norm": 1.6722646951675415, + "learning_rate": 1.9617508467754248e-05, + "loss": 1.9404, + "step": 11300 + }, + { + "epoch": 0.40209759132521555, + "grad_norm": 1.566178798675537, + "learning_rate": 1.9616434185844233e-05, + "loss": 1.913, + "step": 11310 + }, + { + "epoch": 0.4024531152786419, + "grad_norm": 1.5735375881195068, + "learning_rate": 1.9615358426910223e-05, + "loss": 1.9028, + "step": 11320 + }, + { + "epoch": 0.4028086392320683, + "grad_norm": 1.5467545986175537, + "learning_rate": 1.9614281191117448e-05, + "loss": 1.9518, + "step": 11330 + }, + { + "epoch": 0.40316416318549464, + "grad_norm": 1.557876706123352, + "learning_rate": 1.9613202478631365e-05, + "loss": 1.9298, + "step": 11340 + }, + { + "epoch": 0.403519687138921, + "grad_norm": 1.5856714248657227, + "learning_rate": 1.9612122289617656e-05, + "loss": 1.8999, + "step": 11350 + }, + { + "epoch": 0.40387521109234736, + "grad_norm": 1.6419317722320557, + "learning_rate": 1.961104062424223e-05, + "loss": 1.9112, + "step": 11360 + }, + { + "epoch": 0.4042307350457737, + "grad_norm": 1.5576741695404053, + "learning_rate": 1.9609957482671228e-05, + "loss": 1.9552, + "step": 11370 + }, + { + "epoch": 0.4045862589992001, + "grad_norm": 1.6433225870132446, + "learning_rate": 1.960887286507101e-05, + "loss": 1.9176, + "step": 11380 + }, + { + "epoch": 0.40494178295262645, + "grad_norm": 1.6947438716888428, + "learning_rate": 1.9607786771608167e-05, + "loss": 1.9159, + "step": 11390 + }, + { + "epoch": 0.4052973069060528, + "grad_norm": 1.6012718677520752, + "learning_rate": 1.960669920244952e-05, + "loss": 1.9614, + "step": 11400 + }, + { + "epoch": 0.4056528308594792, + "grad_norm": 1.623199224472046, + "learning_rate": 1.960561015776211e-05, + "loss": 2.0027, + "step": 11410 + }, + { + "epoch": 0.40600835481290554, + "grad_norm": 1.6276559829711914, + "learning_rate": 1.9604519637713207e-05, + "loss": 1.9317, + "step": 11420 + }, + { + "epoch": 0.4063638787663319, + "grad_norm": 1.6999454498291016, + "learning_rate": 1.9603427642470306e-05, + "loss": 1.9109, + "step": 11430 + }, + { + "epoch": 0.40671940271975826, + "grad_norm": 1.6149712800979614, + "learning_rate": 1.9602334172201138e-05, + "loss": 1.9643, + "step": 11440 + }, + { + "epoch": 0.4070749266731846, + "grad_norm": 1.6748502254486084, + "learning_rate": 1.9601239227073644e-05, + "loss": 1.9647, + "step": 11450 + }, + { + "epoch": 0.407430450626611, + "grad_norm": 1.6914095878601074, + "learning_rate": 1.9600142807256003e-05, + "loss": 1.9464, + "step": 11460 + }, + { + "epoch": 0.40778597458003735, + "grad_norm": 1.6857210397720337, + "learning_rate": 1.9599044912916624e-05, + "loss": 1.9394, + "step": 11470 + }, + { + "epoch": 0.4081414985334637, + "grad_norm": 1.6877509355545044, + "learning_rate": 1.9597945544224134e-05, + "loss": 1.9421, + "step": 11480 + }, + { + "epoch": 0.40849702248689007, + "grad_norm": 1.6180742979049683, + "learning_rate": 1.9596844701347386e-05, + "loss": 1.9018, + "step": 11490 + }, + { + "epoch": 0.40885254644031643, + "grad_norm": 2.0876150131225586, + "learning_rate": 1.9595742384455466e-05, + "loss": 1.8849, + "step": 11500 + }, + { + "epoch": 0.4092080703937428, + "grad_norm": 1.581478476524353, + "learning_rate": 1.9594638593717676e-05, + "loss": 1.9041, + "step": 11510 + }, + { + "epoch": 0.40956359434716916, + "grad_norm": 1.655171275138855, + "learning_rate": 1.9593533329303562e-05, + "loss": 1.9051, + "step": 11520 + }, + { + "epoch": 0.4099191183005955, + "grad_norm": 1.6481330394744873, + "learning_rate": 1.959242659138288e-05, + "loss": 1.9061, + "step": 11530 + }, + { + "epoch": 0.4102746422540219, + "grad_norm": 1.7449766397476196, + "learning_rate": 1.9591318380125618e-05, + "loss": 1.9395, + "step": 11540 + }, + { + "epoch": 0.41063016620744824, + "grad_norm": 1.6915010213851929, + "learning_rate": 1.959020869570199e-05, + "loss": 1.932, + "step": 11550 + }, + { + "epoch": 0.4109856901608746, + "grad_norm": 1.5623327493667603, + "learning_rate": 1.958909753828244e-05, + "loss": 1.8734, + "step": 11560 + }, + { + "epoch": 0.41134121411430097, + "grad_norm": 1.646648645401001, + "learning_rate": 1.9587984908037628e-05, + "loss": 1.8816, + "step": 11570 + }, + { + "epoch": 0.41169673806772733, + "grad_norm": 1.6405104398727417, + "learning_rate": 1.9586870805138452e-05, + "loss": 1.9458, + "step": 11580 + }, + { + "epoch": 0.4120522620211537, + "grad_norm": 1.5775431394577026, + "learning_rate": 1.9585755229756035e-05, + "loss": 1.8958, + "step": 11590 + }, + { + "epoch": 0.41240778597458005, + "grad_norm": 1.6822415590286255, + "learning_rate": 1.9584638182061715e-05, + "loss": 1.9371, + "step": 11600 + }, + { + "epoch": 0.4127633099280064, + "grad_norm": 1.6032915115356445, + "learning_rate": 1.9583519662227067e-05, + "loss": 1.9238, + "step": 11610 + }, + { + "epoch": 0.4131188338814328, + "grad_norm": 1.6131377220153809, + "learning_rate": 1.958239967042389e-05, + "loss": 1.9565, + "step": 11620 + }, + { + "epoch": 0.41347435783485914, + "grad_norm": 1.6385976076126099, + "learning_rate": 1.95812782068242e-05, + "loss": 1.8512, + "step": 11630 + }, + { + "epoch": 0.4138298817882855, + "grad_norm": 1.616721272468567, + "learning_rate": 1.9580155271600257e-05, + "loss": 1.928, + "step": 11640 + }, + { + "epoch": 0.41418540574171187, + "grad_norm": 1.5912649631500244, + "learning_rate": 1.9579030864924528e-05, + "loss": 1.9304, + "step": 11650 + }, + { + "epoch": 0.4145409296951382, + "grad_norm": 1.5707584619522095, + "learning_rate": 1.9577904986969724e-05, + "loss": 1.8915, + "step": 11660 + }, + { + "epoch": 0.4148964536485646, + "grad_norm": 1.617496132850647, + "learning_rate": 1.9576777637908765e-05, + "loss": 1.9334, + "step": 11670 + }, + { + "epoch": 0.41525197760199095, + "grad_norm": 1.6306051015853882, + "learning_rate": 1.9575648817914808e-05, + "loss": 1.9461, + "step": 11680 + }, + { + "epoch": 0.4156075015554173, + "grad_norm": 1.6395132541656494, + "learning_rate": 1.9574518527161234e-05, + "loss": 1.902, + "step": 11690 + }, + { + "epoch": 0.4159630255088437, + "grad_norm": 1.6522890329360962, + "learning_rate": 1.9573386765821647e-05, + "loss": 1.9307, + "step": 11700 + }, + { + "epoch": 0.41631854946227004, + "grad_norm": 1.6019282341003418, + "learning_rate": 1.9572253534069876e-05, + "loss": 1.9779, + "step": 11710 + }, + { + "epoch": 0.4166740734156964, + "grad_norm": 1.6080857515335083, + "learning_rate": 1.9571118832079982e-05, + "loss": 1.974, + "step": 11720 + }, + { + "epoch": 0.41702959736912276, + "grad_norm": 1.581402063369751, + "learning_rate": 1.956998266002625e-05, + "loss": 1.9045, + "step": 11730 + }, + { + "epoch": 0.4173851213225491, + "grad_norm": 1.6517729759216309, + "learning_rate": 1.9568845018083182e-05, + "loss": 1.9641, + "step": 11740 + }, + { + "epoch": 0.4177406452759755, + "grad_norm": 1.6685960292816162, + "learning_rate": 1.9567705906425515e-05, + "loss": 1.9623, + "step": 11750 + }, + { + "epoch": 0.41809616922940185, + "grad_norm": 1.6158983707427979, + "learning_rate": 1.956656532522821e-05, + "loss": 1.9484, + "step": 11760 + }, + { + "epoch": 0.4184516931828282, + "grad_norm": 1.55313241481781, + "learning_rate": 1.956542327466646e-05, + "loss": 1.8969, + "step": 11770 + }, + { + "epoch": 0.4188072171362546, + "grad_norm": 1.6633541584014893, + "learning_rate": 1.9564279754915666e-05, + "loss": 1.9344, + "step": 11780 + }, + { + "epoch": 0.41916274108968093, + "grad_norm": 1.5694234371185303, + "learning_rate": 1.9563134766151474e-05, + "loss": 1.9093, + "step": 11790 + }, + { + "epoch": 0.4195182650431073, + "grad_norm": 1.6353678703308105, + "learning_rate": 1.956198830854974e-05, + "loss": 1.8947, + "step": 11800 + }, + { + "epoch": 0.41987378899653366, + "grad_norm": 1.6353477239608765, + "learning_rate": 1.9560840382286556e-05, + "loss": 1.8985, + "step": 11810 + }, + { + "epoch": 0.42022931294996, + "grad_norm": 1.6388938426971436, + "learning_rate": 1.9559690987538235e-05, + "loss": 1.9071, + "step": 11820 + }, + { + "epoch": 0.4205848369033864, + "grad_norm": 1.7158305644989014, + "learning_rate": 1.955854012448132e-05, + "loss": 1.9155, + "step": 11830 + }, + { + "epoch": 0.42094036085681275, + "grad_norm": 1.6991742849349976, + "learning_rate": 1.9557387793292574e-05, + "loss": 1.9092, + "step": 11840 + }, + { + "epoch": 0.4212958848102391, + "grad_norm": 1.7193807363510132, + "learning_rate": 1.955623399414899e-05, + "loss": 1.9046, + "step": 11850 + }, + { + "epoch": 0.42165140876366547, + "grad_norm": 1.6610537767410278, + "learning_rate": 1.9555078727227782e-05, + "loss": 1.9967, + "step": 11860 + }, + { + "epoch": 0.42200693271709183, + "grad_norm": 1.6989997625350952, + "learning_rate": 1.9553921992706392e-05, + "loss": 1.9065, + "step": 11870 + }, + { + "epoch": 0.4223624566705182, + "grad_norm": 1.6551401615142822, + "learning_rate": 1.9552763790762484e-05, + "loss": 1.9347, + "step": 11880 + }, + { + "epoch": 0.42271798062394456, + "grad_norm": 1.628297209739685, + "learning_rate": 1.9551604121573956e-05, + "loss": 1.9132, + "step": 11890 + }, + { + "epoch": 0.4230735045773709, + "grad_norm": 1.6718446016311646, + "learning_rate": 1.9550442985318926e-05, + "loss": 1.931, + "step": 11900 + }, + { + "epoch": 0.4234290285307973, + "grad_norm": 1.5980277061462402, + "learning_rate": 1.9549280382175734e-05, + "loss": 1.9359, + "step": 11910 + }, + { + "epoch": 0.42378455248422364, + "grad_norm": 1.787895917892456, + "learning_rate": 1.954811631232295e-05, + "loss": 1.8912, + "step": 11920 + }, + { + "epoch": 0.42414007643765, + "grad_norm": 1.5402417182922363, + "learning_rate": 1.9546950775939366e-05, + "loss": 1.9499, + "step": 11930 + }, + { + "epoch": 0.42449560039107637, + "grad_norm": 1.5597251653671265, + "learning_rate": 1.9545783773204006e-05, + "loss": 1.9, + "step": 11940 + }, + { + "epoch": 0.42485112434450273, + "grad_norm": 1.6577147245407104, + "learning_rate": 1.954461530429611e-05, + "loss": 1.9103, + "step": 11950 + }, + { + "epoch": 0.4252066482979291, + "grad_norm": 1.6680374145507812, + "learning_rate": 1.9543445369395144e-05, + "loss": 1.8914, + "step": 11960 + }, + { + "epoch": 0.42556217225135545, + "grad_norm": 1.5822381973266602, + "learning_rate": 1.954227396868081e-05, + "loss": 1.8932, + "step": 11970 + }, + { + "epoch": 0.4259176962047818, + "grad_norm": 1.7338037490844727, + "learning_rate": 1.9541101102333026e-05, + "loss": 1.9341, + "step": 11980 + }, + { + "epoch": 0.4262732201582082, + "grad_norm": 1.7024821043014526, + "learning_rate": 1.9539926770531937e-05, + "loss": 1.9385, + "step": 11990 + }, + { + "epoch": 0.42662874411163454, + "grad_norm": 1.6833677291870117, + "learning_rate": 1.9538750973457907e-05, + "loss": 1.9207, + "step": 12000 + }, + { + "epoch": 0.4269842680650609, + "grad_norm": 1.6972459554672241, + "learning_rate": 1.953757371129154e-05, + "loss": 1.8644, + "step": 12010 + }, + { + "epoch": 0.42733979201848726, + "grad_norm": 1.5408259630203247, + "learning_rate": 1.953639498421365e-05, + "loss": 1.9359, + "step": 12020 + }, + { + "epoch": 0.4276953159719136, + "grad_norm": 1.6107205152511597, + "learning_rate": 1.9535214792405286e-05, + "loss": 1.9678, + "step": 12030 + }, + { + "epoch": 0.42805083992534, + "grad_norm": 1.7111223936080933, + "learning_rate": 1.9534033136047715e-05, + "loss": 1.9065, + "step": 12040 + }, + { + "epoch": 0.42840636387876635, + "grad_norm": 1.6180115938186646, + "learning_rate": 1.9532850015322434e-05, + "loss": 1.9034, + "step": 12050 + }, + { + "epoch": 0.4287618878321927, + "grad_norm": 1.5723388195037842, + "learning_rate": 1.953166543041116e-05, + "loss": 1.926, + "step": 12060 + }, + { + "epoch": 0.4291174117856191, + "grad_norm": 1.6159480810165405, + "learning_rate": 1.953047938149584e-05, + "loss": 1.9281, + "step": 12070 + }, + { + "epoch": 0.42947293573904544, + "grad_norm": 1.650526523590088, + "learning_rate": 1.9529291868758646e-05, + "loss": 1.9341, + "step": 12080 + }, + { + "epoch": 0.4298284596924718, + "grad_norm": 1.643018126487732, + "learning_rate": 1.952810289238197e-05, + "loss": 1.9349, + "step": 12090 + }, + { + "epoch": 0.43018398364589816, + "grad_norm": 1.7136913537979126, + "learning_rate": 1.952691245254843e-05, + "loss": 1.9012, + "step": 12100 + }, + { + "epoch": 0.4305395075993245, + "grad_norm": 1.5042798519134521, + "learning_rate": 1.952572054944087e-05, + "loss": 1.9318, + "step": 12110 + }, + { + "epoch": 0.4308950315527509, + "grad_norm": 1.6267616748809814, + "learning_rate": 1.9524527183242364e-05, + "loss": 1.9018, + "step": 12120 + }, + { + "epoch": 0.43125055550617725, + "grad_norm": 1.628717064857483, + "learning_rate": 1.95233323541362e-05, + "loss": 1.9261, + "step": 12130 + }, + { + "epoch": 0.4316060794596036, + "grad_norm": 1.5327138900756836, + "learning_rate": 1.95221360623059e-05, + "loss": 1.9033, + "step": 12140 + }, + { + "epoch": 0.43196160341302997, + "grad_norm": 1.6792123317718506, + "learning_rate": 1.95209383079352e-05, + "loss": 1.8913, + "step": 12150 + }, + { + "epoch": 0.43231712736645633, + "grad_norm": 1.5949348211288452, + "learning_rate": 1.951973909120808e-05, + "loss": 1.9061, + "step": 12160 + }, + { + "epoch": 0.4326726513198827, + "grad_norm": 1.5922819375991821, + "learning_rate": 1.9518538412308717e-05, + "loss": 1.979, + "step": 12170 + }, + { + "epoch": 0.43302817527330906, + "grad_norm": 1.6448159217834473, + "learning_rate": 1.9517336271421542e-05, + "loss": 1.9077, + "step": 12180 + }, + { + "epoch": 0.4333836992267354, + "grad_norm": 1.6367768049240112, + "learning_rate": 1.9516132668731186e-05, + "loss": 1.9506, + "step": 12190 + }, + { + "epoch": 0.4337392231801618, + "grad_norm": 1.6600077152252197, + "learning_rate": 1.951492760442252e-05, + "loss": 1.9433, + "step": 12200 + }, + { + "epoch": 0.43409474713358814, + "grad_norm": 1.5921251773834229, + "learning_rate": 1.9513721078680633e-05, + "loss": 1.9233, + "step": 12210 + }, + { + "epoch": 0.4344502710870145, + "grad_norm": 1.6690195798873901, + "learning_rate": 1.9512513091690838e-05, + "loss": 1.9013, + "step": 12220 + }, + { + "epoch": 0.43480579504044087, + "grad_norm": 1.6232579946517944, + "learning_rate": 1.951130364363868e-05, + "loss": 1.9308, + "step": 12230 + }, + { + "epoch": 0.43516131899386723, + "grad_norm": 1.5536707639694214, + "learning_rate": 1.9510092734709917e-05, + "loss": 1.8874, + "step": 12240 + }, + { + "epoch": 0.4355168429472936, + "grad_norm": 1.5925525426864624, + "learning_rate": 1.9508880365090537e-05, + "loss": 1.9133, + "step": 12250 + }, + { + "epoch": 0.43587236690071995, + "grad_norm": 1.5788756608963013, + "learning_rate": 1.9507666534966755e-05, + "loss": 1.9278, + "step": 12260 + }, + { + "epoch": 0.4362278908541463, + "grad_norm": 1.5934933423995972, + "learning_rate": 1.9506451244525008e-05, + "loss": 1.8873, + "step": 12270 + }, + { + "epoch": 0.4365834148075727, + "grad_norm": 1.6129816770553589, + "learning_rate": 1.9505234493951953e-05, + "loss": 1.8911, + "step": 12280 + }, + { + "epoch": 0.43693893876099904, + "grad_norm": 1.5828301906585693, + "learning_rate": 1.950401628343448e-05, + "loss": 1.8813, + "step": 12290 + }, + { + "epoch": 0.4372944627144254, + "grad_norm": 1.6710927486419678, + "learning_rate": 1.9502796613159698e-05, + "loss": 1.9387, + "step": 12300 + }, + { + "epoch": 0.43764998666785176, + "grad_norm": 1.695595622062683, + "learning_rate": 1.9501575483314938e-05, + "loss": 1.9406, + "step": 12310 + }, + { + "epoch": 0.4380055106212781, + "grad_norm": 1.6410837173461914, + "learning_rate": 1.9500352894087754e-05, + "loss": 1.9012, + "step": 12320 + }, + { + "epoch": 0.4383610345747045, + "grad_norm": 1.558901309967041, + "learning_rate": 1.949912884566594e-05, + "loss": 1.9015, + "step": 12330 + }, + { + "epoch": 0.43871655852813085, + "grad_norm": 1.6479629278182983, + "learning_rate": 1.9497903338237495e-05, + "loss": 1.8987, + "step": 12340 + }, + { + "epoch": 0.4390720824815572, + "grad_norm": 1.517884612083435, + "learning_rate": 1.9496676371990647e-05, + "loss": 1.8754, + "step": 12350 + }, + { + "epoch": 0.4394276064349836, + "grad_norm": 1.6467676162719727, + "learning_rate": 1.9495447947113852e-05, + "loss": 1.9431, + "step": 12360 + }, + { + "epoch": 0.43978313038840994, + "grad_norm": 1.6293061971664429, + "learning_rate": 1.949421806379579e-05, + "loss": 1.9369, + "step": 12370 + }, + { + "epoch": 0.4401386543418363, + "grad_norm": 1.5900800228118896, + "learning_rate": 1.9492986722225363e-05, + "loss": 1.9401, + "step": 12380 + }, + { + "epoch": 0.44049417829526266, + "grad_norm": 1.676795482635498, + "learning_rate": 1.9491753922591695e-05, + "loss": 1.8996, + "step": 12390 + }, + { + "epoch": 0.440849702248689, + "grad_norm": 1.604500412940979, + "learning_rate": 1.9490519665084142e-05, + "loss": 1.9152, + "step": 12400 + }, + { + "epoch": 0.4412052262021154, + "grad_norm": 1.7166574001312256, + "learning_rate": 1.9489283949892275e-05, + "loss": 1.9418, + "step": 12410 + }, + { + "epoch": 0.44156075015554175, + "grad_norm": 1.6014140844345093, + "learning_rate": 1.948804677720589e-05, + "loss": 1.8891, + "step": 12420 + }, + { + "epoch": 0.4419162741089681, + "grad_norm": 1.6134687662124634, + "learning_rate": 1.9486808147215007e-05, + "loss": 1.9174, + "step": 12430 + }, + { + "epoch": 0.44227179806239447, + "grad_norm": 1.6366617679595947, + "learning_rate": 1.948556806010988e-05, + "loss": 1.9196, + "step": 12440 + }, + { + "epoch": 0.44262732201582083, + "grad_norm": 1.7176318168640137, + "learning_rate": 1.9484326516080973e-05, + "loss": 1.9369, + "step": 12450 + }, + { + "epoch": 0.4429828459692472, + "grad_norm": 1.704402208328247, + "learning_rate": 1.948308351531898e-05, + "loss": 1.9034, + "step": 12460 + }, + { + "epoch": 0.44333836992267356, + "grad_norm": 1.6780890226364136, + "learning_rate": 1.948183905801482e-05, + "loss": 1.921, + "step": 12470 + }, + { + "epoch": 0.4436938938760999, + "grad_norm": 1.6434670686721802, + "learning_rate": 1.9480593144359627e-05, + "loss": 1.921, + "step": 12480 + }, + { + "epoch": 0.4440494178295263, + "grad_norm": 1.7120178937911987, + "learning_rate": 1.9479345774544774e-05, + "loss": 1.9433, + "step": 12490 + }, + { + "epoch": 0.44440494178295264, + "grad_norm": 1.7237123250961304, + "learning_rate": 1.9478096948761846e-05, + "loss": 1.9048, + "step": 12500 + }, + { + "epoch": 0.444760465736379, + "grad_norm": 1.5160402059555054, + "learning_rate": 1.9476846667202657e-05, + "loss": 1.8998, + "step": 12510 + }, + { + "epoch": 0.44511598968980537, + "grad_norm": 1.6251343488693237, + "learning_rate": 1.947559493005924e-05, + "loss": 1.9001, + "step": 12520 + }, + { + "epoch": 0.44547151364323173, + "grad_norm": 1.6302895545959473, + "learning_rate": 1.947434173752385e-05, + "loss": 1.894, + "step": 12530 + }, + { + "epoch": 0.4458270375966581, + "grad_norm": 1.5880894660949707, + "learning_rate": 1.9473087089788975e-05, + "loss": 1.9003, + "step": 12540 + }, + { + "epoch": 0.44618256155008446, + "grad_norm": 1.6394000053405762, + "learning_rate": 1.947183098704732e-05, + "loss": 1.9044, + "step": 12550 + }, + { + "epoch": 0.4465380855035108, + "grad_norm": 1.6077085733413696, + "learning_rate": 1.9470573429491816e-05, + "loss": 1.9345, + "step": 12560 + }, + { + "epoch": 0.4468936094569372, + "grad_norm": 1.6135870218276978, + "learning_rate": 1.9469314417315615e-05, + "loss": 1.9363, + "step": 12570 + }, + { + "epoch": 0.44724913341036354, + "grad_norm": 1.593467354774475, + "learning_rate": 1.9468053950712086e-05, + "loss": 1.8974, + "step": 12580 + }, + { + "epoch": 0.4476046573637899, + "grad_norm": 5.049159049987793, + "learning_rate": 1.9466792029874845e-05, + "loss": 1.9817, + "step": 12590 + }, + { + "epoch": 0.44796018131721627, + "grad_norm": 1.66642165184021, + "learning_rate": 1.9465528654997698e-05, + "loss": 1.9273, + "step": 12600 + }, + { + "epoch": 0.44831570527064263, + "grad_norm": 1.6853978633880615, + "learning_rate": 1.9464263826274702e-05, + "loss": 1.9217, + "step": 12610 + }, + { + "epoch": 0.448671229224069, + "grad_norm": 1.632203459739685, + "learning_rate": 1.946299754390012e-05, + "loss": 1.8761, + "step": 12620 + }, + { + "epoch": 0.44902675317749535, + "grad_norm": 1.5917268991470337, + "learning_rate": 1.9461729808068456e-05, + "loss": 1.9205, + "step": 12630 + }, + { + "epoch": 0.4493822771309217, + "grad_norm": 1.6298881769180298, + "learning_rate": 1.9460460618974414e-05, + "loss": 1.9213, + "step": 12640 + }, + { + "epoch": 0.4497378010843481, + "grad_norm": 1.6424980163574219, + "learning_rate": 1.945918997681294e-05, + "loss": 1.9061, + "step": 12650 + }, + { + "epoch": 0.45009332503777444, + "grad_norm": 1.6484566926956177, + "learning_rate": 1.9457917881779193e-05, + "loss": 1.8696, + "step": 12660 + }, + { + "epoch": 0.4504488489912008, + "grad_norm": 1.640828013420105, + "learning_rate": 1.945664433406856e-05, + "loss": 1.9214, + "step": 12670 + }, + { + "epoch": 0.45080437294462716, + "grad_norm": 1.7374014854431152, + "learning_rate": 1.9455369333876656e-05, + "loss": 1.8919, + "step": 12680 + }, + { + "epoch": 0.4511598968980535, + "grad_norm": 1.6202126741409302, + "learning_rate": 1.9454092881399305e-05, + "loss": 1.9765, + "step": 12690 + }, + { + "epoch": 0.4515154208514799, + "grad_norm": 1.59443199634552, + "learning_rate": 1.9452814976832567e-05, + "loss": 1.9083, + "step": 12700 + }, + { + "epoch": 0.45187094480490625, + "grad_norm": 1.7748507261276245, + "learning_rate": 1.9451535620372715e-05, + "loss": 1.8549, + "step": 12710 + }, + { + "epoch": 0.4522264687583326, + "grad_norm": 1.6348975896835327, + "learning_rate": 1.9450254812216254e-05, + "loss": 1.9299, + "step": 12720 + }, + { + "epoch": 0.452581992711759, + "grad_norm": 1.525699496269226, + "learning_rate": 1.9448972552559907e-05, + "loss": 1.9804, + "step": 12730 + }, + { + "epoch": 0.45293751666518534, + "grad_norm": 1.582080602645874, + "learning_rate": 1.9447688841600624e-05, + "loss": 1.8692, + "step": 12740 + }, + { + "epoch": 0.4532930406186117, + "grad_norm": 1.6867910623550415, + "learning_rate": 1.944640367953557e-05, + "loss": 1.9214, + "step": 12750 + }, + { + "epoch": 0.45364856457203806, + "grad_norm": 1.6968588829040527, + "learning_rate": 1.944511706656214e-05, + "loss": 1.8956, + "step": 12760 + }, + { + "epoch": 0.4540040885254644, + "grad_norm": 1.5950309038162231, + "learning_rate": 1.944382900287795e-05, + "loss": 1.8831, + "step": 12770 + }, + { + "epoch": 0.4543596124788908, + "grad_norm": 1.6183698177337646, + "learning_rate": 1.944253948868084e-05, + "loss": 1.9049, + "step": 12780 + }, + { + "epoch": 0.45471513643231715, + "grad_norm": 1.6075899600982666, + "learning_rate": 1.944124852416887e-05, + "loss": 1.904, + "step": 12790 + }, + { + "epoch": 0.4550706603857435, + "grad_norm": 1.6205227375030518, + "learning_rate": 1.9439956109540327e-05, + "loss": 1.9254, + "step": 12800 + }, + { + "epoch": 0.45542618433916987, + "grad_norm": 1.6769038438796997, + "learning_rate": 1.9438662244993706e-05, + "loss": 1.9147, + "step": 12810 + }, + { + "epoch": 0.45578170829259623, + "grad_norm": 1.553166151046753, + "learning_rate": 1.9437366930727753e-05, + "loss": 1.8794, + "step": 12820 + }, + { + "epoch": 0.4561372322460226, + "grad_norm": 1.585443377494812, + "learning_rate": 1.9436070166941408e-05, + "loss": 1.8902, + "step": 12830 + }, + { + "epoch": 0.45649275619944896, + "grad_norm": 1.6273727416992188, + "learning_rate": 1.943477195383385e-05, + "loss": 1.9012, + "step": 12840 + }, + { + "epoch": 0.4568482801528753, + "grad_norm": 1.5927307605743408, + "learning_rate": 1.9433472291604478e-05, + "loss": 1.9499, + "step": 12850 + }, + { + "epoch": 0.4572038041063017, + "grad_norm": 1.636775016784668, + "learning_rate": 1.943217118045291e-05, + "loss": 1.9058, + "step": 12860 + }, + { + "epoch": 0.45755932805972804, + "grad_norm": 1.5159486532211304, + "learning_rate": 1.9430868620578987e-05, + "loss": 1.8783, + "step": 12870 + }, + { + "epoch": 0.4579148520131544, + "grad_norm": 1.6984831094741821, + "learning_rate": 1.9429564612182776e-05, + "loss": 1.9713, + "step": 12880 + }, + { + "epoch": 0.45827037596658077, + "grad_norm": 1.663204550743103, + "learning_rate": 1.9428259155464566e-05, + "loss": 1.9375, + "step": 12890 + }, + { + "epoch": 0.45862589992000713, + "grad_norm": 1.7423876523971558, + "learning_rate": 1.9426952250624866e-05, + "loss": 1.9169, + "step": 12900 + }, + { + "epoch": 0.4589814238734335, + "grad_norm": 1.6697475910186768, + "learning_rate": 1.9425643897864404e-05, + "loss": 1.8952, + "step": 12910 + }, + { + "epoch": 0.45933694782685985, + "grad_norm": 1.5229991674423218, + "learning_rate": 1.9424334097384143e-05, + "loss": 1.8981, + "step": 12920 + }, + { + "epoch": 0.4596924717802862, + "grad_norm": 1.6769003868103027, + "learning_rate": 1.9423022849385256e-05, + "loss": 1.9478, + "step": 12930 + }, + { + "epoch": 0.4600479957337126, + "grad_norm": 1.5650204420089722, + "learning_rate": 1.942171015406914e-05, + "loss": 1.8654, + "step": 12940 + }, + { + "epoch": 0.46040351968713894, + "grad_norm": 1.6443099975585938, + "learning_rate": 1.942039601163742e-05, + "loss": 1.8507, + "step": 12950 + }, + { + "epoch": 0.4607590436405653, + "grad_norm": 1.5867111682891846, + "learning_rate": 1.9419080422291935e-05, + "loss": 1.923, + "step": 12960 + }, + { + "epoch": 0.46111456759399166, + "grad_norm": 1.7638205289840698, + "learning_rate": 1.941776338623476e-05, + "loss": 1.924, + "step": 12970 + }, + { + "epoch": 0.461470091547418, + "grad_norm": 1.6498541831970215, + "learning_rate": 1.9416444903668175e-05, + "loss": 1.9073, + "step": 12980 + }, + { + "epoch": 0.4618256155008444, + "grad_norm": 1.7238328456878662, + "learning_rate": 1.9415124974794696e-05, + "loss": 1.9016, + "step": 12990 + }, + { + "epoch": 0.46218113945427075, + "grad_norm": 1.6477906703948975, + "learning_rate": 1.9413803599817056e-05, + "loss": 1.9108, + "step": 13000 + }, + { + "epoch": 0.4625366634076971, + "grad_norm": 1.6353874206542969, + "learning_rate": 1.9412480778938206e-05, + "loss": 1.8839, + "step": 13010 + }, + { + "epoch": 0.4628921873611235, + "grad_norm": 1.6428017616271973, + "learning_rate": 1.9411156512361327e-05, + "loss": 1.8822, + "step": 13020 + }, + { + "epoch": 0.46324771131454984, + "grad_norm": 1.6262905597686768, + "learning_rate": 1.9409830800289814e-05, + "loss": 1.9024, + "step": 13030 + }, + { + "epoch": 0.4636032352679762, + "grad_norm": 1.6137045621871948, + "learning_rate": 1.940850364292729e-05, + "loss": 1.9447, + "step": 13040 + }, + { + "epoch": 0.46395875922140256, + "grad_norm": 1.689227819442749, + "learning_rate": 1.9407175040477598e-05, + "loss": 1.9493, + "step": 13050 + }, + { + "epoch": 0.4643142831748289, + "grad_norm": 1.5779759883880615, + "learning_rate": 1.9405844993144806e-05, + "loss": 1.8913, + "step": 13060 + }, + { + "epoch": 0.4646698071282553, + "grad_norm": 1.5751711130142212, + "learning_rate": 1.94045135011332e-05, + "loss": 1.8812, + "step": 13070 + }, + { + "epoch": 0.46502533108168165, + "grad_norm": 1.5909771919250488, + "learning_rate": 1.940318056464728e-05, + "loss": 1.9119, + "step": 13080 + }, + { + "epoch": 0.465380855035108, + "grad_norm": 1.6060901880264282, + "learning_rate": 1.9401846183891785e-05, + "loss": 1.8829, + "step": 13090 + }, + { + "epoch": 0.46573637898853437, + "grad_norm": 1.659328579902649, + "learning_rate": 1.9400510359071668e-05, + "loss": 1.9048, + "step": 13100 + }, + { + "epoch": 0.46609190294196073, + "grad_norm": 1.6586227416992188, + "learning_rate": 1.9399173090392102e-05, + "loss": 1.8598, + "step": 13110 + }, + { + "epoch": 0.4664474268953871, + "grad_norm": 1.7154213190078735, + "learning_rate": 1.9397834378058484e-05, + "loss": 1.9386, + "step": 13120 + }, + { + "epoch": 0.46680295084881346, + "grad_norm": 1.599129319190979, + "learning_rate": 1.939649422227643e-05, + "loss": 1.8908, + "step": 13130 + }, + { + "epoch": 0.4671584748022398, + "grad_norm": 1.5811879634857178, + "learning_rate": 1.9395152623251778e-05, + "loss": 1.8768, + "step": 13140 + }, + { + "epoch": 0.4675139987556662, + "grad_norm": 1.5494011640548706, + "learning_rate": 1.939380958119059e-05, + "loss": 1.8714, + "step": 13150 + }, + { + "epoch": 0.46786952270909254, + "grad_norm": 1.5415995121002197, + "learning_rate": 1.9392465096299154e-05, + "loss": 1.8892, + "step": 13160 + }, + { + "epoch": 0.4682250466625189, + "grad_norm": 1.589040756225586, + "learning_rate": 1.9391119168783966e-05, + "loss": 1.9234, + "step": 13170 + }, + { + "epoch": 0.46858057061594527, + "grad_norm": 1.597524881362915, + "learning_rate": 1.938977179885176e-05, + "loss": 1.885, + "step": 13180 + }, + { + "epoch": 0.46893609456937163, + "grad_norm": 1.7451609373092651, + "learning_rate": 1.938842298670948e-05, + "loss": 1.9116, + "step": 13190 + }, + { + "epoch": 0.469291618522798, + "grad_norm": 1.6840392351150513, + "learning_rate": 1.938707273256429e-05, + "loss": 1.9634, + "step": 13200 + }, + { + "epoch": 0.46964714247622436, + "grad_norm": 1.607974648475647, + "learning_rate": 1.938572103662359e-05, + "loss": 1.9127, + "step": 13210 + }, + { + "epoch": 0.4700026664296507, + "grad_norm": 1.5624415874481201, + "learning_rate": 1.938436789909499e-05, + "loss": 1.9265, + "step": 13220 + }, + { + "epoch": 0.4703581903830771, + "grad_norm": 1.5929343700408936, + "learning_rate": 1.9383013320186317e-05, + "loss": 1.9057, + "step": 13230 + }, + { + "epoch": 0.47071371433650344, + "grad_norm": 1.5993164777755737, + "learning_rate": 1.9381657300105633e-05, + "loss": 1.9265, + "step": 13240 + }, + { + "epoch": 0.4710692382899298, + "grad_norm": 1.6968023777008057, + "learning_rate": 1.9380299839061207e-05, + "loss": 1.9215, + "step": 13250 + }, + { + "epoch": 0.47142476224335617, + "grad_norm": 1.6598241329193115, + "learning_rate": 1.9378940937261544e-05, + "loss": 1.9089, + "step": 13260 + }, + { + "epoch": 0.47178028619678253, + "grad_norm": 1.6763114929199219, + "learning_rate": 1.937758059491536e-05, + "loss": 1.882, + "step": 13270 + }, + { + "epoch": 0.4721358101502089, + "grad_norm": 1.6208711862564087, + "learning_rate": 1.937621881223159e-05, + "loss": 1.949, + "step": 13280 + }, + { + "epoch": 0.47249133410363525, + "grad_norm": 1.656736969947815, + "learning_rate": 1.9374855589419406e-05, + "loss": 1.9028, + "step": 13290 + }, + { + "epoch": 0.4728468580570616, + "grad_norm": 1.6773045063018799, + "learning_rate": 1.937349092668818e-05, + "loss": 1.8978, + "step": 13300 + }, + { + "epoch": 0.473202382010488, + "grad_norm": 1.639001727104187, + "learning_rate": 1.937212482424752e-05, + "loss": 1.9054, + "step": 13310 + }, + { + "epoch": 0.47355790596391434, + "grad_norm": 1.6301771402359009, + "learning_rate": 1.9370757282307252e-05, + "loss": 1.8839, + "step": 13320 + }, + { + "epoch": 0.4739134299173407, + "grad_norm": 1.6172529458999634, + "learning_rate": 1.9369388301077422e-05, + "loss": 1.8938, + "step": 13330 + }, + { + "epoch": 0.47426895387076706, + "grad_norm": 1.5646872520446777, + "learning_rate": 1.9368017880768292e-05, + "loss": 1.9032, + "step": 13340 + }, + { + "epoch": 0.4746244778241934, + "grad_norm": 1.6018928289413452, + "learning_rate": 1.9366646021590356e-05, + "loss": 1.909, + "step": 13350 + }, + { + "epoch": 0.4749800017776198, + "grad_norm": 1.692284345626831, + "learning_rate": 1.9365272723754318e-05, + "loss": 1.9055, + "step": 13360 + }, + { + "epoch": 0.47533552573104615, + "grad_norm": 1.6436405181884766, + "learning_rate": 1.9363897987471113e-05, + "loss": 1.9448, + "step": 13370 + }, + { + "epoch": 0.4756910496844725, + "grad_norm": 1.5884928703308105, + "learning_rate": 1.936252181295189e-05, + "loss": 1.8558, + "step": 13380 + }, + { + "epoch": 0.4760465736378989, + "grad_norm": 1.5993527173995972, + "learning_rate": 1.9361144200408016e-05, + "loss": 1.8373, + "step": 13390 + }, + { + "epoch": 0.47640209759132524, + "grad_norm": 1.6061522960662842, + "learning_rate": 1.9359765150051092e-05, + "loss": 1.8592, + "step": 13400 + }, + { + "epoch": 0.4767576215447516, + "grad_norm": 1.6307588815689087, + "learning_rate": 1.9358384662092923e-05, + "loss": 1.9192, + "step": 13410 + }, + { + "epoch": 0.47711314549817796, + "grad_norm": 1.5935544967651367, + "learning_rate": 1.935700273674555e-05, + "loss": 1.8745, + "step": 13420 + }, + { + "epoch": 0.4774686694516043, + "grad_norm": 1.5504134893417358, + "learning_rate": 1.9355619374221223e-05, + "loss": 1.9084, + "step": 13430 + }, + { + "epoch": 0.4778241934050307, + "grad_norm": 1.635148286819458, + "learning_rate": 1.9354234574732422e-05, + "loss": 1.915, + "step": 13440 + }, + { + "epoch": 0.47817971735845705, + "grad_norm": 1.7150083780288696, + "learning_rate": 1.9352848338491842e-05, + "loss": 1.867, + "step": 13450 + }, + { + "epoch": 0.4785352413118834, + "grad_norm": 1.595038890838623, + "learning_rate": 1.93514606657124e-05, + "loss": 1.885, + "step": 13460 + }, + { + "epoch": 0.47889076526530977, + "grad_norm": 1.6162172555923462, + "learning_rate": 1.9350071556607234e-05, + "loss": 1.8561, + "step": 13470 + }, + { + "epoch": 0.47924628921873613, + "grad_norm": 1.6387050151824951, + "learning_rate": 1.93486810113897e-05, + "loss": 1.9129, + "step": 13480 + }, + { + "epoch": 0.4796018131721625, + "grad_norm": 1.7246519327163696, + "learning_rate": 1.9347289030273385e-05, + "loss": 1.9086, + "step": 13490 + }, + { + "epoch": 0.47995733712558886, + "grad_norm": 1.7230844497680664, + "learning_rate": 1.934589561347208e-05, + "loss": 1.9352, + "step": 13500 + }, + { + "epoch": 0.4803128610790152, + "grad_norm": 1.6448251008987427, + "learning_rate": 1.9344500761199806e-05, + "loss": 1.9607, + "step": 13510 + }, + { + "epoch": 0.4806683850324416, + "grad_norm": 1.631333589553833, + "learning_rate": 1.9343104473670808e-05, + "loss": 1.851, + "step": 13520 + }, + { + "epoch": 0.48102390898586794, + "grad_norm": 1.6933995485305786, + "learning_rate": 1.9341706751099542e-05, + "loss": 1.9284, + "step": 13530 + }, + { + "epoch": 0.4813794329392943, + "grad_norm": 1.556166172027588, + "learning_rate": 1.9340307593700695e-05, + "loss": 1.8744, + "step": 13540 + }, + { + "epoch": 0.48173495689272067, + "grad_norm": 1.5917972326278687, + "learning_rate": 1.933890700168916e-05, + "loss": 1.9204, + "step": 13550 + }, + { + "epoch": 0.48209048084614703, + "grad_norm": 1.685929298400879, + "learning_rate": 1.933750497528007e-05, + "loss": 1.9084, + "step": 13560 + }, + { + "epoch": 0.4824460047995734, + "grad_norm": 1.6132503747940063, + "learning_rate": 1.9336101514688764e-05, + "loss": 1.913, + "step": 13570 + }, + { + "epoch": 0.48280152875299975, + "grad_norm": 1.6278367042541504, + "learning_rate": 1.93346966201308e-05, + "loss": 1.9202, + "step": 13580 + }, + { + "epoch": 0.4831570527064261, + "grad_norm": 1.604725956916809, + "learning_rate": 1.9333290291821966e-05, + "loss": 1.943, + "step": 13590 + }, + { + "epoch": 0.4835125766598525, + "grad_norm": 1.6484644412994385, + "learning_rate": 1.933188252997826e-05, + "loss": 1.8807, + "step": 13600 + }, + { + "epoch": 0.48386810061327884, + "grad_norm": 1.629534363746643, + "learning_rate": 1.9330473334815912e-05, + "loss": 1.9084, + "step": 13610 + }, + { + "epoch": 0.4842236245667052, + "grad_norm": 1.6911847591400146, + "learning_rate": 1.932906270655136e-05, + "loss": 1.9041, + "step": 13620 + }, + { + "epoch": 0.48457914852013156, + "grad_norm": 1.6820714473724365, + "learning_rate": 1.9327650645401272e-05, + "loss": 1.9035, + "step": 13630 + }, + { + "epoch": 0.4849346724735579, + "grad_norm": 1.5599110126495361, + "learning_rate": 1.932623715158253e-05, + "loss": 1.8682, + "step": 13640 + }, + { + "epoch": 0.4852901964269843, + "grad_norm": 1.7825508117675781, + "learning_rate": 1.9324822225312236e-05, + "loss": 1.8875, + "step": 13650 + }, + { + "epoch": 0.48564572038041065, + "grad_norm": 1.5723313093185425, + "learning_rate": 1.9323405866807716e-05, + "loss": 1.8622, + "step": 13660 + }, + { + "epoch": 0.486001244333837, + "grad_norm": 1.7442368268966675, + "learning_rate": 1.9321988076286514e-05, + "loss": 1.8773, + "step": 13670 + }, + { + "epoch": 0.4863567682872634, + "grad_norm": 1.6873736381530762, + "learning_rate": 1.932056885396639e-05, + "loss": 1.9515, + "step": 13680 + }, + { + "epoch": 0.48671229224068974, + "grad_norm": 1.6768368482589722, + "learning_rate": 1.931914820006533e-05, + "loss": 1.9093, + "step": 13690 + }, + { + "epoch": 0.4870678161941161, + "grad_norm": 1.6003371477127075, + "learning_rate": 1.9317726114801544e-05, + "loss": 1.915, + "step": 13700 + }, + { + "epoch": 0.48742334014754246, + "grad_norm": 1.6443897485733032, + "learning_rate": 1.931630259839344e-05, + "loss": 1.9073, + "step": 13710 + }, + { + "epoch": 0.4877788641009688, + "grad_norm": 1.5972694158554077, + "learning_rate": 1.931487765105968e-05, + "loss": 1.8917, + "step": 13720 + }, + { + "epoch": 0.4881343880543952, + "grad_norm": 1.5997296571731567, + "learning_rate": 1.9313451273019112e-05, + "loss": 1.9197, + "step": 13730 + }, + { + "epoch": 0.48848991200782155, + "grad_norm": 1.5601133108139038, + "learning_rate": 1.9312023464490825e-05, + "loss": 1.9408, + "step": 13740 + }, + { + "epoch": 0.4888454359612479, + "grad_norm": 1.601619839668274, + "learning_rate": 1.9310594225694122e-05, + "loss": 1.9261, + "step": 13750 + }, + { + "epoch": 0.48920095991467427, + "grad_norm": 1.6181591749191284, + "learning_rate": 1.9309163556848523e-05, + "loss": 1.9181, + "step": 13760 + }, + { + "epoch": 0.48955648386810063, + "grad_norm": 1.5995105504989624, + "learning_rate": 1.930773145817377e-05, + "loss": 1.8906, + "step": 13770 + }, + { + "epoch": 0.489912007821527, + "grad_norm": 1.6839659214019775, + "learning_rate": 1.930629792988983e-05, + "loss": 1.9285, + "step": 13780 + }, + { + "epoch": 0.49026753177495336, + "grad_norm": 1.7488518953323364, + "learning_rate": 1.930486297221687e-05, + "loss": 1.9038, + "step": 13790 + }, + { + "epoch": 0.4906230557283797, + "grad_norm": 1.7097923755645752, + "learning_rate": 1.9303426585375305e-05, + "loss": 1.8895, + "step": 13800 + }, + { + "epoch": 0.4909785796818061, + "grad_norm": 1.5708833932876587, + "learning_rate": 1.930198876958575e-05, + "loss": 1.8745, + "step": 13810 + }, + { + "epoch": 0.49133410363523244, + "grad_norm": 1.5927371978759766, + "learning_rate": 1.9300549525069043e-05, + "loss": 1.9429, + "step": 13820 + }, + { + "epoch": 0.4916896275886588, + "grad_norm": 1.603779673576355, + "learning_rate": 1.929910885204624e-05, + "loss": 1.8854, + "step": 13830 + }, + { + "epoch": 0.49204515154208517, + "grad_norm": 1.6558383703231812, + "learning_rate": 1.9297666750738627e-05, + "loss": 1.9246, + "step": 13840 + }, + { + "epoch": 0.49240067549551153, + "grad_norm": 1.598236083984375, + "learning_rate": 1.9296223221367696e-05, + "loss": 1.9234, + "step": 13850 + }, + { + "epoch": 0.4927561994489379, + "grad_norm": 1.67493736743927, + "learning_rate": 1.929477826415517e-05, + "loss": 1.8706, + "step": 13860 + }, + { + "epoch": 0.49311172340236425, + "grad_norm": 1.7419062852859497, + "learning_rate": 1.929333187932298e-05, + "loss": 1.8659, + "step": 13870 + }, + { + "epoch": 0.4934672473557906, + "grad_norm": 1.6347029209136963, + "learning_rate": 1.929188406709328e-05, + "loss": 1.9214, + "step": 13880 + }, + { + "epoch": 0.493822771309217, + "grad_norm": 1.6613197326660156, + "learning_rate": 1.929043482768845e-05, + "loss": 1.8609, + "step": 13890 + }, + { + "epoch": 0.49417829526264334, + "grad_norm": 1.6763050556182861, + "learning_rate": 1.928898416133108e-05, + "loss": 1.8925, + "step": 13900 + }, + { + "epoch": 0.4945338192160697, + "grad_norm": 1.6605231761932373, + "learning_rate": 1.9287532068243986e-05, + "loss": 1.9065, + "step": 13910 + }, + { + "epoch": 0.49488934316949607, + "grad_norm": 1.751645565032959, + "learning_rate": 1.92860785486502e-05, + "loss": 1.9027, + "step": 13920 + }, + { + "epoch": 0.4952448671229224, + "grad_norm": 1.6546722650527954, + "learning_rate": 1.9284623602772973e-05, + "loss": 1.8895, + "step": 13930 + }, + { + "epoch": 0.4956003910763488, + "grad_norm": 1.5988556146621704, + "learning_rate": 1.928316723083578e-05, + "loss": 1.8798, + "step": 13940 + }, + { + "epoch": 0.49595591502977515, + "grad_norm": 1.657577633857727, + "learning_rate": 1.9281709433062298e-05, + "loss": 1.9087, + "step": 13950 + }, + { + "epoch": 0.4963114389832015, + "grad_norm": 1.639682412147522, + "learning_rate": 1.928025020967645e-05, + "loss": 1.8825, + "step": 13960 + }, + { + "epoch": 0.4966669629366279, + "grad_norm": 1.5884422063827515, + "learning_rate": 1.9278789560902354e-05, + "loss": 1.8911, + "step": 13970 + }, + { + "epoch": 0.49702248689005424, + "grad_norm": 1.6883779764175415, + "learning_rate": 1.9277327486964364e-05, + "loss": 1.9463, + "step": 13980 + }, + { + "epoch": 0.4973780108434806, + "grad_norm": 1.6581050157546997, + "learning_rate": 1.927586398808704e-05, + "loss": 1.9201, + "step": 13990 + }, + { + "epoch": 0.49773353479690696, + "grad_norm": 1.616876482963562, + "learning_rate": 1.9274399064495162e-05, + "loss": 1.8996, + "step": 14000 + }, + { + "epoch": 0.4980890587503333, + "grad_norm": 1.7162045240402222, + "learning_rate": 1.9272932716413742e-05, + "loss": 1.9051, + "step": 14010 + }, + { + "epoch": 0.4984445827037597, + "grad_norm": 1.655335545539856, + "learning_rate": 1.9271464944068e-05, + "loss": 1.9265, + "step": 14020 + }, + { + "epoch": 0.49880010665718605, + "grad_norm": 1.575579285621643, + "learning_rate": 1.9269995747683375e-05, + "loss": 1.8908, + "step": 14030 + }, + { + "epoch": 0.4991556306106124, + "grad_norm": 1.6541529893875122, + "learning_rate": 1.9268525127485528e-05, + "loss": 1.8684, + "step": 14040 + }, + { + "epoch": 0.4995111545640388, + "grad_norm": 1.6520228385925293, + "learning_rate": 1.9267053083700332e-05, + "loss": 1.9144, + "step": 14050 + }, + { + "epoch": 0.49986667851746514, + "grad_norm": 1.5795373916625977, + "learning_rate": 1.9265579616553886e-05, + "loss": 1.8651, + "step": 14060 + }, + { + "epoch": 0.5002222024708914, + "grad_norm": 1.6121716499328613, + "learning_rate": 1.926410472627251e-05, + "loss": 1.8891, + "step": 14070 + }, + { + "epoch": 0.5005777264243179, + "grad_norm": 1.676979899406433, + "learning_rate": 1.9262628413082733e-05, + "loss": 1.8769, + "step": 14080 + }, + { + "epoch": 0.5009332503777442, + "grad_norm": 1.6514569520950317, + "learning_rate": 1.9261150677211313e-05, + "loss": 1.892, + "step": 14090 + }, + { + "epoch": 0.5012887743311706, + "grad_norm": 1.5872944593429565, + "learning_rate": 1.925967151888521e-05, + "loss": 1.9033, + "step": 14100 + }, + { + "epoch": 0.5016442982845969, + "grad_norm": 1.5890864133834839, + "learning_rate": 1.9258190938331624e-05, + "loss": 1.9534, + "step": 14110 + }, + { + "epoch": 0.5019998222380233, + "grad_norm": 1.624387502670288, + "learning_rate": 1.925670893577796e-05, + "loss": 1.894, + "step": 14120 + }, + { + "epoch": 0.5023553461914496, + "grad_norm": 1.62134850025177, + "learning_rate": 1.9255225511451843e-05, + "loss": 1.8916, + "step": 14130 + }, + { + "epoch": 0.502710870144876, + "grad_norm": 1.5931661128997803, + "learning_rate": 1.9253740665581117e-05, + "loss": 1.8384, + "step": 14140 + }, + { + "epoch": 0.5030663940983023, + "grad_norm": 1.6112724542617798, + "learning_rate": 1.925225439839385e-05, + "loss": 1.8913, + "step": 14150 + }, + { + "epoch": 0.5034219180517288, + "grad_norm": 1.6279163360595703, + "learning_rate": 1.9250766710118314e-05, + "loss": 1.9066, + "step": 14160 + }, + { + "epoch": 0.5037774420051551, + "grad_norm": 1.6252950429916382, + "learning_rate": 1.9249277600983018e-05, + "loss": 1.8862, + "step": 14170 + }, + { + "epoch": 0.5041329659585815, + "grad_norm": 1.787638783454895, + "learning_rate": 1.924778707121667e-05, + "loss": 1.8761, + "step": 14180 + }, + { + "epoch": 0.5044884899120078, + "grad_norm": 1.5983929634094238, + "learning_rate": 1.9246295121048217e-05, + "loss": 1.9112, + "step": 14190 + }, + { + "epoch": 0.5048440138654342, + "grad_norm": 1.659288763999939, + "learning_rate": 1.9244801750706807e-05, + "loss": 1.8965, + "step": 14200 + }, + { + "epoch": 0.5051995378188605, + "grad_norm": 1.6085327863693237, + "learning_rate": 1.924330696042181e-05, + "loss": 1.891, + "step": 14210 + }, + { + "epoch": 0.5055550617722869, + "grad_norm": 1.627176284790039, + "learning_rate": 1.9241810750422826e-05, + "loss": 1.8777, + "step": 14220 + }, + { + "epoch": 0.5059105857257132, + "grad_norm": 1.5823941230773926, + "learning_rate": 1.9240313120939654e-05, + "loss": 1.9021, + "step": 14230 + }, + { + "epoch": 0.5062661096791397, + "grad_norm": 1.8027126789093018, + "learning_rate": 1.9238814072202326e-05, + "loss": 1.8911, + "step": 14240 + }, + { + "epoch": 0.506621633632566, + "grad_norm": 1.660543441772461, + "learning_rate": 1.9237313604441083e-05, + "loss": 1.89, + "step": 14250 + }, + { + "epoch": 0.5069771575859924, + "grad_norm": 1.633901834487915, + "learning_rate": 1.923581171788639e-05, + "loss": 1.8983, + "step": 14260 + }, + { + "epoch": 0.5073326815394187, + "grad_norm": 1.521661400794983, + "learning_rate": 1.9234308412768925e-05, + "loss": 1.8813, + "step": 14270 + }, + { + "epoch": 0.5076882054928451, + "grad_norm": 1.5929253101348877, + "learning_rate": 1.9232803689319585e-05, + "loss": 1.9159, + "step": 14280 + }, + { + "epoch": 0.5080437294462714, + "grad_norm": 1.6317856311798096, + "learning_rate": 1.9231297547769494e-05, + "loss": 1.9374, + "step": 14290 + }, + { + "epoch": 0.5083992533996978, + "grad_norm": 1.6200380325317383, + "learning_rate": 1.9229789988349973e-05, + "loss": 1.9038, + "step": 14300 + }, + { + "epoch": 0.5087547773531241, + "grad_norm": 1.5889546871185303, + "learning_rate": 1.9228281011292587e-05, + "loss": 1.8463, + "step": 14310 + }, + { + "epoch": 0.5091103013065506, + "grad_norm": 1.5688313245773315, + "learning_rate": 1.92267706168291e-05, + "loss": 1.8565, + "step": 14320 + }, + { + "epoch": 0.5094658252599769, + "grad_norm": 1.6941825151443481, + "learning_rate": 1.9225258805191494e-05, + "loss": 1.8345, + "step": 14330 + }, + { + "epoch": 0.5098213492134033, + "grad_norm": 1.541550874710083, + "learning_rate": 1.922374557661198e-05, + "loss": 1.9111, + "step": 14340 + }, + { + "epoch": 0.5101768731668296, + "grad_norm": 1.5329803228378296, + "learning_rate": 1.9222230931322977e-05, + "loss": 1.9328, + "step": 14350 + }, + { + "epoch": 0.510532397120256, + "grad_norm": 1.5915532112121582, + "learning_rate": 1.922071486955713e-05, + "loss": 1.8905, + "step": 14360 + }, + { + "epoch": 0.5108879210736823, + "grad_norm": 1.7071433067321777, + "learning_rate": 1.9219197391547294e-05, + "loss": 1.8765, + "step": 14370 + }, + { + "epoch": 0.5112434450271087, + "grad_norm": 1.560180425643921, + "learning_rate": 1.921767849752654e-05, + "loss": 1.9211, + "step": 14380 + }, + { + "epoch": 0.511598968980535, + "grad_norm": 1.679560661315918, + "learning_rate": 1.9216158187728165e-05, + "loss": 1.8989, + "step": 14390 + }, + { + "epoch": 0.5119544929339614, + "grad_norm": 1.572483777999878, + "learning_rate": 1.921463646238568e-05, + "loss": 1.9313, + "step": 14400 + }, + { + "epoch": 0.5123100168873878, + "grad_norm": 1.6964569091796875, + "learning_rate": 1.9213113321732807e-05, + "loss": 1.938, + "step": 14410 + }, + { + "epoch": 0.5126655408408142, + "grad_norm": 1.680965781211853, + "learning_rate": 1.92115887660035e-05, + "loss": 1.8979, + "step": 14420 + }, + { + "epoch": 0.5130210647942405, + "grad_norm": 1.6551510095596313, + "learning_rate": 1.921006279543191e-05, + "loss": 1.8742, + "step": 14430 + }, + { + "epoch": 0.5133765887476669, + "grad_norm": 1.6164863109588623, + "learning_rate": 1.9208535410252425e-05, + "loss": 1.9107, + "step": 14440 + }, + { + "epoch": 0.5137321127010932, + "grad_norm": 1.6352697610855103, + "learning_rate": 1.920700661069964e-05, + "loss": 1.9052, + "step": 14450 + }, + { + "epoch": 0.5140876366545196, + "grad_norm": 1.5313262939453125, + "learning_rate": 1.9205476397008366e-05, + "loss": 1.8627, + "step": 14460 + }, + { + "epoch": 0.5144431606079459, + "grad_norm": 1.6885111331939697, + "learning_rate": 1.9203944769413638e-05, + "loss": 1.8998, + "step": 14470 + }, + { + "epoch": 0.5147986845613723, + "grad_norm": 1.6766098737716675, + "learning_rate": 1.9202411728150702e-05, + "loss": 1.8591, + "step": 14480 + }, + { + "epoch": 0.5151542085147987, + "grad_norm": 1.556369662284851, + "learning_rate": 1.9200877273455024e-05, + "loss": 1.872, + "step": 14490 + }, + { + "epoch": 0.5155097324682251, + "grad_norm": 1.6718697547912598, + "learning_rate": 1.9199341405562285e-05, + "loss": 1.8891, + "step": 14500 + }, + { + "epoch": 0.5158652564216514, + "grad_norm": 1.5532348155975342, + "learning_rate": 1.919780412470839e-05, + "loss": 1.8722, + "step": 14510 + }, + { + "epoch": 0.5162207803750778, + "grad_norm": 1.6569695472717285, + "learning_rate": 1.919626543112945e-05, + "loss": 1.896, + "step": 14520 + }, + { + "epoch": 0.5165763043285041, + "grad_norm": 1.6707987785339355, + "learning_rate": 1.91947253250618e-05, + "loss": 1.8953, + "step": 14530 + }, + { + "epoch": 0.5169318282819305, + "grad_norm": 1.5621235370635986, + "learning_rate": 1.919318380674199e-05, + "loss": 1.8604, + "step": 14540 + }, + { + "epoch": 0.5172873522353568, + "grad_norm": 1.6478444337844849, + "learning_rate": 1.9191640876406793e-05, + "loss": 1.8787, + "step": 14550 + }, + { + "epoch": 0.5176428761887832, + "grad_norm": 1.6549452543258667, + "learning_rate": 1.9190096534293188e-05, + "loss": 1.8726, + "step": 14560 + }, + { + "epoch": 0.5179984001422095, + "grad_norm": 1.6203386783599854, + "learning_rate": 1.9188550780638376e-05, + "loss": 1.9032, + "step": 14570 + }, + { + "epoch": 0.518353924095636, + "grad_norm": 1.7803328037261963, + "learning_rate": 1.9187003615679778e-05, + "loss": 1.8958, + "step": 14580 + }, + { + "epoch": 0.5187094480490623, + "grad_norm": 1.5585798025131226, + "learning_rate": 1.9185455039655028e-05, + "loss": 1.8964, + "step": 14590 + }, + { + "epoch": 0.5190649720024887, + "grad_norm": 1.5389212369918823, + "learning_rate": 1.9183905052801975e-05, + "loss": 1.8958, + "step": 14600 + }, + { + "epoch": 0.519420495955915, + "grad_norm": 1.609155535697937, + "learning_rate": 1.918235365535869e-05, + "loss": 1.8648, + "step": 14610 + }, + { + "epoch": 0.5197760199093414, + "grad_norm": 1.6422481536865234, + "learning_rate": 1.9180800847563462e-05, + "loss": 1.8656, + "step": 14620 + }, + { + "epoch": 0.5201315438627677, + "grad_norm": 1.6211472749710083, + "learning_rate": 1.9179246629654782e-05, + "loss": 1.8754, + "step": 14630 + }, + { + "epoch": 0.5204870678161941, + "grad_norm": 1.5963804721832275, + "learning_rate": 1.9177691001871377e-05, + "loss": 1.8616, + "step": 14640 + }, + { + "epoch": 0.5208425917696204, + "grad_norm": 1.593908667564392, + "learning_rate": 1.917613396445218e-05, + "loss": 1.9166, + "step": 14650 + }, + { + "epoch": 0.5211981157230469, + "grad_norm": 1.6430562734603882, + "learning_rate": 1.9174575517636338e-05, + "loss": 1.9046, + "step": 14660 + }, + { + "epoch": 0.5215536396764732, + "grad_norm": 1.6117980480194092, + "learning_rate": 1.917301566166322e-05, + "loss": 1.8737, + "step": 14670 + }, + { + "epoch": 0.5219091636298996, + "grad_norm": 1.7191271781921387, + "learning_rate": 1.9171454396772416e-05, + "loss": 1.8638, + "step": 14680 + }, + { + "epoch": 0.5222646875833259, + "grad_norm": 1.818167805671692, + "learning_rate": 1.916989172320372e-05, + "loss": 1.8808, + "step": 14690 + }, + { + "epoch": 0.5226202115367523, + "grad_norm": 1.7359200716018677, + "learning_rate": 1.916832764119715e-05, + "loss": 1.9067, + "step": 14700 + }, + { + "epoch": 0.5229757354901786, + "grad_norm": 1.5372364521026611, + "learning_rate": 1.9166762150992944e-05, + "loss": 1.9367, + "step": 14710 + }, + { + "epoch": 0.523331259443605, + "grad_norm": 1.6493061780929565, + "learning_rate": 1.9165195252831542e-05, + "loss": 1.8905, + "step": 14720 + }, + { + "epoch": 0.5236867833970313, + "grad_norm": 1.6419951915740967, + "learning_rate": 1.916362694695362e-05, + "loss": 1.9246, + "step": 14730 + }, + { + "epoch": 0.5240423073504578, + "grad_norm": 1.6568384170532227, + "learning_rate": 1.9162057233600052e-05, + "loss": 1.891, + "step": 14740 + }, + { + "epoch": 0.5243978313038841, + "grad_norm": 1.7263054847717285, + "learning_rate": 1.916048611301194e-05, + "loss": 1.8896, + "step": 14750 + }, + { + "epoch": 0.5247533552573105, + "grad_norm": 1.6149612665176392, + "learning_rate": 1.9158913585430602e-05, + "loss": 1.8639, + "step": 14760 + }, + { + "epoch": 0.5251088792107368, + "grad_norm": 1.6069010496139526, + "learning_rate": 1.915733965109756e-05, + "loss": 1.8915, + "step": 14770 + }, + { + "epoch": 0.5254644031641632, + "grad_norm": 1.7398128509521484, + "learning_rate": 1.9155764310254564e-05, + "loss": 1.9007, + "step": 14780 + }, + { + "epoch": 0.5258199271175895, + "grad_norm": 1.647915005683899, + "learning_rate": 1.915418756314358e-05, + "loss": 1.9293, + "step": 14790 + }, + { + "epoch": 0.5261754510710159, + "grad_norm": 1.6997140645980835, + "learning_rate": 1.9152609410006784e-05, + "loss": 1.8964, + "step": 14800 + }, + { + "epoch": 0.5265309750244422, + "grad_norm": 1.6560719013214111, + "learning_rate": 1.9151029851086565e-05, + "loss": 1.9442, + "step": 14810 + }, + { + "epoch": 0.5268864989778687, + "grad_norm": 1.6268523931503296, + "learning_rate": 1.9149448886625542e-05, + "loss": 1.8794, + "step": 14820 + }, + { + "epoch": 0.527242022931295, + "grad_norm": 1.5146976709365845, + "learning_rate": 1.914786651686654e-05, + "loss": 1.916, + "step": 14830 + }, + { + "epoch": 0.5275975468847214, + "grad_norm": 1.564766764640808, + "learning_rate": 1.9146282742052593e-05, + "loss": 1.9018, + "step": 14840 + }, + { + "epoch": 0.5279530708381477, + "grad_norm": 1.6078290939331055, + "learning_rate": 1.9144697562426965e-05, + "loss": 1.896, + "step": 14850 + }, + { + "epoch": 0.5283085947915741, + "grad_norm": 1.6417776346206665, + "learning_rate": 1.9143110978233133e-05, + "loss": 1.8669, + "step": 14860 + }, + { + "epoch": 0.5286641187450004, + "grad_norm": 1.5287001132965088, + "learning_rate": 1.914152298971478e-05, + "loss": 1.9229, + "step": 14870 + }, + { + "epoch": 0.5290196426984268, + "grad_norm": 1.5520888566970825, + "learning_rate": 1.9139933597115815e-05, + "loss": 1.8709, + "step": 14880 + }, + { + "epoch": 0.5293751666518531, + "grad_norm": 1.6008402109146118, + "learning_rate": 1.9138342800680358e-05, + "loss": 1.8714, + "step": 14890 + }, + { + "epoch": 0.5297306906052796, + "grad_norm": 1.589961290359497, + "learning_rate": 1.9136750600652743e-05, + "loss": 1.9262, + "step": 14900 + }, + { + "epoch": 0.5300862145587059, + "grad_norm": 1.7259788513183594, + "learning_rate": 1.9135156997277523e-05, + "loss": 1.8574, + "step": 14910 + }, + { + "epoch": 0.5304417385121323, + "grad_norm": 1.539460301399231, + "learning_rate": 1.9133561990799466e-05, + "loss": 1.8557, + "step": 14920 + }, + { + "epoch": 0.5307972624655586, + "grad_norm": 1.6305017471313477, + "learning_rate": 1.9131965581463558e-05, + "loss": 1.8761, + "step": 14930 + }, + { + "epoch": 0.531152786418985, + "grad_norm": 1.6037888526916504, + "learning_rate": 1.9130367769514988e-05, + "loss": 1.913, + "step": 14940 + }, + { + "epoch": 0.5315083103724113, + "grad_norm": 1.6008785963058472, + "learning_rate": 1.912876855519918e-05, + "loss": 1.8913, + "step": 14950 + }, + { + "epoch": 0.5318638343258377, + "grad_norm": 1.6513231992721558, + "learning_rate": 1.9127167938761762e-05, + "loss": 1.8988, + "step": 14960 + }, + { + "epoch": 0.532219358279264, + "grad_norm": 1.6702121496200562, + "learning_rate": 1.9125565920448575e-05, + "loss": 1.9275, + "step": 14970 + }, + { + "epoch": 0.5325748822326904, + "grad_norm": 1.7012349367141724, + "learning_rate": 1.912396250050568e-05, + "loss": 1.8969, + "step": 14980 + }, + { + "epoch": 0.5329304061861168, + "grad_norm": 1.5561339855194092, + "learning_rate": 1.9122357679179356e-05, + "loss": 1.8335, + "step": 14990 + }, + { + "epoch": 0.5332859301395432, + "grad_norm": 1.6674284934997559, + "learning_rate": 1.9120751456716083e-05, + "loss": 1.8655, + "step": 15000 + }, + { + "epoch": 0.5336414540929695, + "grad_norm": 1.689611792564392, + "learning_rate": 1.911914383336258e-05, + "loss": 1.8365, + "step": 15010 + }, + { + "epoch": 0.5339969780463959, + "grad_norm": 1.5600208044052124, + "learning_rate": 1.911753480936576e-05, + "loss": 1.9306, + "step": 15020 + }, + { + "epoch": 0.5343525019998222, + "grad_norm": 1.5981788635253906, + "learning_rate": 1.9115924384972758e-05, + "loss": 1.9228, + "step": 15030 + }, + { + "epoch": 0.5347080259532486, + "grad_norm": 1.5904815196990967, + "learning_rate": 1.911431256043093e-05, + "loss": 1.8945, + "step": 15040 + }, + { + "epoch": 0.5350635499066749, + "grad_norm": 1.650617003440857, + "learning_rate": 1.911269933598784e-05, + "loss": 1.8472, + "step": 15050 + }, + { + "epoch": 0.5354190738601013, + "grad_norm": 1.7228983640670776, + "learning_rate": 1.911108471189127e-05, + "loss": 1.8869, + "step": 15060 + }, + { + "epoch": 0.5357745978135277, + "grad_norm": 1.6754595041275024, + "learning_rate": 1.9109468688389216e-05, + "loss": 1.8485, + "step": 15070 + }, + { + "epoch": 0.5361301217669541, + "grad_norm": 1.5948920249938965, + "learning_rate": 1.9107851265729885e-05, + "loss": 1.9075, + "step": 15080 + }, + { + "epoch": 0.5364856457203804, + "grad_norm": 1.6747260093688965, + "learning_rate": 1.9106232444161707e-05, + "loss": 1.9437, + "step": 15090 + }, + { + "epoch": 0.5368411696738068, + "grad_norm": 1.511064052581787, + "learning_rate": 1.9104612223933322e-05, + "loss": 1.8816, + "step": 15100 + }, + { + "epoch": 0.5371966936272331, + "grad_norm": 1.492945909500122, + "learning_rate": 1.910299060529359e-05, + "loss": 1.8629, + "step": 15110 + }, + { + "epoch": 0.5375522175806595, + "grad_norm": 1.6290943622589111, + "learning_rate": 1.910136758849157e-05, + "loss": 1.8982, + "step": 15120 + }, + { + "epoch": 0.5379077415340858, + "grad_norm": 1.7283633947372437, + "learning_rate": 1.9099743173776558e-05, + "loss": 1.8189, + "step": 15130 + }, + { + "epoch": 0.5382632654875122, + "grad_norm": 1.55548095703125, + "learning_rate": 1.909811736139805e-05, + "loss": 1.8938, + "step": 15140 + }, + { + "epoch": 0.5386187894409386, + "grad_norm": 1.6615575551986694, + "learning_rate": 1.9096490151605764e-05, + "loss": 1.8888, + "step": 15150 + }, + { + "epoch": 0.538974313394365, + "grad_norm": 1.5879693031311035, + "learning_rate": 1.909486154464962e-05, + "loss": 1.9041, + "step": 15160 + }, + { + "epoch": 0.5393298373477913, + "grad_norm": 1.6335123777389526, + "learning_rate": 1.9093231540779773e-05, + "loss": 1.8635, + "step": 15170 + }, + { + "epoch": 0.5396853613012177, + "grad_norm": 1.5720808506011963, + "learning_rate": 1.9091600140246574e-05, + "loss": 1.8662, + "step": 15180 + }, + { + "epoch": 0.540040885254644, + "grad_norm": 1.6797782182693481, + "learning_rate": 1.9089967343300602e-05, + "loss": 1.9517, + "step": 15190 + }, + { + "epoch": 0.5403964092080704, + "grad_norm": 1.7217504978179932, + "learning_rate": 1.9088333150192638e-05, + "loss": 1.9007, + "step": 15200 + }, + { + "epoch": 0.5407519331614967, + "grad_norm": 1.624611258506775, + "learning_rate": 1.9086697561173683e-05, + "loss": 1.8755, + "step": 15210 + }, + { + "epoch": 0.5411074571149231, + "grad_norm": 1.6011959314346313, + "learning_rate": 1.908506057649496e-05, + "loss": 1.8762, + "step": 15220 + }, + { + "epoch": 0.5414629810683494, + "grad_norm": 1.6140680313110352, + "learning_rate": 1.9083422196407897e-05, + "loss": 1.8803, + "step": 15230 + }, + { + "epoch": 0.5418185050217759, + "grad_norm": 1.6139609813690186, + "learning_rate": 1.9081782421164136e-05, + "loss": 1.9264, + "step": 15240 + }, + { + "epoch": 0.5421740289752022, + "grad_norm": 1.6700936555862427, + "learning_rate": 1.908014125101554e-05, + "loss": 1.8885, + "step": 15250 + }, + { + "epoch": 0.5425295529286286, + "grad_norm": 1.7281888723373413, + "learning_rate": 1.907849868621418e-05, + "loss": 1.8685, + "step": 15260 + }, + { + "epoch": 0.5428850768820549, + "grad_norm": 1.6173104047775269, + "learning_rate": 1.9076854727012344e-05, + "loss": 1.8415, + "step": 15270 + }, + { + "epoch": 0.5432406008354813, + "grad_norm": 1.607061743736267, + "learning_rate": 1.9075209373662535e-05, + "loss": 1.9398, + "step": 15280 + }, + { + "epoch": 0.5435961247889076, + "grad_norm": 1.6720659732818604, + "learning_rate": 1.907356262641747e-05, + "loss": 1.888, + "step": 15290 + }, + { + "epoch": 0.543951648742334, + "grad_norm": 1.563422679901123, + "learning_rate": 1.9071914485530074e-05, + "loss": 1.8557, + "step": 15300 + }, + { + "epoch": 0.5443071726957603, + "grad_norm": 1.5465205907821655, + "learning_rate": 1.9070264951253495e-05, + "loss": 1.8867, + "step": 15310 + }, + { + "epoch": 0.5446626966491868, + "grad_norm": 1.6641415357589722, + "learning_rate": 1.906861402384109e-05, + "loss": 1.8501, + "step": 15320 + }, + { + "epoch": 0.5450182206026131, + "grad_norm": 1.7145063877105713, + "learning_rate": 1.9066961703546432e-05, + "loss": 1.893, + "step": 15330 + }, + { + "epoch": 0.5453737445560395, + "grad_norm": 1.5639148950576782, + "learning_rate": 1.906530799062331e-05, + "loss": 1.8897, + "step": 15340 + }, + { + "epoch": 0.5457292685094658, + "grad_norm": 1.6158241033554077, + "learning_rate": 1.9063652885325718e-05, + "loss": 1.885, + "step": 15350 + }, + { + "epoch": 0.5460847924628922, + "grad_norm": 1.5545011758804321, + "learning_rate": 1.906199638790787e-05, + "loss": 1.8207, + "step": 15360 + }, + { + "epoch": 0.5464403164163185, + "grad_norm": 1.7516465187072754, + "learning_rate": 1.90603384986242e-05, + "loss": 1.8562, + "step": 15370 + }, + { + "epoch": 0.5467958403697449, + "grad_norm": 1.626286506652832, + "learning_rate": 1.9058679217729345e-05, + "loss": 1.8643, + "step": 15380 + }, + { + "epoch": 0.5471513643231712, + "grad_norm": 1.6006121635437012, + "learning_rate": 1.905701854547816e-05, + "loss": 1.8397, + "step": 15390 + }, + { + "epoch": 0.5475068882765977, + "grad_norm": 1.7442126274108887, + "learning_rate": 1.9055356482125712e-05, + "loss": 1.8768, + "step": 15400 + }, + { + "epoch": 0.547862412230024, + "grad_norm": 1.5240834951400757, + "learning_rate": 1.9053693027927287e-05, + "loss": 1.881, + "step": 15410 + }, + { + "epoch": 0.5482179361834504, + "grad_norm": 1.5544195175170898, + "learning_rate": 1.905202818313838e-05, + "loss": 1.9523, + "step": 15420 + }, + { + "epoch": 0.5485734601368767, + "grad_norm": 1.69491708278656, + "learning_rate": 1.90503619480147e-05, + "loss": 1.88, + "step": 15430 + }, + { + "epoch": 0.5489289840903031, + "grad_norm": 1.677173376083374, + "learning_rate": 1.9048694322812174e-05, + "loss": 1.8832, + "step": 15440 + }, + { + "epoch": 0.5492845080437294, + "grad_norm": 1.7110683917999268, + "learning_rate": 1.9047025307786932e-05, + "loss": 1.8634, + "step": 15450 + }, + { + "epoch": 0.5496400319971558, + "grad_norm": 1.6487089395523071, + "learning_rate": 1.904535490319533e-05, + "loss": 1.8999, + "step": 15460 + }, + { + "epoch": 0.5499955559505821, + "grad_norm": 1.5610681772232056, + "learning_rate": 1.904368310929393e-05, + "loss": 1.8442, + "step": 15470 + }, + { + "epoch": 0.5503510799040086, + "grad_norm": 1.6055057048797607, + "learning_rate": 1.904200992633951e-05, + "loss": 1.9059, + "step": 15480 + }, + { + "epoch": 0.5507066038574349, + "grad_norm": 1.6054152250289917, + "learning_rate": 1.9040335354589056e-05, + "loss": 1.8852, + "step": 15490 + }, + { + "epoch": 0.5510621278108613, + "grad_norm": 1.562628149986267, + "learning_rate": 1.9038659394299775e-05, + "loss": 1.8882, + "step": 15500 + }, + { + "epoch": 0.5514176517642876, + "grad_norm": 1.6260756254196167, + "learning_rate": 1.9036982045729088e-05, + "loss": 1.8988, + "step": 15510 + }, + { + "epoch": 0.551773175717714, + "grad_norm": 1.6427818536758423, + "learning_rate": 1.9035303309134617e-05, + "loss": 1.8722, + "step": 15520 + }, + { + "epoch": 0.5521286996711403, + "grad_norm": 1.6877665519714355, + "learning_rate": 1.903362318477421e-05, + "loss": 1.8747, + "step": 15530 + }, + { + "epoch": 0.5524842236245667, + "grad_norm": 1.5642024278640747, + "learning_rate": 1.9031941672905923e-05, + "loss": 1.8376, + "step": 15540 + }, + { + "epoch": 0.552839747577993, + "grad_norm": 1.6180360317230225, + "learning_rate": 1.9030258773788028e-05, + "loss": 1.9095, + "step": 15550 + }, + { + "epoch": 0.5531952715314195, + "grad_norm": 1.6126481294631958, + "learning_rate": 1.9028574487679004e-05, + "loss": 1.9073, + "step": 15560 + }, + { + "epoch": 0.5535507954848458, + "grad_norm": 1.5349082946777344, + "learning_rate": 1.902688881483755e-05, + "loss": 1.8635, + "step": 15570 + }, + { + "epoch": 0.5539063194382722, + "grad_norm": 1.5188136100769043, + "learning_rate": 1.902520175552257e-05, + "loss": 1.8821, + "step": 15580 + }, + { + "epoch": 0.5542618433916985, + "grad_norm": 1.6134661436080933, + "learning_rate": 1.9023513309993192e-05, + "loss": 1.8955, + "step": 15590 + }, + { + "epoch": 0.5546173673451249, + "grad_norm": 1.5765665769577026, + "learning_rate": 1.9021823478508744e-05, + "loss": 1.8862, + "step": 15600 + }, + { + "epoch": 0.5549728912985512, + "grad_norm": 1.6248475313186646, + "learning_rate": 1.902013226132878e-05, + "loss": 1.8517, + "step": 15610 + }, + { + "epoch": 0.5553284152519776, + "grad_norm": 1.5957690477371216, + "learning_rate": 1.9018439658713055e-05, + "loss": 1.8592, + "step": 15620 + }, + { + "epoch": 0.5556839392054039, + "grad_norm": 1.6602823734283447, + "learning_rate": 1.9016745670921547e-05, + "loss": 1.8864, + "step": 15630 + }, + { + "epoch": 0.5560394631588303, + "grad_norm": 1.805245041847229, + "learning_rate": 1.9015050298214436e-05, + "loss": 1.8493, + "step": 15640 + }, + { + "epoch": 0.5563949871122567, + "grad_norm": 1.6458933353424072, + "learning_rate": 1.9013353540852124e-05, + "loss": 1.8952, + "step": 15650 + }, + { + "epoch": 0.5567505110656831, + "grad_norm": 1.6207075119018555, + "learning_rate": 1.9011655399095226e-05, + "loss": 1.8594, + "step": 15660 + }, + { + "epoch": 0.5571060350191094, + "grad_norm": 1.592824935913086, + "learning_rate": 1.900995587320456e-05, + "loss": 1.8967, + "step": 15670 + }, + { + "epoch": 0.5574615589725358, + "grad_norm": 1.684127926826477, + "learning_rate": 1.9008254963441163e-05, + "loss": 1.8629, + "step": 15680 + }, + { + "epoch": 0.5578170829259621, + "grad_norm": 1.7365632057189941, + "learning_rate": 1.9006552670066288e-05, + "loss": 1.8812, + "step": 15690 + }, + { + "epoch": 0.5581726068793885, + "grad_norm": 1.6320279836654663, + "learning_rate": 1.9004848993341398e-05, + "loss": 1.8456, + "step": 15700 + }, + { + "epoch": 0.5585281308328148, + "grad_norm": 1.6249363422393799, + "learning_rate": 1.900314393352816e-05, + "loss": 1.8636, + "step": 15710 + }, + { + "epoch": 0.5588836547862412, + "grad_norm": 1.6237016916275024, + "learning_rate": 1.900143749088846e-05, + "loss": 1.8946, + "step": 15720 + }, + { + "epoch": 0.5592391787396676, + "grad_norm": 1.5896066427230835, + "learning_rate": 1.8999729665684406e-05, + "loss": 1.8769, + "step": 15730 + }, + { + "epoch": 0.559594702693094, + "grad_norm": 1.6980187892913818, + "learning_rate": 1.8998020458178303e-05, + "loss": 1.8685, + "step": 15740 + }, + { + "epoch": 0.5599502266465203, + "grad_norm": 1.6312180757522583, + "learning_rate": 1.8996309868632675e-05, + "loss": 1.8814, + "step": 15750 + }, + { + "epoch": 0.5603057505999467, + "grad_norm": 1.6168968677520752, + "learning_rate": 1.899459789731026e-05, + "loss": 1.9114, + "step": 15760 + }, + { + "epoch": 0.560661274553373, + "grad_norm": 1.6933541297912598, + "learning_rate": 1.8992884544474e-05, + "loss": 1.8725, + "step": 15770 + }, + { + "epoch": 0.5610167985067994, + "grad_norm": 1.6688987016677856, + "learning_rate": 1.8991169810387067e-05, + "loss": 1.8497, + "step": 15780 + }, + { + "epoch": 0.5613723224602257, + "grad_norm": 1.6841078996658325, + "learning_rate": 1.898945369531282e-05, + "loss": 1.8917, + "step": 15790 + }, + { + "epoch": 0.5617278464136521, + "grad_norm": 1.6986109018325806, + "learning_rate": 1.8987736199514853e-05, + "loss": 1.8933, + "step": 15800 + }, + { + "epoch": 0.5620833703670784, + "grad_norm": 1.6620608568191528, + "learning_rate": 1.898601732325696e-05, + "loss": 1.8725, + "step": 15810 + }, + { + "epoch": 0.5624388943205049, + "grad_norm": 1.6265780925750732, + "learning_rate": 1.8984297066803146e-05, + "loss": 1.8615, + "step": 15820 + }, + { + "epoch": 0.5627944182739312, + "grad_norm": 1.47080397605896, + "learning_rate": 1.8982575430417636e-05, + "loss": 1.9122, + "step": 15830 + }, + { + "epoch": 0.5631499422273576, + "grad_norm": 1.6252546310424805, + "learning_rate": 1.898085241436486e-05, + "loss": 1.8958, + "step": 15840 + }, + { + "epoch": 0.5635054661807839, + "grad_norm": 1.6627707481384277, + "learning_rate": 1.8979128018909464e-05, + "loss": 1.875, + "step": 15850 + }, + { + "epoch": 0.5638609901342103, + "grad_norm": 1.7096972465515137, + "learning_rate": 1.8977402244316304e-05, + "loss": 1.8635, + "step": 15860 + }, + { + "epoch": 0.5642165140876366, + "grad_norm": 1.6762901544570923, + "learning_rate": 1.897567509085045e-05, + "loss": 1.8821, + "step": 15870 + }, + { + "epoch": 0.564572038041063, + "grad_norm": 1.6387391090393066, + "learning_rate": 1.897394655877718e-05, + "loss": 1.877, + "step": 15880 + }, + { + "epoch": 0.5649275619944893, + "grad_norm": 1.643298625946045, + "learning_rate": 1.8972216648361984e-05, + "loss": 1.851, + "step": 15890 + }, + { + "epoch": 0.5652830859479158, + "grad_norm": 1.658422589302063, + "learning_rate": 1.8970485359870567e-05, + "loss": 1.8817, + "step": 15900 + }, + { + "epoch": 0.5656386099013421, + "grad_norm": 1.6579492092132568, + "learning_rate": 1.8968752693568842e-05, + "loss": 1.8954, + "step": 15910 + }, + { + "epoch": 0.5659941338547685, + "grad_norm": 1.616687536239624, + "learning_rate": 1.896701864972294e-05, + "loss": 1.8478, + "step": 15920 + }, + { + "epoch": 0.5663496578081948, + "grad_norm": 1.60679292678833, + "learning_rate": 1.8965283228599196e-05, + "loss": 1.9111, + "step": 15930 + }, + { + "epoch": 0.5667051817616212, + "grad_norm": 1.5435000658035278, + "learning_rate": 1.8963546430464165e-05, + "loss": 1.8981, + "step": 15940 + }, + { + "epoch": 0.5670607057150475, + "grad_norm": 1.6048617362976074, + "learning_rate": 1.8961808255584596e-05, + "loss": 1.8765, + "step": 15950 + }, + { + "epoch": 0.5674162296684739, + "grad_norm": 1.6783527135849, + "learning_rate": 1.8960068704227476e-05, + "loss": 1.8805, + "step": 15960 + }, + { + "epoch": 0.5677717536219002, + "grad_norm": 1.6484862565994263, + "learning_rate": 1.8958327776659985e-05, + "loss": 1.8929, + "step": 15970 + }, + { + "epoch": 0.5681272775753267, + "grad_norm": 1.6223924160003662, + "learning_rate": 1.895658547314951e-05, + "loss": 1.8706, + "step": 15980 + }, + { + "epoch": 0.568482801528753, + "grad_norm": 1.5521831512451172, + "learning_rate": 1.895484179396367e-05, + "loss": 1.9197, + "step": 15990 + }, + { + "epoch": 0.5688383254821794, + "grad_norm": 1.5772056579589844, + "learning_rate": 1.8953096739370275e-05, + "loss": 1.8611, + "step": 16000 + }, + { + "epoch": 0.5691938494356057, + "grad_norm": 1.6748151779174805, + "learning_rate": 1.895135030963736e-05, + "loss": 1.8791, + "step": 16010 + }, + { + "epoch": 0.5695493733890321, + "grad_norm": 1.585634708404541, + "learning_rate": 1.8949602505033157e-05, + "loss": 1.9258, + "step": 16020 + }, + { + "epoch": 0.5699048973424584, + "grad_norm": 1.6667970418930054, + "learning_rate": 1.8947853325826128e-05, + "loss": 1.848, + "step": 16030 + }, + { + "epoch": 0.5702604212958848, + "grad_norm": 1.658412218093872, + "learning_rate": 1.8946102772284933e-05, + "loss": 1.8529, + "step": 16040 + }, + { + "epoch": 0.5706159452493111, + "grad_norm": 1.6104261875152588, + "learning_rate": 1.894435084467844e-05, + "loss": 1.8531, + "step": 16050 + }, + { + "epoch": 0.5709714692027376, + "grad_norm": 1.6428375244140625, + "learning_rate": 1.8942597543275744e-05, + "loss": 1.8516, + "step": 16060 + }, + { + "epoch": 0.5713269931561639, + "grad_norm": 1.7161728143692017, + "learning_rate": 1.8940842868346134e-05, + "loss": 1.8683, + "step": 16070 + }, + { + "epoch": 0.5716825171095903, + "grad_norm": 1.6344237327575684, + "learning_rate": 1.8939086820159117e-05, + "loss": 1.8701, + "step": 16080 + }, + { + "epoch": 0.5720380410630166, + "grad_norm": 1.6507536172866821, + "learning_rate": 1.8937329398984416e-05, + "loss": 1.8342, + "step": 16090 + }, + { + "epoch": 0.572393565016443, + "grad_norm": 1.627859115600586, + "learning_rate": 1.8935570605091955e-05, + "loss": 1.8757, + "step": 16100 + }, + { + "epoch": 0.5727490889698693, + "grad_norm": 1.6992720365524292, + "learning_rate": 1.8933810438751875e-05, + "loss": 1.889, + "step": 16110 + }, + { + "epoch": 0.5731046129232957, + "grad_norm": 1.5473157167434692, + "learning_rate": 1.8932048900234527e-05, + "loss": 1.8439, + "step": 16120 + }, + { + "epoch": 0.573460136876722, + "grad_norm": 1.617222547531128, + "learning_rate": 1.8930285989810474e-05, + "loss": 1.8851, + "step": 16130 + }, + { + "epoch": 0.5738156608301485, + "grad_norm": 1.5728873014450073, + "learning_rate": 1.8928521707750486e-05, + "loss": 1.874, + "step": 16140 + }, + { + "epoch": 0.5741711847835748, + "grad_norm": 1.623225212097168, + "learning_rate": 1.8926756054325545e-05, + "loss": 1.8365, + "step": 16150 + }, + { + "epoch": 0.5745267087370012, + "grad_norm": 1.7246637344360352, + "learning_rate": 1.8924989029806845e-05, + "loss": 1.876, + "step": 16160 + }, + { + "epoch": 0.5748822326904275, + "grad_norm": 1.6464879512786865, + "learning_rate": 1.8923220634465787e-05, + "loss": 1.8393, + "step": 16170 + }, + { + "epoch": 0.5752377566438539, + "grad_norm": 1.6673486232757568, + "learning_rate": 1.8921450868573993e-05, + "loss": 1.8756, + "step": 16180 + }, + { + "epoch": 0.5755932805972802, + "grad_norm": 1.6323940753936768, + "learning_rate": 1.8919679732403284e-05, + "loss": 1.8998, + "step": 16190 + }, + { + "epoch": 0.5759488045507066, + "grad_norm": 1.5756714344024658, + "learning_rate": 1.8917907226225695e-05, + "loss": 1.9193, + "step": 16200 + }, + { + "epoch": 0.5763043285041329, + "grad_norm": 1.5643407106399536, + "learning_rate": 1.891613335031347e-05, + "loss": 1.8532, + "step": 16210 + }, + { + "epoch": 0.5766598524575594, + "grad_norm": 1.7220146656036377, + "learning_rate": 1.891435810493907e-05, + "loss": 1.8412, + "step": 16220 + }, + { + "epoch": 0.5770153764109857, + "grad_norm": 1.6633564233779907, + "learning_rate": 1.8912581490375153e-05, + "loss": 1.8517, + "step": 16230 + }, + { + "epoch": 0.5773709003644121, + "grad_norm": 1.605879545211792, + "learning_rate": 1.8910803506894602e-05, + "loss": 1.9017, + "step": 16240 + }, + { + "epoch": 0.5777264243178384, + "grad_norm": 1.656543254852295, + "learning_rate": 1.8909024154770508e-05, + "loss": 1.9126, + "step": 16250 + }, + { + "epoch": 0.5780819482712648, + "grad_norm": 1.6617867946624756, + "learning_rate": 1.8907243434276162e-05, + "loss": 1.8808, + "step": 16260 + }, + { + "epoch": 0.5784374722246911, + "grad_norm": 1.6808924674987793, + "learning_rate": 1.8905461345685073e-05, + "loss": 1.8935, + "step": 16270 + }, + { + "epoch": 0.5787929961781175, + "grad_norm": 1.7385969161987305, + "learning_rate": 1.8903677889270957e-05, + "loss": 1.8294, + "step": 16280 + }, + { + "epoch": 0.5791485201315438, + "grad_norm": 1.5762659311294556, + "learning_rate": 1.8901893065307745e-05, + "loss": 1.8564, + "step": 16290 + }, + { + "epoch": 0.5795040440849702, + "grad_norm": 1.6158792972564697, + "learning_rate": 1.8900106874069577e-05, + "loss": 1.8664, + "step": 16300 + }, + { + "epoch": 0.5798595680383966, + "grad_norm": 1.6304837465286255, + "learning_rate": 1.8898319315830793e-05, + "loss": 1.8929, + "step": 16310 + }, + { + "epoch": 0.580215091991823, + "grad_norm": 1.7296535968780518, + "learning_rate": 1.8896530390865955e-05, + "loss": 1.856, + "step": 16320 + }, + { + "epoch": 0.5805706159452493, + "grad_norm": 1.59303879737854, + "learning_rate": 1.8894740099449837e-05, + "loss": 1.907, + "step": 16330 + }, + { + "epoch": 0.5809261398986757, + "grad_norm": 1.5917261838912964, + "learning_rate": 1.88929484418574e-05, + "loss": 1.8705, + "step": 16340 + }, + { + "epoch": 0.581281663852102, + "grad_norm": 1.5613733530044556, + "learning_rate": 1.889115541836385e-05, + "loss": 1.8972, + "step": 16350 + }, + { + "epoch": 0.5816371878055284, + "grad_norm": 1.5381407737731934, + "learning_rate": 1.8889361029244574e-05, + "loss": 1.8423, + "step": 16360 + }, + { + "epoch": 0.5819927117589547, + "grad_norm": 1.6103670597076416, + "learning_rate": 1.8887565274775177e-05, + "loss": 1.8679, + "step": 16370 + }, + { + "epoch": 0.5823482357123811, + "grad_norm": 1.6302522420883179, + "learning_rate": 1.888576815523148e-05, + "loss": 1.8749, + "step": 16380 + }, + { + "epoch": 0.5827037596658075, + "grad_norm": 1.6894242763519287, + "learning_rate": 1.888396967088951e-05, + "loss": 1.8551, + "step": 16390 + }, + { + "epoch": 0.5830592836192339, + "grad_norm": 1.6959004402160645, + "learning_rate": 1.88821698220255e-05, + "loss": 1.8604, + "step": 16400 + }, + { + "epoch": 0.5834148075726602, + "grad_norm": 1.767952561378479, + "learning_rate": 1.88803686089159e-05, + "loss": 1.9134, + "step": 16410 + }, + { + "epoch": 0.5837703315260866, + "grad_norm": 1.6960941553115845, + "learning_rate": 1.8878566031837362e-05, + "loss": 1.8982, + "step": 16420 + }, + { + "epoch": 0.5841258554795129, + "grad_norm": 1.559275507926941, + "learning_rate": 1.8876762091066746e-05, + "loss": 1.8515, + "step": 16430 + }, + { + "epoch": 0.5844813794329393, + "grad_norm": 1.6764837503433228, + "learning_rate": 1.8874956786881137e-05, + "loss": 1.8995, + "step": 16440 + }, + { + "epoch": 0.5848369033863656, + "grad_norm": 1.6259223222732544, + "learning_rate": 1.8873150119557807e-05, + "loss": 1.8746, + "step": 16450 + }, + { + "epoch": 0.585192427339792, + "grad_norm": 1.640162706375122, + "learning_rate": 1.8871342089374253e-05, + "loss": 1.8844, + "step": 16460 + }, + { + "epoch": 0.5855479512932183, + "grad_norm": 1.6161834001541138, + "learning_rate": 1.886953269660818e-05, + "loss": 1.8626, + "step": 16470 + }, + { + "epoch": 0.5859034752466448, + "grad_norm": 1.7492423057556152, + "learning_rate": 1.8867721941537497e-05, + "loss": 1.9021, + "step": 16480 + }, + { + "epoch": 0.5862589992000711, + "grad_norm": 1.652024269104004, + "learning_rate": 1.886590982444033e-05, + "loss": 1.8882, + "step": 16490 + }, + { + "epoch": 0.5866145231534975, + "grad_norm": 1.52196204662323, + "learning_rate": 1.8864096345594996e-05, + "loss": 1.8257, + "step": 16500 + }, + { + "epoch": 0.5869700471069238, + "grad_norm": 1.7476404905319214, + "learning_rate": 1.8862281505280044e-05, + "loss": 1.8592, + "step": 16510 + }, + { + "epoch": 0.5873255710603502, + "grad_norm": 1.5485260486602783, + "learning_rate": 1.886046530377422e-05, + "loss": 1.8941, + "step": 16520 + }, + { + "epoch": 0.5876810950137765, + "grad_norm": 1.5296543836593628, + "learning_rate": 1.885864774135648e-05, + "loss": 1.9159, + "step": 16530 + }, + { + "epoch": 0.5880366189672029, + "grad_norm": 1.6564058065414429, + "learning_rate": 1.8856828818305993e-05, + "loss": 1.8667, + "step": 16540 + }, + { + "epoch": 0.5883921429206292, + "grad_norm": 1.655967354774475, + "learning_rate": 1.885500853490213e-05, + "loss": 1.8859, + "step": 16550 + }, + { + "epoch": 0.5887476668740557, + "grad_norm": 1.608729362487793, + "learning_rate": 1.885318689142448e-05, + "loss": 1.8904, + "step": 16560 + }, + { + "epoch": 0.589103190827482, + "grad_norm": 1.6501888036727905, + "learning_rate": 1.8851363888152832e-05, + "loss": 1.8914, + "step": 16570 + }, + { + "epoch": 0.5894587147809084, + "grad_norm": 1.6261394023895264, + "learning_rate": 1.8849539525367188e-05, + "loss": 1.9402, + "step": 16580 + }, + { + "epoch": 0.5898142387343347, + "grad_norm": 1.6277108192443848, + "learning_rate": 1.884771380334776e-05, + "loss": 1.91, + "step": 16590 + }, + { + "epoch": 0.5901697626877611, + "grad_norm": 1.6134692430496216, + "learning_rate": 1.884588672237497e-05, + "loss": 1.9328, + "step": 16600 + }, + { + "epoch": 0.5905252866411874, + "grad_norm": 1.6460951566696167, + "learning_rate": 1.8844058282729443e-05, + "loss": 1.8587, + "step": 16610 + }, + { + "epoch": 0.5908808105946138, + "grad_norm": 1.6859729290008545, + "learning_rate": 1.8842228484692016e-05, + "loss": 1.8527, + "step": 16620 + }, + { + "epoch": 0.5912363345480401, + "grad_norm": 1.6426470279693604, + "learning_rate": 1.8840397328543733e-05, + "loss": 1.8648, + "step": 16630 + }, + { + "epoch": 0.5915918585014666, + "grad_norm": 1.6782881021499634, + "learning_rate": 1.883856481456585e-05, + "loss": 1.864, + "step": 16640 + }, + { + "epoch": 0.5919473824548929, + "grad_norm": 1.62833833694458, + "learning_rate": 1.8836730943039833e-05, + "loss": 1.8915, + "step": 16650 + }, + { + "epoch": 0.5923029064083193, + "grad_norm": 1.571418285369873, + "learning_rate": 1.8834895714247347e-05, + "loss": 1.8538, + "step": 16660 + }, + { + "epoch": 0.5926584303617456, + "grad_norm": 1.594092845916748, + "learning_rate": 1.8833059128470275e-05, + "loss": 1.854, + "step": 16670 + }, + { + "epoch": 0.593013954315172, + "grad_norm": 1.636124610900879, + "learning_rate": 1.8831221185990706e-05, + "loss": 1.8864, + "step": 16680 + }, + { + "epoch": 0.5933694782685983, + "grad_norm": 1.6358932256698608, + "learning_rate": 1.8829381887090935e-05, + "loss": 1.8813, + "step": 16690 + }, + { + "epoch": 0.5937250022220247, + "grad_norm": 1.5682092905044556, + "learning_rate": 1.8827541232053466e-05, + "loss": 1.9199, + "step": 16700 + }, + { + "epoch": 0.594080526175451, + "grad_norm": 1.6156322956085205, + "learning_rate": 1.8825699221161013e-05, + "loss": 1.8817, + "step": 16710 + }, + { + "epoch": 0.5944360501288775, + "grad_norm": 1.5667834281921387, + "learning_rate": 1.8823855854696496e-05, + "loss": 1.8507, + "step": 16720 + }, + { + "epoch": 0.5947915740823038, + "grad_norm": 1.6435546875, + "learning_rate": 1.8822011132943048e-05, + "loss": 1.8548, + "step": 16730 + }, + { + "epoch": 0.5951470980357302, + "grad_norm": 1.58632493019104, + "learning_rate": 1.8820165056184004e-05, + "loss": 1.8876, + "step": 16740 + }, + { + "epoch": 0.5955026219891565, + "grad_norm": 1.6850043535232544, + "learning_rate": 1.8818317624702908e-05, + "loss": 1.8642, + "step": 16750 + }, + { + "epoch": 0.5958581459425829, + "grad_norm": 1.6237214803695679, + "learning_rate": 1.8816468838783518e-05, + "loss": 1.8476, + "step": 16760 + }, + { + "epoch": 0.5962136698960092, + "grad_norm": 1.6198740005493164, + "learning_rate": 1.8814618698709794e-05, + "loss": 1.8812, + "step": 16770 + }, + { + "epoch": 0.5965691938494356, + "grad_norm": 1.6180577278137207, + "learning_rate": 1.8812767204765906e-05, + "loss": 1.8844, + "step": 16780 + }, + { + "epoch": 0.5969247178028619, + "grad_norm": 1.6989701986312866, + "learning_rate": 1.881091435723623e-05, + "loss": 1.8733, + "step": 16790 + }, + { + "epoch": 0.5972802417562884, + "grad_norm": 1.6039592027664185, + "learning_rate": 1.8809060156405355e-05, + "loss": 1.9306, + "step": 16800 + }, + { + "epoch": 0.5976357657097147, + "grad_norm": 1.6900005340576172, + "learning_rate": 1.8807204602558076e-05, + "loss": 1.9098, + "step": 16810 + }, + { + "epoch": 0.5979912896631411, + "grad_norm": 1.7421122789382935, + "learning_rate": 1.880534769597939e-05, + "loss": 1.8799, + "step": 16820 + }, + { + "epoch": 0.5983468136165674, + "grad_norm": 1.5648573637008667, + "learning_rate": 1.8803489436954506e-05, + "loss": 1.908, + "step": 16830 + }, + { + "epoch": 0.5987023375699938, + "grad_norm": 1.618874430656433, + "learning_rate": 1.8801629825768845e-05, + "loss": 1.8763, + "step": 16840 + }, + { + "epoch": 0.5990578615234201, + "grad_norm": 1.5616846084594727, + "learning_rate": 1.8799768862708025e-05, + "loss": 1.9007, + "step": 16850 + }, + { + "epoch": 0.5994133854768465, + "grad_norm": 1.6301249265670776, + "learning_rate": 1.8797906548057888e-05, + "loss": 1.8384, + "step": 16860 + }, + { + "epoch": 0.5997689094302728, + "grad_norm": 1.6246309280395508, + "learning_rate": 1.8796042882104464e-05, + "loss": 1.8478, + "step": 16870 + }, + { + "epoch": 0.6001244333836993, + "grad_norm": 1.73928964138031, + "learning_rate": 1.8794177865134008e-05, + "loss": 1.8595, + "step": 16880 + }, + { + "epoch": 0.6004799573371256, + "grad_norm": 1.5528075695037842, + "learning_rate": 1.879231149743297e-05, + "loss": 1.8669, + "step": 16890 + }, + { + "epoch": 0.600835481290552, + "grad_norm": 1.6546438932418823, + "learning_rate": 1.8790443779288014e-05, + "loss": 1.9158, + "step": 16900 + }, + { + "epoch": 0.6011910052439783, + "grad_norm": 1.5551692247390747, + "learning_rate": 1.8788574710986012e-05, + "loss": 1.8575, + "step": 16910 + }, + { + "epoch": 0.6015465291974047, + "grad_norm": 1.635860800743103, + "learning_rate": 1.8786704292814037e-05, + "loss": 1.8457, + "step": 16920 + }, + { + "epoch": 0.601902053150831, + "grad_norm": 1.5776456594467163, + "learning_rate": 1.8784832525059377e-05, + "loss": 1.8652, + "step": 16930 + }, + { + "epoch": 0.6022575771042574, + "grad_norm": 1.719202995300293, + "learning_rate": 1.8782959408009524e-05, + "loss": 1.8412, + "step": 16940 + }, + { + "epoch": 0.6026131010576837, + "grad_norm": 1.6435538530349731, + "learning_rate": 1.8781084941952176e-05, + "loss": 1.8631, + "step": 16950 + }, + { + "epoch": 0.6029686250111101, + "grad_norm": 1.5469392538070679, + "learning_rate": 1.8779209127175234e-05, + "loss": 1.8409, + "step": 16960 + }, + { + "epoch": 0.6033241489645365, + "grad_norm": 1.5901070833206177, + "learning_rate": 1.8777331963966817e-05, + "loss": 1.8437, + "step": 16970 + }, + { + "epoch": 0.6036796729179629, + "grad_norm": 1.606291651725769, + "learning_rate": 1.877545345261525e-05, + "loss": 1.8883, + "step": 16980 + }, + { + "epoch": 0.6040351968713892, + "grad_norm": 1.5249083042144775, + "learning_rate": 1.8773573593409052e-05, + "loss": 1.8804, + "step": 16990 + }, + { + "epoch": 0.6043907208248156, + "grad_norm": 1.6308866739273071, + "learning_rate": 1.877169238663696e-05, + "loss": 1.846, + "step": 17000 + }, + { + "epoch": 0.6047462447782419, + "grad_norm": 1.6662529706954956, + "learning_rate": 1.8769809832587914e-05, + "loss": 1.874, + "step": 17010 + }, + { + "epoch": 0.6051017687316683, + "grad_norm": 1.7909958362579346, + "learning_rate": 1.876792593155107e-05, + "loss": 1.8738, + "step": 17020 + }, + { + "epoch": 0.6054572926850946, + "grad_norm": 1.6001943349838257, + "learning_rate": 1.8766040683815772e-05, + "loss": 1.8614, + "step": 17030 + }, + { + "epoch": 0.605812816638521, + "grad_norm": 1.5842840671539307, + "learning_rate": 1.8764154089671597e-05, + "loss": 1.8435, + "step": 17040 + }, + { + "epoch": 0.6061683405919474, + "grad_norm": 1.627424955368042, + "learning_rate": 1.87622661494083e-05, + "loss": 1.8811, + "step": 17050 + }, + { + "epoch": 0.6065238645453738, + "grad_norm": 1.738578200340271, + "learning_rate": 1.876037686331586e-05, + "loss": 1.8389, + "step": 17060 + }, + { + "epoch": 0.6068793884988001, + "grad_norm": 1.6790657043457031, + "learning_rate": 1.875848623168447e-05, + "loss": 1.847, + "step": 17070 + }, + { + "epoch": 0.6072349124522265, + "grad_norm": 1.7412775754928589, + "learning_rate": 1.87565942548045e-05, + "loss": 1.9113, + "step": 17080 + }, + { + "epoch": 0.6075904364056528, + "grad_norm": 1.5749247074127197, + "learning_rate": 1.8754700932966566e-05, + "loss": 1.8789, + "step": 17090 + }, + { + "epoch": 0.6079459603590792, + "grad_norm": 1.6527512073516846, + "learning_rate": 1.875280626646146e-05, + "loss": 1.877, + "step": 17100 + }, + { + "epoch": 0.6083014843125055, + "grad_norm": 1.606726050376892, + "learning_rate": 1.875091025558019e-05, + "loss": 1.8464, + "step": 17110 + }, + { + "epoch": 0.6086570082659319, + "grad_norm": 1.6692525148391724, + "learning_rate": 1.8749012900613974e-05, + "loss": 1.8497, + "step": 17120 + }, + { + "epoch": 0.6090125322193582, + "grad_norm": 1.6277382373809814, + "learning_rate": 1.874711420185423e-05, + "loss": 1.8919, + "step": 17130 + }, + { + "epoch": 0.6093680561727847, + "grad_norm": 1.5908669233322144, + "learning_rate": 1.874521415959259e-05, + "loss": 1.8778, + "step": 17140 + }, + { + "epoch": 0.609723580126211, + "grad_norm": 1.6495097875595093, + "learning_rate": 1.874331277412089e-05, + "loss": 1.8786, + "step": 17150 + }, + { + "epoch": 0.6100791040796374, + "grad_norm": 1.6455113887786865, + "learning_rate": 1.874141004573117e-05, + "loss": 1.8591, + "step": 17160 + }, + { + "epoch": 0.6104346280330637, + "grad_norm": 1.6391041278839111, + "learning_rate": 1.8739505974715672e-05, + "loss": 1.8667, + "step": 17170 + }, + { + "epoch": 0.6107901519864901, + "grad_norm": 1.6577038764953613, + "learning_rate": 1.873760056136686e-05, + "loss": 1.8706, + "step": 17180 + }, + { + "epoch": 0.6111456759399164, + "grad_norm": 1.6082369089126587, + "learning_rate": 1.873569380597738e-05, + "loss": 1.7906, + "step": 17190 + }, + { + "epoch": 0.6115011998933428, + "grad_norm": 1.674109697341919, + "learning_rate": 1.8733785708840107e-05, + "loss": 1.9073, + "step": 17200 + }, + { + "epoch": 0.6118567238467691, + "grad_norm": 1.5402380228042603, + "learning_rate": 1.8731876270248112e-05, + "loss": 1.8687, + "step": 17210 + }, + { + "epoch": 0.6122122478001956, + "grad_norm": 1.6595739126205444, + "learning_rate": 1.872996549049467e-05, + "loss": 1.8709, + "step": 17220 + }, + { + "epoch": 0.6125677717536219, + "grad_norm": 1.6010525226593018, + "learning_rate": 1.8728053369873266e-05, + "loss": 1.8679, + "step": 17230 + }, + { + "epoch": 0.6129232957070483, + "grad_norm": 1.5951142311096191, + "learning_rate": 1.872613990867759e-05, + "loss": 1.9297, + "step": 17240 + }, + { + "epoch": 0.6132788196604746, + "grad_norm": 1.5142821073532104, + "learning_rate": 1.8724225107201538e-05, + "loss": 1.855, + "step": 17250 + }, + { + "epoch": 0.613634343613901, + "grad_norm": 1.6318472623825073, + "learning_rate": 1.872230896573921e-05, + "loss": 1.8907, + "step": 17260 + }, + { + "epoch": 0.6139898675673273, + "grad_norm": 1.62212073802948, + "learning_rate": 1.8720391484584913e-05, + "loss": 1.86, + "step": 17270 + }, + { + "epoch": 0.6143453915207537, + "grad_norm": 1.6132879257202148, + "learning_rate": 1.8718472664033163e-05, + "loss": 1.8601, + "step": 17280 + }, + { + "epoch": 0.61470091547418, + "grad_norm": 1.6132336854934692, + "learning_rate": 1.8716552504378676e-05, + "loss": 1.8229, + "step": 17290 + }, + { + "epoch": 0.6150564394276065, + "grad_norm": 1.6995302438735962, + "learning_rate": 1.871463100591638e-05, + "loss": 1.8826, + "step": 17300 + }, + { + "epoch": 0.6154119633810328, + "grad_norm": 1.594828724861145, + "learning_rate": 1.87127081689414e-05, + "loss": 1.8921, + "step": 17310 + }, + { + "epoch": 0.6157674873344592, + "grad_norm": 1.5735836029052734, + "learning_rate": 1.8710783993749073e-05, + "loss": 1.8571, + "step": 17320 + }, + { + "epoch": 0.6161230112878855, + "grad_norm": 1.6112326383590698, + "learning_rate": 1.8708858480634946e-05, + "loss": 1.8564, + "step": 17330 + }, + { + "epoch": 0.6164785352413119, + "grad_norm": 1.7308849096298218, + "learning_rate": 1.8706931629894757e-05, + "loss": 1.8361, + "step": 17340 + }, + { + "epoch": 0.6168340591947382, + "grad_norm": 1.5627094507217407, + "learning_rate": 1.8705003441824467e-05, + "loss": 1.8924, + "step": 17350 + }, + { + "epoch": 0.6171895831481646, + "grad_norm": 1.608675479888916, + "learning_rate": 1.8703073916720225e-05, + "loss": 1.8852, + "step": 17360 + }, + { + "epoch": 0.6175451071015909, + "grad_norm": 1.6587059497833252, + "learning_rate": 1.87011430548784e-05, + "loss": 1.8536, + "step": 17370 + }, + { + "epoch": 0.6179006310550174, + "grad_norm": 1.6514772176742554, + "learning_rate": 1.869921085659556e-05, + "loss": 1.8676, + "step": 17380 + }, + { + "epoch": 0.6182561550084437, + "grad_norm": 1.6302834749221802, + "learning_rate": 1.8697277322168475e-05, + "loss": 1.8937, + "step": 17390 + }, + { + "epoch": 0.6186116789618701, + "grad_norm": 1.6737154722213745, + "learning_rate": 1.8695342451894122e-05, + "loss": 1.8453, + "step": 17400 + }, + { + "epoch": 0.6189672029152964, + "grad_norm": 1.6103057861328125, + "learning_rate": 1.8693406246069694e-05, + "loss": 1.877, + "step": 17410 + }, + { + "epoch": 0.6193227268687228, + "grad_norm": 1.6058531999588013, + "learning_rate": 1.8691468704992574e-05, + "loss": 1.8532, + "step": 17420 + }, + { + "epoch": 0.6196782508221491, + "grad_norm": 1.6504783630371094, + "learning_rate": 1.8689529828960355e-05, + "loss": 1.8712, + "step": 17430 + }, + { + "epoch": 0.6200337747755755, + "grad_norm": 1.5864588022232056, + "learning_rate": 1.868758961827084e-05, + "loss": 1.8637, + "step": 17440 + }, + { + "epoch": 0.6203892987290018, + "grad_norm": 1.63741135597229, + "learning_rate": 1.8685648073222028e-05, + "loss": 1.8425, + "step": 17450 + }, + { + "epoch": 0.6207448226824283, + "grad_norm": 1.5490930080413818, + "learning_rate": 1.8683705194112134e-05, + "loss": 1.8588, + "step": 17460 + }, + { + "epoch": 0.6211003466358546, + "grad_norm": 1.6922720670700073, + "learning_rate": 1.868176098123957e-05, + "loss": 1.8639, + "step": 17470 + }, + { + "epoch": 0.621455870589281, + "grad_norm": 1.6855324506759644, + "learning_rate": 1.867981543490295e-05, + "loss": 1.8502, + "step": 17480 + }, + { + "epoch": 0.6218113945427073, + "grad_norm": 1.539638638496399, + "learning_rate": 1.8677868555401108e-05, + "loss": 1.8658, + "step": 17490 + }, + { + "epoch": 0.6221669184961337, + "grad_norm": 1.695586919784546, + "learning_rate": 1.8675920343033063e-05, + "loss": 1.828, + "step": 17500 + }, + { + "epoch": 0.62252244244956, + "grad_norm": 1.626246452331543, + "learning_rate": 1.8673970798098054e-05, + "loss": 1.8621, + "step": 17510 + }, + { + "epoch": 0.6228779664029864, + "grad_norm": 1.833249807357788, + "learning_rate": 1.8672019920895513e-05, + "loss": 1.8883, + "step": 17520 + }, + { + "epoch": 0.6232334903564127, + "grad_norm": 1.746085524559021, + "learning_rate": 1.867006771172509e-05, + "loss": 1.8685, + "step": 17530 + }, + { + "epoch": 0.6235890143098392, + "grad_norm": 1.7327795028686523, + "learning_rate": 1.8668114170886627e-05, + "loss": 1.8666, + "step": 17540 + }, + { + "epoch": 0.6239445382632655, + "grad_norm": 1.605921745300293, + "learning_rate": 1.8666159298680177e-05, + "loss": 1.8678, + "step": 17550 + }, + { + "epoch": 0.6243000622166919, + "grad_norm": 1.5179563760757446, + "learning_rate": 1.866420309540599e-05, + "loss": 1.8845, + "step": 17560 + }, + { + "epoch": 0.6246555861701182, + "grad_norm": 1.6529120206832886, + "learning_rate": 1.8662245561364542e-05, + "loss": 1.8428, + "step": 17570 + }, + { + "epoch": 0.6250111101235446, + "grad_norm": 1.6774760484695435, + "learning_rate": 1.8660286696856486e-05, + "loss": 1.8814, + "step": 17580 + }, + { + "epoch": 0.6253666340769709, + "grad_norm": 1.6613315343856812, + "learning_rate": 1.8658326502182692e-05, + "loss": 1.8949, + "step": 17590 + }, + { + "epoch": 0.6257221580303973, + "grad_norm": 1.678603172302246, + "learning_rate": 1.8656364977644233e-05, + "loss": 1.8903, + "step": 17600 + }, + { + "epoch": 0.6260776819838236, + "grad_norm": 1.6080899238586426, + "learning_rate": 1.8654402123542392e-05, + "loss": 1.8357, + "step": 17610 + }, + { + "epoch": 0.62643320593725, + "grad_norm": 1.58425772190094, + "learning_rate": 1.865243794017865e-05, + "loss": 1.8993, + "step": 17620 + }, + { + "epoch": 0.6267887298906764, + "grad_norm": 1.6958409547805786, + "learning_rate": 1.865047242785469e-05, + "loss": 1.8097, + "step": 17630 + }, + { + "epoch": 0.6271442538441028, + "grad_norm": 1.5857570171356201, + "learning_rate": 1.86485055868724e-05, + "loss": 1.8372, + "step": 17640 + }, + { + "epoch": 0.6274997777975291, + "grad_norm": 1.654848337173462, + "learning_rate": 1.8646537417533886e-05, + "loss": 1.8653, + "step": 17650 + }, + { + "epoch": 0.6278553017509555, + "grad_norm": 1.7152830362319946, + "learning_rate": 1.8644567920141436e-05, + "loss": 1.8827, + "step": 17660 + }, + { + "epoch": 0.6282108257043818, + "grad_norm": 1.5285979509353638, + "learning_rate": 1.8642597094997552e-05, + "loss": 1.8487, + "step": 17670 + }, + { + "epoch": 0.6285663496578082, + "grad_norm": 1.608949899673462, + "learning_rate": 1.8640624942404945e-05, + "loss": 1.8678, + "step": 17680 + }, + { + "epoch": 0.6289218736112345, + "grad_norm": 1.5404757261276245, + "learning_rate": 1.8638651462666527e-05, + "loss": 1.8938, + "step": 17690 + }, + { + "epoch": 0.629277397564661, + "grad_norm": 1.5254802703857422, + "learning_rate": 1.8636676656085407e-05, + "loss": 1.8407, + "step": 17700 + }, + { + "epoch": 0.6296329215180873, + "grad_norm": 1.770894169807434, + "learning_rate": 1.8634700522964904e-05, + "loss": 1.8741, + "step": 17710 + }, + { + "epoch": 0.6299884454715137, + "grad_norm": 1.549384355545044, + "learning_rate": 1.863272306360854e-05, + "loss": 1.9207, + "step": 17720 + }, + { + "epoch": 0.63034396942494, + "grad_norm": 1.648349642753601, + "learning_rate": 1.8630744278320046e-05, + "loss": 1.8157, + "step": 17730 + }, + { + "epoch": 0.6306994933783664, + "grad_norm": 1.6413389444351196, + "learning_rate": 1.8628764167403345e-05, + "loss": 1.8864, + "step": 17740 + }, + { + "epoch": 0.6310550173317927, + "grad_norm": 1.6191598176956177, + "learning_rate": 1.8626782731162567e-05, + "loss": 1.8505, + "step": 17750 + }, + { + "epoch": 0.6314105412852191, + "grad_norm": 1.691037654876709, + "learning_rate": 1.862479996990205e-05, + "loss": 1.8194, + "step": 17760 + }, + { + "epoch": 0.6317660652386454, + "grad_norm": 1.6275644302368164, + "learning_rate": 1.8622815883926343e-05, + "loss": 1.8535, + "step": 17770 + }, + { + "epoch": 0.6321215891920718, + "grad_norm": 1.5613367557525635, + "learning_rate": 1.8620830473540174e-05, + "loss": 1.8215, + "step": 17780 + }, + { + "epoch": 0.6324771131454981, + "grad_norm": 1.5440773963928223, + "learning_rate": 1.86188437390485e-05, + "loss": 1.8206, + "step": 17790 + }, + { + "epoch": 0.6328326370989246, + "grad_norm": 1.7234688997268677, + "learning_rate": 1.861685568075647e-05, + "loss": 1.8477, + "step": 17800 + }, + { + "epoch": 0.6331881610523509, + "grad_norm": 1.5920257568359375, + "learning_rate": 1.8614866298969437e-05, + "loss": 1.8264, + "step": 17810 + }, + { + "epoch": 0.6335436850057773, + "grad_norm": 1.582025408744812, + "learning_rate": 1.8612875593992955e-05, + "loss": 1.8617, + "step": 17820 + }, + { + "epoch": 0.6338992089592036, + "grad_norm": 1.632765293121338, + "learning_rate": 1.861088356613278e-05, + "loss": 1.8548, + "step": 17830 + }, + { + "epoch": 0.63425473291263, + "grad_norm": 1.7121986150741577, + "learning_rate": 1.8608890215694883e-05, + "loss": 1.8589, + "step": 17840 + }, + { + "epoch": 0.6346102568660563, + "grad_norm": 1.6721488237380981, + "learning_rate": 1.8606895542985432e-05, + "loss": 1.8442, + "step": 17850 + }, + { + "epoch": 0.6349657808194827, + "grad_norm": 1.7103196382522583, + "learning_rate": 1.8604899548310786e-05, + "loss": 1.7886, + "step": 17860 + }, + { + "epoch": 0.635321304772909, + "grad_norm": 1.597957968711853, + "learning_rate": 1.8602902231977523e-05, + "loss": 1.8576, + "step": 17870 + }, + { + "epoch": 0.6356768287263355, + "grad_norm": 1.6161609888076782, + "learning_rate": 1.8600903594292415e-05, + "loss": 1.8836, + "step": 17880 + }, + { + "epoch": 0.6360323526797618, + "grad_norm": 1.5441991090774536, + "learning_rate": 1.8598903635562448e-05, + "loss": 1.889, + "step": 17890 + }, + { + "epoch": 0.6363878766331882, + "grad_norm": 1.5979788303375244, + "learning_rate": 1.8596902356094796e-05, + "loss": 1.8414, + "step": 17900 + }, + { + "epoch": 0.6367434005866145, + "grad_norm": 1.5707616806030273, + "learning_rate": 1.8594899756196847e-05, + "loss": 1.8864, + "step": 17910 + }, + { + "epoch": 0.6370989245400409, + "grad_norm": 1.68429434299469, + "learning_rate": 1.8592895836176183e-05, + "loss": 1.8358, + "step": 17920 + }, + { + "epoch": 0.6374544484934672, + "grad_norm": 1.5664743185043335, + "learning_rate": 1.8590890596340598e-05, + "loss": 1.871, + "step": 17930 + }, + { + "epoch": 0.6378099724468936, + "grad_norm": 1.7070468664169312, + "learning_rate": 1.858888403699808e-05, + "loss": 1.8993, + "step": 17940 + }, + { + "epoch": 0.6381654964003199, + "grad_norm": 1.5247119665145874, + "learning_rate": 1.8586876158456833e-05, + "loss": 1.8519, + "step": 17950 + }, + { + "epoch": 0.6385210203537464, + "grad_norm": 1.7177155017852783, + "learning_rate": 1.8584866961025245e-05, + "loss": 1.8297, + "step": 17960 + }, + { + "epoch": 0.6388765443071727, + "grad_norm": 1.6578903198242188, + "learning_rate": 1.8582856445011918e-05, + "loss": 1.8982, + "step": 17970 + }, + { + "epoch": 0.6392320682605991, + "grad_norm": 1.6425923109054565, + "learning_rate": 1.8580844610725653e-05, + "loss": 1.8607, + "step": 17980 + }, + { + "epoch": 0.6395875922140254, + "grad_norm": 1.5677855014801025, + "learning_rate": 1.8578831458475465e-05, + "loss": 1.817, + "step": 17990 + }, + { + "epoch": 0.6399431161674518, + "grad_norm": 1.5550886392593384, + "learning_rate": 1.857681698857055e-05, + "loss": 1.8454, + "step": 18000 + }, + { + "epoch": 0.6402986401208781, + "grad_norm": 1.5829353332519531, + "learning_rate": 1.8574801201320324e-05, + "loss": 1.887, + "step": 18010 + }, + { + "epoch": 0.6406541640743045, + "grad_norm": 1.652955412864685, + "learning_rate": 1.8572784097034396e-05, + "loss": 1.8449, + "step": 18020 + }, + { + "epoch": 0.6410096880277308, + "grad_norm": 1.62899649143219, + "learning_rate": 1.8570765676022585e-05, + "loss": 1.8209, + "step": 18030 + }, + { + "epoch": 0.6413652119811573, + "grad_norm": 1.6158806085586548, + "learning_rate": 1.8568745938594905e-05, + "loss": 1.8784, + "step": 18040 + }, + { + "epoch": 0.6417207359345836, + "grad_norm": 1.5244545936584473, + "learning_rate": 1.8566724885061574e-05, + "loss": 1.8333, + "step": 18050 + }, + { + "epoch": 0.64207625988801, + "grad_norm": 1.621248722076416, + "learning_rate": 1.8564702515733016e-05, + "loss": 1.8925, + "step": 18060 + }, + { + "epoch": 0.6424317838414363, + "grad_norm": 1.6901414394378662, + "learning_rate": 1.8562678830919854e-05, + "loss": 1.8802, + "step": 18070 + }, + { + "epoch": 0.6427873077948627, + "grad_norm": 1.7427568435668945, + "learning_rate": 1.856065383093291e-05, + "loss": 1.871, + "step": 18080 + }, + { + "epoch": 0.643142831748289, + "grad_norm": 1.60033118724823, + "learning_rate": 1.855862751608321e-05, + "loss": 1.8079, + "step": 18090 + }, + { + "epoch": 0.6434983557017154, + "grad_norm": 1.7531661987304688, + "learning_rate": 1.8556599886681992e-05, + "loss": 1.8374, + "step": 18100 + }, + { + "epoch": 0.6438538796551417, + "grad_norm": 1.6297991275787354, + "learning_rate": 1.855457094304068e-05, + "loss": 1.8442, + "step": 18110 + }, + { + "epoch": 0.6442094036085682, + "grad_norm": 1.7636593580245972, + "learning_rate": 1.8552540685470908e-05, + "loss": 1.8322, + "step": 18120 + }, + { + "epoch": 0.6445649275619945, + "grad_norm": 1.6729167699813843, + "learning_rate": 1.855050911428451e-05, + "loss": 1.8663, + "step": 18130 + }, + { + "epoch": 0.6449204515154209, + "grad_norm": 1.8134124279022217, + "learning_rate": 1.8548476229793525e-05, + "loss": 1.8381, + "step": 18140 + }, + { + "epoch": 0.6452759754688472, + "grad_norm": 1.7415553331375122, + "learning_rate": 1.854644203231019e-05, + "loss": 1.884, + "step": 18150 + }, + { + "epoch": 0.6456314994222736, + "grad_norm": 1.5733280181884766, + "learning_rate": 1.854440652214695e-05, + "loss": 1.8278, + "step": 18160 + }, + { + "epoch": 0.6459870233756999, + "grad_norm": 1.6073261499404907, + "learning_rate": 1.8542369699616437e-05, + "loss": 1.8908, + "step": 18170 + }, + { + "epoch": 0.6463425473291263, + "grad_norm": 1.7024801969528198, + "learning_rate": 1.8540331565031506e-05, + "loss": 1.8522, + "step": 18180 + }, + { + "epoch": 0.6466980712825526, + "grad_norm": 1.533766508102417, + "learning_rate": 1.853829211870519e-05, + "loss": 1.8356, + "step": 18190 + }, + { + "epoch": 0.647053595235979, + "grad_norm": 1.5265889167785645, + "learning_rate": 1.8536251360950737e-05, + "loss": 1.8897, + "step": 18200 + }, + { + "epoch": 0.6474091191894054, + "grad_norm": 1.5848630666732788, + "learning_rate": 1.8534209292081603e-05, + "loss": 1.8548, + "step": 18210 + }, + { + "epoch": 0.6477646431428318, + "grad_norm": 1.622403621673584, + "learning_rate": 1.8532165912411425e-05, + "loss": 1.829, + "step": 18220 + }, + { + "epoch": 0.6481201670962581, + "grad_norm": 1.649625301361084, + "learning_rate": 1.8530121222254064e-05, + "loss": 1.8602, + "step": 18230 + }, + { + "epoch": 0.6484756910496845, + "grad_norm": 1.5093032121658325, + "learning_rate": 1.852807522192357e-05, + "loss": 1.7971, + "step": 18240 + }, + { + "epoch": 0.6488312150031108, + "grad_norm": 1.608844518661499, + "learning_rate": 1.852602791173419e-05, + "loss": 1.8989, + "step": 18250 + }, + { + "epoch": 0.6491867389565372, + "grad_norm": 1.6054035425186157, + "learning_rate": 1.8523979292000385e-05, + "loss": 1.8686, + "step": 18260 + }, + { + "epoch": 0.6495422629099635, + "grad_norm": 1.569411039352417, + "learning_rate": 1.8521929363036802e-05, + "loss": 1.8637, + "step": 18270 + }, + { + "epoch": 0.64989778686339, + "grad_norm": 1.8090263605117798, + "learning_rate": 1.8519878125158305e-05, + "loss": 1.8861, + "step": 18280 + }, + { + "epoch": 0.6502533108168163, + "grad_norm": 1.5960959196090698, + "learning_rate": 1.8517825578679946e-05, + "loss": 1.8546, + "step": 18290 + }, + { + "epoch": 0.6506088347702427, + "grad_norm": 1.5911707878112793, + "learning_rate": 1.8515771723916987e-05, + "loss": 1.8012, + "step": 18300 + }, + { + "epoch": 0.650964358723669, + "grad_norm": 1.6279571056365967, + "learning_rate": 1.8513716561184883e-05, + "loss": 1.8322, + "step": 18310 + }, + { + "epoch": 0.6513198826770954, + "grad_norm": 1.5484329462051392, + "learning_rate": 1.85116600907993e-05, + "loss": 1.8195, + "step": 18320 + }, + { + "epoch": 0.6516754066305217, + "grad_norm": 1.628717303276062, + "learning_rate": 1.8509602313076095e-05, + "loss": 1.8482, + "step": 18330 + }, + { + "epoch": 0.6520309305839481, + "grad_norm": 1.7035515308380127, + "learning_rate": 1.8507543228331327e-05, + "loss": 1.8507, + "step": 18340 + }, + { + "epoch": 0.6523864545373744, + "grad_norm": 1.6159610748291016, + "learning_rate": 1.8505482836881262e-05, + "loss": 1.9007, + "step": 18350 + }, + { + "epoch": 0.6527419784908008, + "grad_norm": 1.68350088596344, + "learning_rate": 1.8503421139042366e-05, + "loss": 1.8793, + "step": 18360 + }, + { + "epoch": 0.6530975024442272, + "grad_norm": 1.5864074230194092, + "learning_rate": 1.85013581351313e-05, + "loss": 1.8496, + "step": 18370 + }, + { + "epoch": 0.6534530263976536, + "grad_norm": 1.6897317171096802, + "learning_rate": 1.849929382546493e-05, + "loss": 1.841, + "step": 18380 + }, + { + "epoch": 0.6538085503510799, + "grad_norm": 1.6994426250457764, + "learning_rate": 1.8497228210360317e-05, + "loss": 1.8071, + "step": 18390 + }, + { + "epoch": 0.6541640743045063, + "grad_norm": 1.728031873703003, + "learning_rate": 1.8495161290134726e-05, + "loss": 1.8652, + "step": 18400 + }, + { + "epoch": 0.6545195982579326, + "grad_norm": 1.640601634979248, + "learning_rate": 1.8493093065105627e-05, + "loss": 1.8008, + "step": 18410 + }, + { + "epoch": 0.654875122211359, + "grad_norm": 1.699002742767334, + "learning_rate": 1.849102353559069e-05, + "loss": 1.8631, + "step": 18420 + }, + { + "epoch": 0.6552306461647853, + "grad_norm": 1.7025487422943115, + "learning_rate": 1.848895270190777e-05, + "loss": 1.8935, + "step": 18430 + }, + { + "epoch": 0.6555861701182117, + "grad_norm": 1.599440574645996, + "learning_rate": 1.848688056437495e-05, + "loss": 1.8361, + "step": 18440 + }, + { + "epoch": 0.655941694071638, + "grad_norm": 1.6044576168060303, + "learning_rate": 1.848480712331048e-05, + "loss": 1.8326, + "step": 18450 + }, + { + "epoch": 0.6562972180250645, + "grad_norm": 1.664693832397461, + "learning_rate": 1.848273237903284e-05, + "loss": 1.8676, + "step": 18460 + }, + { + "epoch": 0.6566527419784908, + "grad_norm": 1.6578155755996704, + "learning_rate": 1.8480656331860692e-05, + "loss": 1.8822, + "step": 18470 + }, + { + "epoch": 0.6570082659319172, + "grad_norm": 1.5991641283035278, + "learning_rate": 1.847857898211291e-05, + "loss": 1.8952, + "step": 18480 + }, + { + "epoch": 0.6573637898853435, + "grad_norm": 1.6421630382537842, + "learning_rate": 1.847650033010855e-05, + "loss": 1.8613, + "step": 18490 + }, + { + "epoch": 0.6577193138387699, + "grad_norm": 1.5893738269805908, + "learning_rate": 1.8474420376166893e-05, + "loss": 1.8507, + "step": 18500 + }, + { + "epoch": 0.6580748377921962, + "grad_norm": 1.6135833263397217, + "learning_rate": 1.84723391206074e-05, + "loss": 1.8942, + "step": 18510 + }, + { + "epoch": 0.6584303617456226, + "grad_norm": 1.5302294492721558, + "learning_rate": 1.847025656374974e-05, + "loss": 1.8624, + "step": 18520 + }, + { + "epoch": 0.658785885699049, + "grad_norm": 1.6853063106536865, + "learning_rate": 1.846817270591378e-05, + "loss": 1.8471, + "step": 18530 + }, + { + "epoch": 0.6591414096524754, + "grad_norm": 1.6253217458724976, + "learning_rate": 1.846608754741959e-05, + "loss": 1.8594, + "step": 18540 + }, + { + "epoch": 0.6594969336059017, + "grad_norm": 1.6094721555709839, + "learning_rate": 1.8464001088587436e-05, + "loss": 1.8688, + "step": 18550 + }, + { + "epoch": 0.6598524575593281, + "grad_norm": 1.6854521036148071, + "learning_rate": 1.8461913329737782e-05, + "loss": 1.8704, + "step": 18560 + }, + { + "epoch": 0.6602079815127544, + "grad_norm": 1.702346682548523, + "learning_rate": 1.8459824271191298e-05, + "loss": 1.8795, + "step": 18570 + }, + { + "epoch": 0.6605635054661808, + "grad_norm": 1.6592522859573364, + "learning_rate": 1.8457733913268848e-05, + "loss": 1.8835, + "step": 18580 + }, + { + "epoch": 0.6609190294196071, + "grad_norm": 1.5579978227615356, + "learning_rate": 1.84556422562915e-05, + "loss": 1.8281, + "step": 18590 + }, + { + "epoch": 0.6612745533730335, + "grad_norm": 1.6369131803512573, + "learning_rate": 1.8453549300580523e-05, + "loss": 1.8923, + "step": 18600 + }, + { + "epoch": 0.6616300773264598, + "grad_norm": 1.640381097793579, + "learning_rate": 1.8451455046457373e-05, + "loss": 1.8469, + "step": 18610 + }, + { + "epoch": 0.6619856012798863, + "grad_norm": 1.5950998067855835, + "learning_rate": 1.8449359494243722e-05, + "loss": 1.8305, + "step": 18620 + }, + { + "epoch": 0.6623411252333126, + "grad_norm": 1.5842294692993164, + "learning_rate": 1.8447262644261427e-05, + "loss": 1.8725, + "step": 18630 + }, + { + "epoch": 0.662696649186739, + "grad_norm": 1.5862181186676025, + "learning_rate": 1.8445164496832558e-05, + "loss": 1.8628, + "step": 18640 + }, + { + "epoch": 0.6630521731401653, + "grad_norm": 1.6465989351272583, + "learning_rate": 1.844306505227937e-05, + "loss": 1.879, + "step": 18650 + }, + { + "epoch": 0.6634076970935917, + "grad_norm": 1.61064612865448, + "learning_rate": 1.844096431092433e-05, + "loss": 1.8565, + "step": 18660 + }, + { + "epoch": 0.663763221047018, + "grad_norm": 1.6815714836120605, + "learning_rate": 1.84388622730901e-05, + "loss": 1.8892, + "step": 18670 + }, + { + "epoch": 0.6641187450004444, + "grad_norm": 1.5897061824798584, + "learning_rate": 1.843675893909953e-05, + "loss": 1.8605, + "step": 18680 + }, + { + "epoch": 0.6644742689538707, + "grad_norm": 1.6312036514282227, + "learning_rate": 1.8434654309275692e-05, + "loss": 1.8798, + "step": 18690 + }, + { + "epoch": 0.6648297929072972, + "grad_norm": 1.5694767236709595, + "learning_rate": 1.843254838394184e-05, + "loss": 1.8647, + "step": 18700 + }, + { + "epoch": 0.6651853168607235, + "grad_norm": 1.6571435928344727, + "learning_rate": 1.8430441163421422e-05, + "loss": 1.898, + "step": 18710 + }, + { + "epoch": 0.6655408408141499, + "grad_norm": 1.5843617916107178, + "learning_rate": 1.8428332648038103e-05, + "loss": 1.8884, + "step": 18720 + }, + { + "epoch": 0.6658963647675762, + "grad_norm": 1.666013240814209, + "learning_rate": 1.8426222838115733e-05, + "loss": 1.8537, + "step": 18730 + }, + { + "epoch": 0.6662518887210026, + "grad_norm": 1.6343095302581787, + "learning_rate": 1.8424111733978374e-05, + "loss": 1.8917, + "step": 18740 + }, + { + "epoch": 0.6666074126744289, + "grad_norm": 1.6593196392059326, + "learning_rate": 1.8421999335950267e-05, + "loss": 1.8811, + "step": 18750 + }, + { + "epoch": 0.6669629366278553, + "grad_norm": 1.6329734325408936, + "learning_rate": 1.841988564435587e-05, + "loss": 1.858, + "step": 18760 + }, + { + "epoch": 0.6673184605812816, + "grad_norm": 1.6318451166152954, + "learning_rate": 1.8417770659519832e-05, + "loss": 1.8396, + "step": 18770 + }, + { + "epoch": 0.667673984534708, + "grad_norm": 1.611892580986023, + "learning_rate": 1.8415654381767e-05, + "loss": 1.8838, + "step": 18780 + }, + { + "epoch": 0.6680295084881344, + "grad_norm": 1.6420646905899048, + "learning_rate": 1.8413536811422423e-05, + "loss": 1.8507, + "step": 18790 + }, + { + "epoch": 0.6683850324415608, + "grad_norm": 1.7339340448379517, + "learning_rate": 1.8411417948811343e-05, + "loss": 1.8491, + "step": 18800 + }, + { + "epoch": 0.6687405563949871, + "grad_norm": 1.7298238277435303, + "learning_rate": 1.8409297794259205e-05, + "loss": 1.8828, + "step": 18810 + }, + { + "epoch": 0.6690960803484135, + "grad_norm": 1.5757405757904053, + "learning_rate": 1.8407176348091656e-05, + "loss": 1.8271, + "step": 18820 + }, + { + "epoch": 0.6694516043018398, + "grad_norm": 1.5821876525878906, + "learning_rate": 1.8405053610634528e-05, + "loss": 1.8613, + "step": 18830 + }, + { + "epoch": 0.6698071282552662, + "grad_norm": 1.5011639595031738, + "learning_rate": 1.8402929582213872e-05, + "loss": 1.9149, + "step": 18840 + }, + { + "epoch": 0.6701626522086925, + "grad_norm": 1.745612621307373, + "learning_rate": 1.8400804263155914e-05, + "loss": 1.8757, + "step": 18850 + }, + { + "epoch": 0.670518176162119, + "grad_norm": 1.7035101652145386, + "learning_rate": 1.8398677653787098e-05, + "loss": 1.8415, + "step": 18860 + }, + { + "epoch": 0.6708737001155453, + "grad_norm": 1.5889948606491089, + "learning_rate": 1.839654975443405e-05, + "loss": 1.8464, + "step": 18870 + }, + { + "epoch": 0.6712292240689717, + "grad_norm": 1.5949231386184692, + "learning_rate": 1.8394420565423613e-05, + "loss": 1.8797, + "step": 18880 + }, + { + "epoch": 0.671584748022398, + "grad_norm": 1.6531540155410767, + "learning_rate": 1.8392290087082806e-05, + "loss": 1.8343, + "step": 18890 + }, + { + "epoch": 0.6719402719758244, + "grad_norm": 1.5893442630767822, + "learning_rate": 1.839015831973886e-05, + "loss": 1.8457, + "step": 18900 + }, + { + "epoch": 0.6722957959292507, + "grad_norm": 1.6916165351867676, + "learning_rate": 1.8388025263719208e-05, + "loss": 1.8506, + "step": 18910 + }, + { + "epoch": 0.6726513198826771, + "grad_norm": 1.5590561628341675, + "learning_rate": 1.8385890919351467e-05, + "loss": 1.8665, + "step": 18920 + }, + { + "epoch": 0.6730068438361034, + "grad_norm": 1.6616570949554443, + "learning_rate": 1.8383755286963455e-05, + "loss": 1.8355, + "step": 18930 + }, + { + "epoch": 0.6733623677895298, + "grad_norm": 1.6090588569641113, + "learning_rate": 1.8381618366883207e-05, + "loss": 1.8628, + "step": 18940 + }, + { + "epoch": 0.6737178917429562, + "grad_norm": 1.7417858839035034, + "learning_rate": 1.8379480159438924e-05, + "loss": 1.8802, + "step": 18950 + }, + { + "epoch": 0.6740734156963826, + "grad_norm": 1.5920038223266602, + "learning_rate": 1.837734066495903e-05, + "loss": 1.8936, + "step": 18960 + }, + { + "epoch": 0.6744289396498089, + "grad_norm": 1.5697931051254272, + "learning_rate": 1.8375199883772138e-05, + "loss": 1.8345, + "step": 18970 + }, + { + "epoch": 0.6747844636032353, + "grad_norm": 1.6053522825241089, + "learning_rate": 1.8373057816207054e-05, + "loss": 1.8272, + "step": 18980 + }, + { + "epoch": 0.6751399875566616, + "grad_norm": 1.5402426719665527, + "learning_rate": 1.8370914462592793e-05, + "loss": 1.8213, + "step": 18990 + }, + { + "epoch": 0.675495511510088, + "grad_norm": 1.6520720720291138, + "learning_rate": 1.8368769823258553e-05, + "loss": 1.8394, + "step": 19000 + }, + { + "epoch": 0.6758510354635143, + "grad_norm": 1.6406983137130737, + "learning_rate": 1.836662389853374e-05, + "loss": 1.8974, + "step": 19010 + }, + { + "epoch": 0.6762065594169407, + "grad_norm": 1.491894006729126, + "learning_rate": 1.836447668874796e-05, + "loss": 1.8567, + "step": 19020 + }, + { + "epoch": 0.676562083370367, + "grad_norm": 1.6023941040039062, + "learning_rate": 1.8362328194231003e-05, + "loss": 1.8623, + "step": 19030 + }, + { + "epoch": 0.6769176073237935, + "grad_norm": 1.6651532649993896, + "learning_rate": 1.836017841531287e-05, + "loss": 1.8445, + "step": 19040 + }, + { + "epoch": 0.6772731312772198, + "grad_norm": 1.6000490188598633, + "learning_rate": 1.8358027352323747e-05, + "loss": 1.8502, + "step": 19050 + }, + { + "epoch": 0.6776286552306462, + "grad_norm": 1.562110424041748, + "learning_rate": 1.835587500559403e-05, + "loss": 1.8502, + "step": 19060 + }, + { + "epoch": 0.6779841791840725, + "grad_norm": 1.6804255247116089, + "learning_rate": 1.8353721375454304e-05, + "loss": 1.8289, + "step": 19070 + }, + { + "epoch": 0.6783397031374989, + "grad_norm": 1.663631796836853, + "learning_rate": 1.8351566462235352e-05, + "loss": 1.8722, + "step": 19080 + }, + { + "epoch": 0.6786952270909252, + "grad_norm": 1.6991126537322998, + "learning_rate": 1.8349410266268156e-05, + "loss": 1.8386, + "step": 19090 + }, + { + "epoch": 0.6790507510443516, + "grad_norm": 1.6776806116104126, + "learning_rate": 1.83472527878839e-05, + "loss": 1.8723, + "step": 19100 + }, + { + "epoch": 0.679406274997778, + "grad_norm": 1.6884881258010864, + "learning_rate": 1.834509402741395e-05, + "loss": 1.8302, + "step": 19110 + }, + { + "epoch": 0.6797617989512044, + "grad_norm": 1.5213853120803833, + "learning_rate": 1.8342933985189876e-05, + "loss": 1.8069, + "step": 19120 + }, + { + "epoch": 0.6801173229046307, + "grad_norm": 1.5394185781478882, + "learning_rate": 1.8340772661543458e-05, + "loss": 1.8559, + "step": 19130 + }, + { + "epoch": 0.6804728468580571, + "grad_norm": 1.665880560874939, + "learning_rate": 1.8338610056806656e-05, + "loss": 1.7783, + "step": 19140 + }, + { + "epoch": 0.6808283708114834, + "grad_norm": 1.7000386714935303, + "learning_rate": 1.8336446171311633e-05, + "loss": 1.9049, + "step": 19150 + }, + { + "epoch": 0.6811838947649098, + "grad_norm": 1.6025673151016235, + "learning_rate": 1.8334281005390748e-05, + "loss": 1.8844, + "step": 19160 + }, + { + "epoch": 0.6815394187183361, + "grad_norm": 1.6624587774276733, + "learning_rate": 1.8332114559376552e-05, + "loss": 1.8551, + "step": 19170 + }, + { + "epoch": 0.6818949426717625, + "grad_norm": 1.759018063545227, + "learning_rate": 1.8329946833601808e-05, + "loss": 1.8738, + "step": 19180 + }, + { + "epoch": 0.6822504666251888, + "grad_norm": 1.6430401802062988, + "learning_rate": 1.8327777828399457e-05, + "loss": 1.862, + "step": 19190 + }, + { + "epoch": 0.6826059905786153, + "grad_norm": 1.6533403396606445, + "learning_rate": 1.8325607544102647e-05, + "loss": 1.9021, + "step": 19200 + }, + { + "epoch": 0.6829615145320416, + "grad_norm": 1.5370020866394043, + "learning_rate": 1.832343598104472e-05, + "loss": 1.8838, + "step": 19210 + }, + { + "epoch": 0.683317038485468, + "grad_norm": 1.6419110298156738, + "learning_rate": 1.8321263139559218e-05, + "loss": 1.8398, + "step": 19220 + }, + { + "epoch": 0.6836725624388943, + "grad_norm": 1.6041102409362793, + "learning_rate": 1.831908901997987e-05, + "loss": 1.839, + "step": 19230 + }, + { + "epoch": 0.6840280863923207, + "grad_norm": 1.629111886024475, + "learning_rate": 1.8316913622640607e-05, + "loss": 1.8722, + "step": 19240 + }, + { + "epoch": 0.684383610345747, + "grad_norm": 1.5613747835159302, + "learning_rate": 1.8314736947875565e-05, + "loss": 1.8715, + "step": 19250 + }, + { + "epoch": 0.6847391342991734, + "grad_norm": 1.6306045055389404, + "learning_rate": 1.831255899601906e-05, + "loss": 1.835, + "step": 19260 + }, + { + "epoch": 0.6850946582525997, + "grad_norm": 1.7598761320114136, + "learning_rate": 1.831037976740561e-05, + "loss": 1.8467, + "step": 19270 + }, + { + "epoch": 0.6854501822060262, + "grad_norm": 1.5451831817626953, + "learning_rate": 1.8308199262369935e-05, + "loss": 1.8815, + "step": 19280 + }, + { + "epoch": 0.6858057061594525, + "grad_norm": 1.6422587633132935, + "learning_rate": 1.830601748124695e-05, + "loss": 1.8161, + "step": 19290 + }, + { + "epoch": 0.6861612301128789, + "grad_norm": 1.535333275794983, + "learning_rate": 1.8303834424371752e-05, + "loss": 1.8079, + "step": 19300 + }, + { + "epoch": 0.6865167540663052, + "grad_norm": 1.7117635011672974, + "learning_rate": 1.8301650092079655e-05, + "loss": 1.8575, + "step": 19310 + }, + { + "epoch": 0.6868722780197316, + "grad_norm": 1.553102970123291, + "learning_rate": 1.829946448470616e-05, + "loss": 1.8938, + "step": 19320 + }, + { + "epoch": 0.6872278019731579, + "grad_norm": 1.6548651456832886, + "learning_rate": 1.8297277602586952e-05, + "loss": 1.8293, + "step": 19330 + }, + { + "epoch": 0.6875833259265843, + "grad_norm": 1.6911052465438843, + "learning_rate": 1.8295089446057935e-05, + "loss": 1.8591, + "step": 19340 + }, + { + "epoch": 0.6879388498800106, + "grad_norm": 1.631972074508667, + "learning_rate": 1.8292900015455182e-05, + "loss": 1.8898, + "step": 19350 + }, + { + "epoch": 0.6882943738334371, + "grad_norm": 1.6100616455078125, + "learning_rate": 1.829070931111499e-05, + "loss": 1.8708, + "step": 19360 + }, + { + "epoch": 0.6886498977868634, + "grad_norm": 1.6359871625900269, + "learning_rate": 1.828851733337383e-05, + "loss": 1.896, + "step": 19370 + }, + { + "epoch": 0.6890054217402898, + "grad_norm": 1.5799747705459595, + "learning_rate": 1.8286324082568376e-05, + "loss": 1.83, + "step": 19380 + }, + { + "epoch": 0.6893609456937161, + "grad_norm": 1.637993335723877, + "learning_rate": 1.8284129559035497e-05, + "loss": 1.8788, + "step": 19390 + }, + { + "epoch": 0.6897164696471425, + "grad_norm": 1.6041759252548218, + "learning_rate": 1.828193376311226e-05, + "loss": 1.8239, + "step": 19400 + }, + { + "epoch": 0.6900719936005688, + "grad_norm": 1.6677876710891724, + "learning_rate": 1.8279736695135927e-05, + "loss": 1.8772, + "step": 19410 + }, + { + "epoch": 0.6904275175539952, + "grad_norm": 1.6391844749450684, + "learning_rate": 1.8277538355443952e-05, + "loss": 1.9006, + "step": 19420 + }, + { + "epoch": 0.6907830415074215, + "grad_norm": 1.6198102235794067, + "learning_rate": 1.8275338744373985e-05, + "loss": 1.8581, + "step": 19430 + }, + { + "epoch": 0.691138565460848, + "grad_norm": 1.7414060831069946, + "learning_rate": 1.827313786226387e-05, + "loss": 1.879, + "step": 19440 + }, + { + "epoch": 0.6914940894142743, + "grad_norm": 1.5909712314605713, + "learning_rate": 1.827093570945166e-05, + "loss": 1.8889, + "step": 19450 + }, + { + "epoch": 0.6918496133677007, + "grad_norm": 1.6056509017944336, + "learning_rate": 1.826873228627558e-05, + "loss": 1.8276, + "step": 19460 + }, + { + "epoch": 0.692205137321127, + "grad_norm": 1.7446337938308716, + "learning_rate": 1.8266527593074065e-05, + "loss": 1.8435, + "step": 19470 + }, + { + "epoch": 0.6925606612745534, + "grad_norm": 1.7024445533752441, + "learning_rate": 1.8264321630185745e-05, + "loss": 1.8977, + "step": 19480 + }, + { + "epoch": 0.6929161852279797, + "grad_norm": 1.7875020503997803, + "learning_rate": 1.826211439794944e-05, + "loss": 1.8498, + "step": 19490 + }, + { + "epoch": 0.6932717091814061, + "grad_norm": 1.6902720928192139, + "learning_rate": 1.8259905896704167e-05, + "loss": 1.8503, + "step": 19500 + }, + { + "epoch": 0.6936272331348324, + "grad_norm": 1.670092225074768, + "learning_rate": 1.825769612678914e-05, + "loss": 1.8279, + "step": 19510 + }, + { + "epoch": 0.6939827570882589, + "grad_norm": 1.6163181066513062, + "learning_rate": 1.825548508854376e-05, + "loss": 1.8397, + "step": 19520 + }, + { + "epoch": 0.6943382810416852, + "grad_norm": 1.6588308811187744, + "learning_rate": 1.8253272782307636e-05, + "loss": 1.8506, + "step": 19530 + }, + { + "epoch": 0.6946938049951116, + "grad_norm": 1.6108286380767822, + "learning_rate": 1.825105920842056e-05, + "loss": 1.8466, + "step": 19540 + }, + { + "epoch": 0.6950493289485379, + "grad_norm": 1.7074838876724243, + "learning_rate": 1.8248844367222526e-05, + "loss": 1.848, + "step": 19550 + }, + { + "epoch": 0.6954048529019643, + "grad_norm": 1.5951340198516846, + "learning_rate": 1.8246628259053716e-05, + "loss": 1.8351, + "step": 19560 + }, + { + "epoch": 0.6957603768553906, + "grad_norm": 1.5945497751235962, + "learning_rate": 1.8244410884254514e-05, + "loss": 1.7857, + "step": 19570 + }, + { + "epoch": 0.696115900808817, + "grad_norm": 1.590248942375183, + "learning_rate": 1.8242192243165488e-05, + "loss": 1.8183, + "step": 19580 + }, + { + "epoch": 0.6964714247622433, + "grad_norm": 1.6188017129898071, + "learning_rate": 1.8239972336127415e-05, + "loss": 1.8749, + "step": 19590 + }, + { + "epoch": 0.6968269487156697, + "grad_norm": 1.7116796970367432, + "learning_rate": 1.8237751163481258e-05, + "loss": 1.8329, + "step": 19600 + }, + { + "epoch": 0.697182472669096, + "grad_norm": 1.6828880310058594, + "learning_rate": 1.8235528725568174e-05, + "loss": 1.8843, + "step": 19610 + }, + { + "epoch": 0.6975379966225225, + "grad_norm": 1.5773065090179443, + "learning_rate": 1.8233305022729513e-05, + "loss": 1.7972, + "step": 19620 + }, + { + "epoch": 0.6978935205759488, + "grad_norm": 1.7473276853561401, + "learning_rate": 1.8231080055306824e-05, + "loss": 1.8221, + "step": 19630 + }, + { + "epoch": 0.6982490445293752, + "grad_norm": 1.6167118549346924, + "learning_rate": 1.8228853823641847e-05, + "loss": 1.8207, + "step": 19640 + }, + { + "epoch": 0.6986045684828015, + "grad_norm": 1.5284688472747803, + "learning_rate": 1.822662632807652e-05, + "loss": 1.868, + "step": 19650 + }, + { + "epoch": 0.6989600924362279, + "grad_norm": 1.6059260368347168, + "learning_rate": 1.822439756895297e-05, + "loss": 1.8879, + "step": 19660 + }, + { + "epoch": 0.6993156163896542, + "grad_norm": 1.6518758535385132, + "learning_rate": 1.8222167546613518e-05, + "loss": 1.8398, + "step": 19670 + }, + { + "epoch": 0.6996711403430806, + "grad_norm": 1.7268545627593994, + "learning_rate": 1.8219936261400686e-05, + "loss": 1.8351, + "step": 19680 + }, + { + "epoch": 0.700026664296507, + "grad_norm": 1.6293846368789673, + "learning_rate": 1.8217703713657186e-05, + "loss": 1.8332, + "step": 19690 + }, + { + "epoch": 0.7003821882499334, + "grad_norm": 1.6023805141448975, + "learning_rate": 1.8215469903725917e-05, + "loss": 1.8326, + "step": 19700 + }, + { + "epoch": 0.7007377122033597, + "grad_norm": 1.5513273477554321, + "learning_rate": 1.8213234831949984e-05, + "loss": 1.8217, + "step": 19710 + }, + { + "epoch": 0.7010932361567861, + "grad_norm": 1.6523417234420776, + "learning_rate": 1.821099849867268e-05, + "loss": 1.8465, + "step": 19720 + }, + { + "epoch": 0.7014487601102124, + "grad_norm": 1.6089566946029663, + "learning_rate": 1.8208760904237488e-05, + "loss": 1.8304, + "step": 19730 + }, + { + "epoch": 0.7018042840636388, + "grad_norm": 1.6778578758239746, + "learning_rate": 1.8206522048988092e-05, + "loss": 1.8375, + "step": 19740 + }, + { + "epoch": 0.7021598080170651, + "grad_norm": 1.645105242729187, + "learning_rate": 1.820428193326836e-05, + "loss": 1.8401, + "step": 19750 + }, + { + "epoch": 0.7025153319704915, + "grad_norm": 1.6396735906600952, + "learning_rate": 1.8202040557422373e-05, + "loss": 1.8505, + "step": 19760 + }, + { + "epoch": 0.7028708559239178, + "grad_norm": 1.620423674583435, + "learning_rate": 1.8199797921794372e-05, + "loss": 1.8303, + "step": 19770 + }, + { + "epoch": 0.7032263798773443, + "grad_norm": 1.6041761636734009, + "learning_rate": 1.8197554026728834e-05, + "loss": 1.8677, + "step": 19780 + }, + { + "epoch": 0.7035819038307706, + "grad_norm": 1.585405707359314, + "learning_rate": 1.819530887257039e-05, + "loss": 1.7923, + "step": 19790 + }, + { + "epoch": 0.703937427784197, + "grad_norm": 1.6111605167388916, + "learning_rate": 1.819306245966389e-05, + "loss": 1.84, + "step": 19800 + }, + { + "epoch": 0.7042929517376233, + "grad_norm": 1.5929813385009766, + "learning_rate": 1.819081478835437e-05, + "loss": 1.8554, + "step": 19810 + }, + { + "epoch": 0.7046484756910497, + "grad_norm": 1.593061923980713, + "learning_rate": 1.818856585898705e-05, + "loss": 1.8259, + "step": 19820 + }, + { + "epoch": 0.705003999644476, + "grad_norm": 1.6575653553009033, + "learning_rate": 1.818631567190736e-05, + "loss": 1.8342, + "step": 19830 + }, + { + "epoch": 0.7053595235979024, + "grad_norm": 1.563677430152893, + "learning_rate": 1.8184064227460912e-05, + "loss": 1.8102, + "step": 19840 + }, + { + "epoch": 0.7057150475513287, + "grad_norm": 1.7218552827835083, + "learning_rate": 1.818181152599351e-05, + "loss": 1.801, + "step": 19850 + }, + { + "epoch": 0.7060705715047552, + "grad_norm": 1.6597864627838135, + "learning_rate": 1.817955756785116e-05, + "loss": 1.8578, + "step": 19860 + }, + { + "epoch": 0.7064260954581815, + "grad_norm": 1.6149235963821411, + "learning_rate": 1.8177302353380053e-05, + "loss": 1.8695, + "step": 19870 + }, + { + "epoch": 0.7067816194116079, + "grad_norm": 1.667304277420044, + "learning_rate": 1.817504588292658e-05, + "loss": 1.9101, + "step": 19880 + }, + { + "epoch": 0.7071371433650342, + "grad_norm": 1.4457412958145142, + "learning_rate": 1.8172788156837312e-05, + "loss": 1.8208, + "step": 19890 + }, + { + "epoch": 0.7074926673184606, + "grad_norm": 1.7635536193847656, + "learning_rate": 1.817052917545903e-05, + "loss": 1.8476, + "step": 19900 + }, + { + "epoch": 0.7078481912718869, + "grad_norm": 1.6602329015731812, + "learning_rate": 1.8168268939138696e-05, + "loss": 1.8366, + "step": 19910 + }, + { + "epoch": 0.7082037152253133, + "grad_norm": 1.601374864578247, + "learning_rate": 1.8166007448223467e-05, + "loss": 1.8494, + "step": 19920 + }, + { + "epoch": 0.7085592391787396, + "grad_norm": 1.667270302772522, + "learning_rate": 1.8163744703060698e-05, + "loss": 1.8107, + "step": 19930 + }, + { + "epoch": 0.7089147631321661, + "grad_norm": 1.6129560470581055, + "learning_rate": 1.816148070399793e-05, + "loss": 1.8337, + "step": 19940 + }, + { + "epoch": 0.7092702870855924, + "grad_norm": 1.6151974201202393, + "learning_rate": 1.8159215451382894e-05, + "loss": 1.8616, + "step": 19950 + }, + { + "epoch": 0.7096258110390188, + "grad_norm": 1.6923866271972656, + "learning_rate": 1.815694894556353e-05, + "loss": 1.8846, + "step": 19960 + }, + { + "epoch": 0.7099813349924451, + "grad_norm": 1.7572191953659058, + "learning_rate": 1.8154681186887946e-05, + "loss": 1.9025, + "step": 19970 + }, + { + "epoch": 0.7103368589458715, + "grad_norm": 1.613426685333252, + "learning_rate": 1.8152412175704464e-05, + "loss": 1.808, + "step": 19980 + }, + { + "epoch": 0.7106923828992978, + "grad_norm": 1.656322956085205, + "learning_rate": 1.8150141912361586e-05, + "loss": 1.8517, + "step": 19990 + }, + { + "epoch": 0.7110479068527242, + "grad_norm": 1.6725047826766968, + "learning_rate": 1.8147870397208017e-05, + "loss": 1.8071, + "step": 20000 + }, + { + "epoch": 0.7114034308061505, + "grad_norm": 1.5440058708190918, + "learning_rate": 1.814559763059264e-05, + "loss": 1.9088, + "step": 20010 + }, + { + "epoch": 0.711758954759577, + "grad_norm": 1.6006666421890259, + "learning_rate": 1.8143323612864542e-05, + "loss": 1.8601, + "step": 20020 + }, + { + "epoch": 0.7121144787130033, + "grad_norm": 1.6369351148605347, + "learning_rate": 1.8141048344372994e-05, + "loss": 1.823, + "step": 20030 + }, + { + "epoch": 0.7124700026664297, + "grad_norm": 1.6196844577789307, + "learning_rate": 1.8138771825467462e-05, + "loss": 1.8227, + "step": 20040 + }, + { + "epoch": 0.712825526619856, + "grad_norm": 1.5950103998184204, + "learning_rate": 1.8136494056497614e-05, + "loss": 1.8067, + "step": 20050 + }, + { + "epoch": 0.7131810505732824, + "grad_norm": 1.6997015476226807, + "learning_rate": 1.8134215037813293e-05, + "loss": 1.7878, + "step": 20060 + }, + { + "epoch": 0.7135365745267087, + "grad_norm": 1.6241003274917603, + "learning_rate": 1.813193476976454e-05, + "loss": 1.8432, + "step": 20070 + }, + { + "epoch": 0.7138920984801351, + "grad_norm": 1.6806402206420898, + "learning_rate": 1.81296532527016e-05, + "loss": 1.8331, + "step": 20080 + }, + { + "epoch": 0.7142476224335614, + "grad_norm": 1.7081027030944824, + "learning_rate": 1.812737048697489e-05, + "loss": 1.8474, + "step": 20090 + }, + { + "epoch": 0.7146031463869879, + "grad_norm": 1.6192286014556885, + "learning_rate": 1.8125086472935034e-05, + "loss": 1.8737, + "step": 20100 + }, + { + "epoch": 0.7149586703404142, + "grad_norm": 1.6434106826782227, + "learning_rate": 1.812280121093284e-05, + "loss": 1.8605, + "step": 20110 + }, + { + "epoch": 0.7153141942938406, + "grad_norm": 1.6780080795288086, + "learning_rate": 1.812051470131931e-05, + "loss": 1.8252, + "step": 20120 + }, + { + "epoch": 0.7156697182472669, + "grad_norm": 1.6471208333969116, + "learning_rate": 1.811822694444564e-05, + "loss": 1.8707, + "step": 20130 + }, + { + "epoch": 0.7160252422006933, + "grad_norm": 1.6043684482574463, + "learning_rate": 1.8115937940663212e-05, + "loss": 1.8365, + "step": 20140 + }, + { + "epoch": 0.7163807661541196, + "grad_norm": 1.75435209274292, + "learning_rate": 1.8113647690323603e-05, + "loss": 1.8368, + "step": 20150 + }, + { + "epoch": 0.716736290107546, + "grad_norm": 1.616033911705017, + "learning_rate": 1.8111356193778577e-05, + "loss": 1.8305, + "step": 20160 + }, + { + "epoch": 0.7170918140609723, + "grad_norm": 1.5769684314727783, + "learning_rate": 1.8109063451380105e-05, + "loss": 1.8513, + "step": 20170 + }, + { + "epoch": 0.7174473380143987, + "grad_norm": 1.749438762664795, + "learning_rate": 1.8106769463480328e-05, + "loss": 1.8363, + "step": 20180 + }, + { + "epoch": 0.7178028619678251, + "grad_norm": 1.6878730058670044, + "learning_rate": 1.810447423043159e-05, + "loss": 1.7915, + "step": 20190 + }, + { + "epoch": 0.7181583859212515, + "grad_norm": 1.5220178365707397, + "learning_rate": 1.810217775258643e-05, + "loss": 1.8436, + "step": 20200 + }, + { + "epoch": 0.7185139098746778, + "grad_norm": 1.58015775680542, + "learning_rate": 1.8099880030297567e-05, + "loss": 1.8548, + "step": 20210 + }, + { + "epoch": 0.7188694338281042, + "grad_norm": 1.6119468212127686, + "learning_rate": 1.809758106391792e-05, + "loss": 1.8813, + "step": 20220 + }, + { + "epoch": 0.7192249577815305, + "grad_norm": 1.719566822052002, + "learning_rate": 1.809528085380059e-05, + "loss": 1.855, + "step": 20230 + }, + { + "epoch": 0.7195804817349569, + "grad_norm": 1.649501085281372, + "learning_rate": 1.8092979400298877e-05, + "loss": 1.8391, + "step": 20240 + }, + { + "epoch": 0.7199360056883832, + "grad_norm": 1.703221321105957, + "learning_rate": 1.8090676703766276e-05, + "loss": 1.9056, + "step": 20250 + }, + { + "epoch": 0.7202915296418096, + "grad_norm": 1.6187703609466553, + "learning_rate": 1.808837276455646e-05, + "loss": 1.8322, + "step": 20260 + }, + { + "epoch": 0.720647053595236, + "grad_norm": 1.7238303422927856, + "learning_rate": 1.80860675830233e-05, + "loss": 1.8094, + "step": 20270 + }, + { + "epoch": 0.7210025775486624, + "grad_norm": 1.573097586631775, + "learning_rate": 1.8083761159520862e-05, + "loss": 1.8923, + "step": 20280 + }, + { + "epoch": 0.7213581015020887, + "grad_norm": 1.6466491222381592, + "learning_rate": 1.8081453494403395e-05, + "loss": 1.8035, + "step": 20290 + }, + { + "epoch": 0.7217136254555151, + "grad_norm": 1.5461498498916626, + "learning_rate": 1.807914458802534e-05, + "loss": 1.8699, + "step": 20300 + }, + { + "epoch": 0.7220691494089414, + "grad_norm": 1.7276533842086792, + "learning_rate": 1.807683444074134e-05, + "loss": 1.8877, + "step": 20310 + }, + { + "epoch": 0.7224246733623678, + "grad_norm": 1.6024223566055298, + "learning_rate": 1.8074523052906204e-05, + "loss": 1.8637, + "step": 20320 + }, + { + "epoch": 0.7227801973157941, + "grad_norm": 1.538771390914917, + "learning_rate": 1.807221042487496e-05, + "loss": 1.8827, + "step": 20330 + }, + { + "epoch": 0.7231357212692205, + "grad_norm": 1.727202296257019, + "learning_rate": 1.8069896557002805e-05, + "loss": 1.8885, + "step": 20340 + }, + { + "epoch": 0.7234912452226469, + "grad_norm": 1.6250510215759277, + "learning_rate": 1.8067581449645137e-05, + "loss": 1.8255, + "step": 20350 + }, + { + "epoch": 0.7238467691760733, + "grad_norm": 1.5995020866394043, + "learning_rate": 1.8065265103157546e-05, + "loss": 1.834, + "step": 20360 + }, + { + "epoch": 0.7242022931294996, + "grad_norm": 1.6498067378997803, + "learning_rate": 1.80629475178958e-05, + "loss": 1.8596, + "step": 20370 + }, + { + "epoch": 0.724557817082926, + "grad_norm": 1.680664300918579, + "learning_rate": 1.8060628694215875e-05, + "loss": 1.8338, + "step": 20380 + }, + { + "epoch": 0.7249133410363523, + "grad_norm": 1.6563605070114136, + "learning_rate": 1.805830863247392e-05, + "loss": 1.8466, + "step": 20390 + }, + { + "epoch": 0.7252688649897787, + "grad_norm": 1.557987928390503, + "learning_rate": 1.8055987333026286e-05, + "loss": 1.8276, + "step": 20400 + }, + { + "epoch": 0.725624388943205, + "grad_norm": 1.6893463134765625, + "learning_rate": 1.8053664796229508e-05, + "loss": 1.7999, + "step": 20410 + }, + { + "epoch": 0.7259799128966314, + "grad_norm": 1.7025909423828125, + "learning_rate": 1.8051341022440315e-05, + "loss": 1.8007, + "step": 20420 + }, + { + "epoch": 0.7263354368500577, + "grad_norm": 1.5952210426330566, + "learning_rate": 1.8049016012015626e-05, + "loss": 1.8007, + "step": 20430 + }, + { + "epoch": 0.7266909608034842, + "grad_norm": 1.640624761581421, + "learning_rate": 1.804668976531254e-05, + "loss": 1.8254, + "step": 20440 + }, + { + "epoch": 0.7270464847569105, + "grad_norm": 1.5979695320129395, + "learning_rate": 1.8044362282688365e-05, + "loss": 1.8436, + "step": 20450 + }, + { + "epoch": 0.7274020087103369, + "grad_norm": 1.5913257598876953, + "learning_rate": 1.804203356450058e-05, + "loss": 1.8661, + "step": 20460 + }, + { + "epoch": 0.7277575326637632, + "grad_norm": 1.637255072593689, + "learning_rate": 1.803970361110686e-05, + "loss": 1.8876, + "step": 20470 + }, + { + "epoch": 0.7281130566171896, + "grad_norm": 1.7362253665924072, + "learning_rate": 1.8037372422865076e-05, + "loss": 1.8607, + "step": 20480 + }, + { + "epoch": 0.7284685805706159, + "grad_norm": 1.586946964263916, + "learning_rate": 1.8035040000133284e-05, + "loss": 1.8653, + "step": 20490 + }, + { + "epoch": 0.7288241045240423, + "grad_norm": 1.6418026685714722, + "learning_rate": 1.803270634326973e-05, + "loss": 1.8445, + "step": 20500 + }, + { + "epoch": 0.7291796284774686, + "grad_norm": 1.6286134719848633, + "learning_rate": 1.803037145263284e-05, + "loss": 1.8348, + "step": 20510 + }, + { + "epoch": 0.7295351524308951, + "grad_norm": 1.7448618412017822, + "learning_rate": 1.802803532858125e-05, + "loss": 1.8338, + "step": 20520 + }, + { + "epoch": 0.7298906763843214, + "grad_norm": 1.7649121284484863, + "learning_rate": 1.8025697971473774e-05, + "loss": 1.8344, + "step": 20530 + }, + { + "epoch": 0.7302462003377478, + "grad_norm": 1.579267144203186, + "learning_rate": 1.8023359381669406e-05, + "loss": 1.8344, + "step": 20540 + }, + { + "epoch": 0.7306017242911741, + "grad_norm": 1.6077048778533936, + "learning_rate": 1.8021019559527343e-05, + "loss": 1.8123, + "step": 20550 + }, + { + "epoch": 0.7309572482446005, + "grad_norm": 1.6492174863815308, + "learning_rate": 1.8018678505406972e-05, + "loss": 1.8624, + "step": 20560 + }, + { + "epoch": 0.7313127721980268, + "grad_norm": 1.5793914794921875, + "learning_rate": 1.801633621966786e-05, + "loss": 1.817, + "step": 20570 + }, + { + "epoch": 0.7316682961514532, + "grad_norm": 1.6046448945999146, + "learning_rate": 1.8013992702669763e-05, + "loss": 1.8189, + "step": 20580 + }, + { + "epoch": 0.7320238201048795, + "grad_norm": 1.5873152017593384, + "learning_rate": 1.801164795477264e-05, + "loss": 1.9034, + "step": 20590 + }, + { + "epoch": 0.732379344058306, + "grad_norm": 1.6548078060150146, + "learning_rate": 1.800930197633662e-05, + "loss": 1.8272, + "step": 20600 + }, + { + "epoch": 0.7327348680117323, + "grad_norm": 1.6251716613769531, + "learning_rate": 1.8006954767722037e-05, + "loss": 1.8533, + "step": 20610 + }, + { + "epoch": 0.7330903919651587, + "grad_norm": 1.6041985750198364, + "learning_rate": 1.8004606329289408e-05, + "loss": 1.8543, + "step": 20620 + }, + { + "epoch": 0.733445915918585, + "grad_norm": 1.686060905456543, + "learning_rate": 1.800225666139943e-05, + "loss": 1.8671, + "step": 20630 + }, + { + "epoch": 0.7338014398720114, + "grad_norm": 1.6034380197525024, + "learning_rate": 1.799990576441301e-05, + "loss": 1.8507, + "step": 20640 + }, + { + "epoch": 0.7341569638254377, + "grad_norm": 1.6069995164871216, + "learning_rate": 1.799755363869122e-05, + "loss": 1.8578, + "step": 20650 + }, + { + "epoch": 0.7345124877788641, + "grad_norm": 1.68937087059021, + "learning_rate": 1.799520028459534e-05, + "loss": 1.8286, + "step": 20660 + }, + { + "epoch": 0.7348680117322904, + "grad_norm": 1.651024580001831, + "learning_rate": 1.7992845702486824e-05, + "loss": 1.8287, + "step": 20670 + }, + { + "epoch": 0.7352235356857169, + "grad_norm": 1.6019994020462036, + "learning_rate": 1.7990489892727322e-05, + "loss": 1.8236, + "step": 20680 + }, + { + "epoch": 0.7355790596391432, + "grad_norm": 1.6435049772262573, + "learning_rate": 1.7988132855678676e-05, + "loss": 1.8916, + "step": 20690 + }, + { + "epoch": 0.7359345835925696, + "grad_norm": 1.6221009492874146, + "learning_rate": 1.7985774591702907e-05, + "loss": 1.8688, + "step": 20700 + }, + { + "epoch": 0.7362901075459959, + "grad_norm": 1.6398005485534668, + "learning_rate": 1.7983415101162235e-05, + "loss": 1.8534, + "step": 20710 + }, + { + "epoch": 0.7366456314994223, + "grad_norm": 1.5574488639831543, + "learning_rate": 1.798105438441906e-05, + "loss": 1.8703, + "step": 20720 + }, + { + "epoch": 0.7370011554528486, + "grad_norm": 1.7029882669448853, + "learning_rate": 1.797869244183597e-05, + "loss": 1.8173, + "step": 20730 + }, + { + "epoch": 0.737356679406275, + "grad_norm": 1.6728116273880005, + "learning_rate": 1.797632927377575e-05, + "loss": 1.8568, + "step": 20740 + }, + { + "epoch": 0.7377122033597013, + "grad_norm": 1.5825793743133545, + "learning_rate": 1.7973964880601364e-05, + "loss": 1.825, + "step": 20750 + }, + { + "epoch": 0.7380677273131278, + "grad_norm": 1.7137134075164795, + "learning_rate": 1.797159926267597e-05, + "loss": 1.8368, + "step": 20760 + }, + { + "epoch": 0.7384232512665541, + "grad_norm": 1.5658918619155884, + "learning_rate": 1.796923242036291e-05, + "loss": 1.8178, + "step": 20770 + }, + { + "epoch": 0.7387787752199805, + "grad_norm": 1.6084784269332886, + "learning_rate": 1.7966864354025722e-05, + "loss": 1.8116, + "step": 20780 + }, + { + "epoch": 0.7391342991734068, + "grad_norm": 1.6091594696044922, + "learning_rate": 1.796449506402812e-05, + "loss": 1.835, + "step": 20790 + }, + { + "epoch": 0.7394898231268332, + "grad_norm": 1.8020347356796265, + "learning_rate": 1.7962124550734013e-05, + "loss": 1.819, + "step": 20800 + }, + { + "epoch": 0.7398453470802595, + "grad_norm": 1.6283994913101196, + "learning_rate": 1.7959752814507498e-05, + "loss": 1.8651, + "step": 20810 + }, + { + "epoch": 0.7402008710336859, + "grad_norm": 1.6090404987335205, + "learning_rate": 1.7957379855712858e-05, + "loss": 1.8372, + "step": 20820 + }, + { + "epoch": 0.7405563949871122, + "grad_norm": 1.6721382141113281, + "learning_rate": 1.7955005674714567e-05, + "loss": 1.8023, + "step": 20830 + }, + { + "epoch": 0.7409119189405386, + "grad_norm": 1.683385968208313, + "learning_rate": 1.795263027187728e-05, + "loss": 1.8386, + "step": 20840 + }, + { + "epoch": 0.741267442893965, + "grad_norm": 1.7011699676513672, + "learning_rate": 1.795025364756585e-05, + "loss": 1.8509, + "step": 20850 + }, + { + "epoch": 0.7416229668473914, + "grad_norm": 1.6731226444244385, + "learning_rate": 1.7947875802145307e-05, + "loss": 1.84, + "step": 20860 + }, + { + "epoch": 0.7419784908008177, + "grad_norm": 1.5665476322174072, + "learning_rate": 1.7945496735980872e-05, + "loss": 1.842, + "step": 20870 + }, + { + "epoch": 0.7423340147542441, + "grad_norm": 1.6955523490905762, + "learning_rate": 1.794311644943796e-05, + "loss": 1.862, + "step": 20880 + }, + { + "epoch": 0.7426895387076704, + "grad_norm": 1.7335302829742432, + "learning_rate": 1.7940734942882164e-05, + "loss": 1.8447, + "step": 20890 + }, + { + "epoch": 0.7430450626610968, + "grad_norm": 1.6597357988357544, + "learning_rate": 1.7938352216679267e-05, + "loss": 1.8865, + "step": 20900 + }, + { + "epoch": 0.7434005866145231, + "grad_norm": 1.6029428243637085, + "learning_rate": 1.793596827119525e-05, + "loss": 1.8061, + "step": 20910 + }, + { + "epoch": 0.7437561105679495, + "grad_norm": 1.6280450820922852, + "learning_rate": 1.7933583106796263e-05, + "loss": 1.8168, + "step": 20920 + }, + { + "epoch": 0.7441116345213759, + "grad_norm": 1.4758005142211914, + "learning_rate": 1.7931196723848652e-05, + "loss": 1.8337, + "step": 20930 + }, + { + "epoch": 0.7444671584748023, + "grad_norm": 1.6232197284698486, + "learning_rate": 1.7928809122718955e-05, + "loss": 1.817, + "step": 20940 + }, + { + "epoch": 0.7448226824282286, + "grad_norm": 1.726462960243225, + "learning_rate": 1.792642030377389e-05, + "loss": 1.8219, + "step": 20950 + }, + { + "epoch": 0.745178206381655, + "grad_norm": 1.620409369468689, + "learning_rate": 1.7924030267380365e-05, + "loss": 1.8492, + "step": 20960 + }, + { + "epoch": 0.7455337303350813, + "grad_norm": 1.5886775255203247, + "learning_rate": 1.7921639013905477e-05, + "loss": 1.815, + "step": 20970 + }, + { + "epoch": 0.7458892542885077, + "grad_norm": 1.500231146812439, + "learning_rate": 1.7919246543716502e-05, + "loss": 1.8441, + "step": 20980 + }, + { + "epoch": 0.746244778241934, + "grad_norm": 1.6466845273971558, + "learning_rate": 1.7916852857180913e-05, + "loss": 1.8523, + "step": 20990 + }, + { + "epoch": 0.7466003021953604, + "grad_norm": 1.5844566822052002, + "learning_rate": 1.7914457954666368e-05, + "loss": 1.861, + "step": 21000 + }, + { + "epoch": 0.7469558261487868, + "grad_norm": 1.6205304861068726, + "learning_rate": 1.79120618365407e-05, + "loss": 1.865, + "step": 21010 + }, + { + "epoch": 0.7473113501022132, + "grad_norm": 1.576826810836792, + "learning_rate": 1.7909664503171947e-05, + "loss": 1.8186, + "step": 21020 + }, + { + "epoch": 0.7476668740556395, + "grad_norm": 1.6844556331634521, + "learning_rate": 1.790726595492832e-05, + "loss": 1.8328, + "step": 21030 + }, + { + "epoch": 0.7480223980090659, + "grad_norm": 1.679314136505127, + "learning_rate": 1.7904866192178215e-05, + "loss": 1.8652, + "step": 21040 + }, + { + "epoch": 0.7483779219624922, + "grad_norm": 1.7030935287475586, + "learning_rate": 1.7902465215290233e-05, + "loss": 1.8043, + "step": 21050 + }, + { + "epoch": 0.7487334459159186, + "grad_norm": 1.6377439498901367, + "learning_rate": 1.790006302463314e-05, + "loss": 1.8494, + "step": 21060 + }, + { + "epoch": 0.7490889698693449, + "grad_norm": 1.6536033153533936, + "learning_rate": 1.7897659620575905e-05, + "loss": 1.8108, + "step": 21070 + }, + { + "epoch": 0.7494444938227713, + "grad_norm": 1.8303552865982056, + "learning_rate": 1.789525500348767e-05, + "loss": 1.8273, + "step": 21080 + }, + { + "epoch": 0.7498000177761976, + "grad_norm": 1.5810844898223877, + "learning_rate": 1.7892849173737764e-05, + "loss": 1.8176, + "step": 21090 + }, + { + "epoch": 0.7501555417296241, + "grad_norm": 1.5926487445831299, + "learning_rate": 1.789044213169572e-05, + "loss": 1.8238, + "step": 21100 + }, + { + "epoch": 0.7505110656830504, + "grad_norm": 1.6886205673217773, + "learning_rate": 1.7888033877731233e-05, + "loss": 1.8556, + "step": 21110 + }, + { + "epoch": 0.7508665896364768, + "grad_norm": 1.6423076391220093, + "learning_rate": 1.7885624412214204e-05, + "loss": 1.8218, + "step": 21120 + }, + { + "epoch": 0.7512221135899031, + "grad_norm": 1.5861601829528809, + "learning_rate": 1.7883213735514708e-05, + "loss": 1.8104, + "step": 21130 + }, + { + "epoch": 0.7515776375433295, + "grad_norm": 1.657173752784729, + "learning_rate": 1.788080184800301e-05, + "loss": 1.8889, + "step": 21140 + }, + { + "epoch": 0.7519331614967558, + "grad_norm": 1.7222282886505127, + "learning_rate": 1.787838875004956e-05, + "loss": 1.8262, + "step": 21150 + }, + { + "epoch": 0.7522886854501822, + "grad_norm": 1.6098462343215942, + "learning_rate": 1.7875974442024996e-05, + "loss": 1.7838, + "step": 21160 + }, + { + "epoch": 0.7526442094036085, + "grad_norm": 1.7243529558181763, + "learning_rate": 1.7873558924300143e-05, + "loss": 1.8078, + "step": 21170 + }, + { + "epoch": 0.752999733357035, + "grad_norm": 1.546335220336914, + "learning_rate": 1.7871142197246e-05, + "loss": 1.8422, + "step": 21180 + }, + { + "epoch": 0.7533552573104613, + "grad_norm": 1.7641113996505737, + "learning_rate": 1.786872426123378e-05, + "loss": 1.8102, + "step": 21190 + }, + { + "epoch": 0.7537107812638877, + "grad_norm": 1.620474100112915, + "learning_rate": 1.7866305116634843e-05, + "loss": 1.8326, + "step": 21200 + }, + { + "epoch": 0.754066305217314, + "grad_norm": 1.7397280931472778, + "learning_rate": 1.7863884763820762e-05, + "loss": 1.8162, + "step": 21210 + }, + { + "epoch": 0.7544218291707404, + "grad_norm": 1.6662113666534424, + "learning_rate": 1.786146320316329e-05, + "loss": 1.7993, + "step": 21220 + }, + { + "epoch": 0.7547773531241667, + "grad_norm": 1.6224888563156128, + "learning_rate": 1.785904043503436e-05, + "loss": 1.8431, + "step": 21230 + }, + { + "epoch": 0.7551328770775931, + "grad_norm": 1.585585117340088, + "learning_rate": 1.7856616459806097e-05, + "loss": 1.8341, + "step": 21240 + }, + { + "epoch": 0.7554884010310194, + "grad_norm": 1.7950774431228638, + "learning_rate": 1.7854191277850806e-05, + "loss": 1.832, + "step": 21250 + }, + { + "epoch": 0.7558439249844459, + "grad_norm": 1.6702255010604858, + "learning_rate": 1.785176488954098e-05, + "loss": 1.8655, + "step": 21260 + }, + { + "epoch": 0.7561994489378722, + "grad_norm": 1.6955862045288086, + "learning_rate": 1.7849337295249302e-05, + "loss": 1.8188, + "step": 21270 + }, + { + "epoch": 0.7565549728912986, + "grad_norm": 1.7578234672546387, + "learning_rate": 1.784690849534863e-05, + "loss": 1.8362, + "step": 21280 + }, + { + "epoch": 0.7569104968447249, + "grad_norm": 1.7137067317962646, + "learning_rate": 1.7844478490212012e-05, + "loss": 1.8034, + "step": 21290 + }, + { + "epoch": 0.7572660207981513, + "grad_norm": 1.7158740758895874, + "learning_rate": 1.7842047280212683e-05, + "loss": 1.8079, + "step": 21300 + }, + { + "epoch": 0.7576215447515776, + "grad_norm": 1.6505048274993896, + "learning_rate": 1.7839614865724064e-05, + "loss": 1.8146, + "step": 21310 + }, + { + "epoch": 0.757977068705004, + "grad_norm": 1.6587018966674805, + "learning_rate": 1.7837181247119756e-05, + "loss": 1.8379, + "step": 21320 + }, + { + "epoch": 0.7583325926584303, + "grad_norm": 1.5471725463867188, + "learning_rate": 1.7834746424773545e-05, + "loss": 1.866, + "step": 21330 + }, + { + "epoch": 0.7586881166118568, + "grad_norm": 1.5741422176361084, + "learning_rate": 1.7832310399059406e-05, + "loss": 1.8118, + "step": 21340 + }, + { + "epoch": 0.7590436405652831, + "grad_norm": 1.6571557521820068, + "learning_rate": 1.7829873170351505e-05, + "loss": 1.8439, + "step": 21350 + }, + { + "epoch": 0.7593991645187095, + "grad_norm": 1.5959396362304688, + "learning_rate": 1.7827434739024176e-05, + "loss": 1.8135, + "step": 21360 + }, + { + "epoch": 0.7597546884721358, + "grad_norm": 1.6723634004592896, + "learning_rate": 1.782499510545195e-05, + "loss": 1.8359, + "step": 21370 + }, + { + "epoch": 0.7601102124255622, + "grad_norm": 1.68485426902771, + "learning_rate": 1.7822554270009535e-05, + "loss": 1.8569, + "step": 21380 + }, + { + "epoch": 0.7604657363789885, + "grad_norm": 1.6855770349502563, + "learning_rate": 1.7820112233071837e-05, + "loss": 1.8868, + "step": 21390 + }, + { + "epoch": 0.7608212603324149, + "grad_norm": 1.7310736179351807, + "learning_rate": 1.781766899501393e-05, + "loss": 1.8345, + "step": 21400 + }, + { + "epoch": 0.7611767842858412, + "grad_norm": 1.6306757926940918, + "learning_rate": 1.781522455621108e-05, + "loss": 1.7802, + "step": 21410 + }, + { + "epoch": 0.7615323082392677, + "grad_norm": 1.6657934188842773, + "learning_rate": 1.7812778917038746e-05, + "loss": 1.7914, + "step": 21420 + }, + { + "epoch": 0.761887832192694, + "grad_norm": 1.5563294887542725, + "learning_rate": 1.7810332077872555e-05, + "loss": 1.8376, + "step": 21430 + }, + { + "epoch": 0.7622433561461204, + "grad_norm": 1.5677244663238525, + "learning_rate": 1.7807884039088326e-05, + "loss": 1.8033, + "step": 21440 + }, + { + "epoch": 0.7625988800995467, + "grad_norm": 1.5867130756378174, + "learning_rate": 1.7805434801062064e-05, + "loss": 1.8307, + "step": 21450 + }, + { + "epoch": 0.7629544040529731, + "grad_norm": 1.6075420379638672, + "learning_rate": 1.780298436416996e-05, + "loss": 1.8702, + "step": 21460 + }, + { + "epoch": 0.7633099280063994, + "grad_norm": 1.6167482137680054, + "learning_rate": 1.780053272878838e-05, + "loss": 1.8321, + "step": 21470 + }, + { + "epoch": 0.7636654519598258, + "grad_norm": 1.7103121280670166, + "learning_rate": 1.7798079895293884e-05, + "loss": 1.8619, + "step": 21480 + }, + { + "epoch": 0.7640209759132521, + "grad_norm": 1.6230714321136475, + "learning_rate": 1.779562586406321e-05, + "loss": 1.834, + "step": 21490 + }, + { + "epoch": 0.7643764998666785, + "grad_norm": 1.6490651369094849, + "learning_rate": 1.779317063547328e-05, + "loss": 1.8218, + "step": 21500 + }, + { + "epoch": 0.7647320238201049, + "grad_norm": 1.604256510734558, + "learning_rate": 1.779071420990121e-05, + "loss": 1.8535, + "step": 21510 + }, + { + "epoch": 0.7650875477735313, + "grad_norm": 1.7728480100631714, + "learning_rate": 1.7788256587724276e-05, + "loss": 1.8961, + "step": 21520 + }, + { + "epoch": 0.7654430717269576, + "grad_norm": 1.6369065046310425, + "learning_rate": 1.778579776931997e-05, + "loss": 1.7931, + "step": 21530 + }, + { + "epoch": 0.765798595680384, + "grad_norm": 1.5738849639892578, + "learning_rate": 1.7783337755065943e-05, + "loss": 1.83, + "step": 21540 + }, + { + "epoch": 0.7661541196338103, + "grad_norm": 1.7136260271072388, + "learning_rate": 1.7780876545340037e-05, + "loss": 1.8003, + "step": 21550 + }, + { + "epoch": 0.7665096435872367, + "grad_norm": 1.6939854621887207, + "learning_rate": 1.7778414140520283e-05, + "loss": 1.7789, + "step": 21560 + }, + { + "epoch": 0.766865167540663, + "grad_norm": 1.7023991346359253, + "learning_rate": 1.7775950540984887e-05, + "loss": 1.8382, + "step": 21570 + }, + { + "epoch": 0.7672206914940894, + "grad_norm": 1.6543914079666138, + "learning_rate": 1.777348574711224e-05, + "loss": 1.8753, + "step": 21580 + }, + { + "epoch": 0.7675762154475158, + "grad_norm": 1.6325597763061523, + "learning_rate": 1.777101975928093e-05, + "loss": 1.8415, + "step": 21590 + }, + { + "epoch": 0.7679317394009422, + "grad_norm": 1.754928708076477, + "learning_rate": 1.7768552577869702e-05, + "loss": 1.8399, + "step": 21600 + }, + { + "epoch": 0.7682872633543685, + "grad_norm": 1.6532490253448486, + "learning_rate": 1.776608420325751e-05, + "loss": 1.8202, + "step": 21610 + }, + { + "epoch": 0.7686427873077949, + "grad_norm": 1.6243687868118286, + "learning_rate": 1.7763614635823478e-05, + "loss": 1.8437, + "step": 21620 + }, + { + "epoch": 0.7689983112612212, + "grad_norm": 1.6320301294326782, + "learning_rate": 1.7761143875946918e-05, + "loss": 1.8326, + "step": 21630 + }, + { + "epoch": 0.7693538352146476, + "grad_norm": 1.5413025617599487, + "learning_rate": 1.7758671924007318e-05, + "loss": 1.8824, + "step": 21640 + }, + { + "epoch": 0.7697093591680739, + "grad_norm": 1.5559817552566528, + "learning_rate": 1.775619878038436e-05, + "loss": 1.8206, + "step": 21650 + }, + { + "epoch": 0.7700648831215003, + "grad_norm": 1.6865465641021729, + "learning_rate": 1.77537244454579e-05, + "loss": 1.7951, + "step": 21660 + }, + { + "epoch": 0.7704204070749266, + "grad_norm": 1.5911492109298706, + "learning_rate": 1.7751248919607982e-05, + "loss": 1.8744, + "step": 21670 + }, + { + "epoch": 0.7707759310283531, + "grad_norm": 1.618155837059021, + "learning_rate": 1.774877220321483e-05, + "loss": 1.8521, + "step": 21680 + }, + { + "epoch": 0.7711314549817794, + "grad_norm": 1.6658058166503906, + "learning_rate": 1.7746294296658853e-05, + "loss": 1.8676, + "step": 21690 + }, + { + "epoch": 0.7714869789352058, + "grad_norm": 1.526122808456421, + "learning_rate": 1.774381520032064e-05, + "loss": 1.8191, + "step": 21700 + }, + { + "epoch": 0.7718425028886321, + "grad_norm": 1.6593496799468994, + "learning_rate": 1.774133491458097e-05, + "loss": 1.8658, + "step": 21710 + }, + { + "epoch": 0.7721980268420585, + "grad_norm": 1.6845260858535767, + "learning_rate": 1.7738853439820796e-05, + "loss": 1.8233, + "step": 21720 + }, + { + "epoch": 0.7725535507954848, + "grad_norm": 1.6050273180007935, + "learning_rate": 1.7736370776421255e-05, + "loss": 1.8582, + "step": 21730 + }, + { + "epoch": 0.7729090747489112, + "grad_norm": 1.564800500869751, + "learning_rate": 1.7733886924763668e-05, + "loss": 1.8745, + "step": 21740 + }, + { + "epoch": 0.7732645987023375, + "grad_norm": 1.6082314252853394, + "learning_rate": 1.7731401885229546e-05, + "loss": 1.8409, + "step": 21750 + }, + { + "epoch": 0.773620122655764, + "grad_norm": 1.6496139764785767, + "learning_rate": 1.772891565820057e-05, + "loss": 1.8375, + "step": 21760 + }, + { + "epoch": 0.7739756466091903, + "grad_norm": 1.5334914922714233, + "learning_rate": 1.7726428244058605e-05, + "loss": 1.8176, + "step": 21770 + }, + { + "epoch": 0.7743311705626167, + "grad_norm": 1.6704349517822266, + "learning_rate": 1.7723939643185705e-05, + "loss": 1.8231, + "step": 21780 + }, + { + "epoch": 0.774686694516043, + "grad_norm": 1.5811091661453247, + "learning_rate": 1.7721449855964114e-05, + "loss": 1.8218, + "step": 21790 + }, + { + "epoch": 0.7750422184694694, + "grad_norm": 1.62055242061615, + "learning_rate": 1.7718958882776233e-05, + "loss": 1.8821, + "step": 21800 + }, + { + "epoch": 0.7753977424228957, + "grad_norm": 1.6094154119491577, + "learning_rate": 1.7716466724004667e-05, + "loss": 1.8133, + "step": 21810 + }, + { + "epoch": 0.7757532663763221, + "grad_norm": 1.6229323148727417, + "learning_rate": 1.7713973380032194e-05, + "loss": 1.812, + "step": 21820 + }, + { + "epoch": 0.7761087903297484, + "grad_norm": 1.5259974002838135, + "learning_rate": 1.771147885124178e-05, + "loss": 1.8637, + "step": 21830 + }, + { + "epoch": 0.7764643142831749, + "grad_norm": 1.6902457475662231, + "learning_rate": 1.770898313801656e-05, + "loss": 1.8161, + "step": 21840 + }, + { + "epoch": 0.7768198382366012, + "grad_norm": 1.6666581630706787, + "learning_rate": 1.7706486240739875e-05, + "loss": 1.8374, + "step": 21850 + }, + { + "epoch": 0.7771753621900276, + "grad_norm": 1.5865390300750732, + "learning_rate": 1.7703988159795214e-05, + "loss": 1.843, + "step": 21860 + }, + { + "epoch": 0.7775308861434539, + "grad_norm": 1.5332224369049072, + "learning_rate": 1.770148889556628e-05, + "loss": 1.7983, + "step": 21870 + }, + { + "epoch": 0.7778864100968803, + "grad_norm": 1.8453980684280396, + "learning_rate": 1.769898844843694e-05, + "loss": 1.8229, + "step": 21880 + }, + { + "epoch": 0.7782419340503066, + "grad_norm": 1.5702221393585205, + "learning_rate": 1.7696486818791247e-05, + "loss": 1.8288, + "step": 21890 + }, + { + "epoch": 0.778597458003733, + "grad_norm": 1.593741774559021, + "learning_rate": 1.7693984007013436e-05, + "loss": 1.8044, + "step": 21900 + }, + { + "epoch": 0.7789529819571593, + "grad_norm": 1.6769473552703857, + "learning_rate": 1.7691480013487926e-05, + "loss": 1.8076, + "step": 21910 + }, + { + "epoch": 0.7793085059105858, + "grad_norm": 1.734527349472046, + "learning_rate": 1.768897483859931e-05, + "loss": 1.8317, + "step": 21920 + }, + { + "epoch": 0.7796640298640121, + "grad_norm": 1.6322834491729736, + "learning_rate": 1.7686468482732367e-05, + "loss": 1.771, + "step": 21930 + }, + { + "epoch": 0.7800195538174385, + "grad_norm": 1.663217306137085, + "learning_rate": 1.7683960946272062e-05, + "loss": 1.8372, + "step": 21940 + }, + { + "epoch": 0.7803750777708648, + "grad_norm": 1.5416820049285889, + "learning_rate": 1.7681452229603532e-05, + "loss": 1.8194, + "step": 21950 + }, + { + "epoch": 0.7807306017242912, + "grad_norm": 1.6156864166259766, + "learning_rate": 1.7678942333112104e-05, + "loss": 1.8473, + "step": 21960 + }, + { + "epoch": 0.7810861256777175, + "grad_norm": 1.7344961166381836, + "learning_rate": 1.7676431257183283e-05, + "loss": 1.8233, + "step": 21970 + }, + { + "epoch": 0.7814416496311439, + "grad_norm": 1.6741567850112915, + "learning_rate": 1.7673919002202752e-05, + "loss": 1.8435, + "step": 21980 + }, + { + "epoch": 0.7817971735845702, + "grad_norm": 1.6223064661026, + "learning_rate": 1.7671405568556377e-05, + "loss": 1.855, + "step": 21990 + }, + { + "epoch": 0.7821526975379967, + "grad_norm": 1.665303349494934, + "learning_rate": 1.7668890956630204e-05, + "loss": 1.8395, + "step": 22000 + }, + { + "epoch": 0.782508221491423, + "grad_norm": 1.5375319719314575, + "learning_rate": 1.7666375166810466e-05, + "loss": 1.8605, + "step": 22010 + }, + { + "epoch": 0.7828637454448494, + "grad_norm": 1.5674186944961548, + "learning_rate": 1.7663858199483575e-05, + "loss": 1.8481, + "step": 22020 + }, + { + "epoch": 0.7832192693982757, + "grad_norm": 1.6025656461715698, + "learning_rate": 1.766134005503611e-05, + "loss": 1.8756, + "step": 22030 + }, + { + "epoch": 0.7835747933517021, + "grad_norm": 1.5113604068756104, + "learning_rate": 1.7658820733854857e-05, + "loss": 1.8223, + "step": 22040 + }, + { + "epoch": 0.7839303173051284, + "grad_norm": 1.7100368738174438, + "learning_rate": 1.765630023632676e-05, + "loss": 1.829, + "step": 22050 + }, + { + "epoch": 0.7842858412585548, + "grad_norm": 1.693702220916748, + "learning_rate": 1.7653778562838947e-05, + "loss": 1.8651, + "step": 22060 + }, + { + "epoch": 0.7846413652119811, + "grad_norm": 1.573425054550171, + "learning_rate": 1.765125571377874e-05, + "loss": 1.8451, + "step": 22070 + }, + { + "epoch": 0.7849968891654076, + "grad_norm": 1.6213774681091309, + "learning_rate": 1.7648731689533626e-05, + "loss": 1.8439, + "step": 22080 + }, + { + "epoch": 0.7853524131188339, + "grad_norm": 1.5298799276351929, + "learning_rate": 1.764620649049128e-05, + "loss": 1.7725, + "step": 22090 + }, + { + "epoch": 0.7857079370722603, + "grad_norm": 1.7361146211624146, + "learning_rate": 1.7643680117039567e-05, + "loss": 1.8386, + "step": 22100 + }, + { + "epoch": 0.7860634610256866, + "grad_norm": 1.5186938047409058, + "learning_rate": 1.764115256956651e-05, + "loss": 1.8684, + "step": 22110 + }, + { + "epoch": 0.786418984979113, + "grad_norm": 1.6889774799346924, + "learning_rate": 1.763862384846033e-05, + "loss": 1.8854, + "step": 22120 + }, + { + "epoch": 0.7867745089325393, + "grad_norm": 1.6956360340118408, + "learning_rate": 1.763609395410942e-05, + "loss": 1.8322, + "step": 22130 + }, + { + "epoch": 0.7871300328859657, + "grad_norm": 1.5466184616088867, + "learning_rate": 1.7633562886902357e-05, + "loss": 1.8281, + "step": 22140 + }, + { + "epoch": 0.787485556839392, + "grad_norm": 1.5690945386886597, + "learning_rate": 1.76310306472279e-05, + "loss": 1.8312, + "step": 22150 + }, + { + "epoch": 0.7878410807928184, + "grad_norm": 1.5939441919326782, + "learning_rate": 1.762849723547498e-05, + "loss": 1.8282, + "step": 22160 + }, + { + "epoch": 0.7881966047462448, + "grad_norm": 1.6708072423934937, + "learning_rate": 1.7625962652032718e-05, + "loss": 1.8702, + "step": 22170 + }, + { + "epoch": 0.7885521286996712, + "grad_norm": 1.666355013847351, + "learning_rate": 1.7623426897290406e-05, + "loss": 1.8174, + "step": 22180 + }, + { + "epoch": 0.7889076526530975, + "grad_norm": 1.6208865642547607, + "learning_rate": 1.7620889971637524e-05, + "loss": 1.8469, + "step": 22190 + }, + { + "epoch": 0.7892631766065239, + "grad_norm": 1.5655211210250854, + "learning_rate": 1.7618351875463723e-05, + "loss": 1.874, + "step": 22200 + }, + { + "epoch": 0.7896187005599502, + "grad_norm": 1.7172608375549316, + "learning_rate": 1.761581260915884e-05, + "loss": 1.8563, + "step": 22210 + }, + { + "epoch": 0.7899742245133766, + "grad_norm": 1.7233494520187378, + "learning_rate": 1.7613272173112894e-05, + "loss": 1.803, + "step": 22220 + }, + { + "epoch": 0.7903297484668029, + "grad_norm": 1.6641350984573364, + "learning_rate": 1.761073056771608e-05, + "loss": 1.8351, + "step": 22230 + }, + { + "epoch": 0.7906852724202293, + "grad_norm": 1.4610012769699097, + "learning_rate": 1.7608187793358766e-05, + "loss": 1.8061, + "step": 22240 + }, + { + "epoch": 0.7910407963736557, + "grad_norm": 1.6382813453674316, + "learning_rate": 1.7605643850431512e-05, + "loss": 1.7839, + "step": 22250 + }, + { + "epoch": 0.7913963203270821, + "grad_norm": 1.718408465385437, + "learning_rate": 1.7603098739325053e-05, + "loss": 1.8069, + "step": 22260 + }, + { + "epoch": 0.7917518442805084, + "grad_norm": 1.616042137145996, + "learning_rate": 1.76005524604303e-05, + "loss": 1.8654, + "step": 22270 + }, + { + "epoch": 0.7921073682339348, + "grad_norm": 1.6004387140274048, + "learning_rate": 1.759800501413834e-05, + "loss": 1.8279, + "step": 22280 + }, + { + "epoch": 0.7924628921873611, + "grad_norm": 1.590248942375183, + "learning_rate": 1.759545640084045e-05, + "loss": 1.8654, + "step": 22290 + }, + { + "epoch": 0.7928184161407875, + "grad_norm": 1.6553932428359985, + "learning_rate": 1.7592906620928085e-05, + "loss": 1.8634, + "step": 22300 + }, + { + "epoch": 0.7931739400942138, + "grad_norm": 1.6197720766067505, + "learning_rate": 1.759035567479287e-05, + "loss": 1.8303, + "step": 22310 + }, + { + "epoch": 0.7935294640476402, + "grad_norm": 1.6615225076675415, + "learning_rate": 1.7587803562826613e-05, + "loss": 1.794, + "step": 22320 + }, + { + "epoch": 0.7938849880010665, + "grad_norm": 1.5763661861419678, + "learning_rate": 1.7585250285421307e-05, + "loss": 1.7818, + "step": 22330 + }, + { + "epoch": 0.794240511954493, + "grad_norm": 1.6498136520385742, + "learning_rate": 1.7582695842969117e-05, + "loss": 1.8302, + "step": 22340 + }, + { + "epoch": 0.7945960359079193, + "grad_norm": 1.6580342054367065, + "learning_rate": 1.7580140235862386e-05, + "loss": 1.8225, + "step": 22350 + }, + { + "epoch": 0.7949515598613457, + "grad_norm": 1.6113598346710205, + "learning_rate": 1.7577583464493643e-05, + "loss": 1.8366, + "step": 22360 + }, + { + "epoch": 0.795307083814772, + "grad_norm": 1.7083193063735962, + "learning_rate": 1.7575025529255593e-05, + "loss": 1.8212, + "step": 22370 + }, + { + "epoch": 0.7956626077681984, + "grad_norm": 1.5660282373428345, + "learning_rate": 1.7572466430541123e-05, + "loss": 1.8118, + "step": 22380 + }, + { + "epoch": 0.7960181317216247, + "grad_norm": 1.6036208868026733, + "learning_rate": 1.7569906168743284e-05, + "loss": 1.8508, + "step": 22390 + }, + { + "epoch": 0.7963736556750511, + "grad_norm": 1.7010608911514282, + "learning_rate": 1.7567344744255315e-05, + "loss": 1.8082, + "step": 22400 + }, + { + "epoch": 0.7967291796284774, + "grad_norm": 1.5989950895309448, + "learning_rate": 1.756478215747065e-05, + "loss": 1.8377, + "step": 22410 + }, + { + "epoch": 0.7970847035819039, + "grad_norm": 1.6668442487716675, + "learning_rate": 1.7562218408782876e-05, + "loss": 1.8462, + "step": 22420 + }, + { + "epoch": 0.7974402275353302, + "grad_norm": 1.6710630655288696, + "learning_rate": 1.7559653498585767e-05, + "loss": 1.8333, + "step": 22430 + }, + { + "epoch": 0.7977957514887566, + "grad_norm": 1.6532939672470093, + "learning_rate": 1.755708742727328e-05, + "loss": 1.8146, + "step": 22440 + }, + { + "epoch": 0.7981512754421829, + "grad_norm": 1.5563749074935913, + "learning_rate": 1.755452019523955e-05, + "loss": 1.7988, + "step": 22450 + }, + { + "epoch": 0.7985067993956093, + "grad_norm": 1.6442334651947021, + "learning_rate": 1.7551951802878885e-05, + "loss": 1.8098, + "step": 22460 + }, + { + "epoch": 0.7988623233490356, + "grad_norm": 1.5837472677230835, + "learning_rate": 1.7549382250585772e-05, + "loss": 1.8456, + "step": 22470 + }, + { + "epoch": 0.799217847302462, + "grad_norm": 1.6369633674621582, + "learning_rate": 1.7546811538754882e-05, + "loss": 1.8455, + "step": 22480 + }, + { + "epoch": 0.7995733712558883, + "grad_norm": 1.636970043182373, + "learning_rate": 1.7544239667781057e-05, + "loss": 1.8105, + "step": 22490 + }, + { + "epoch": 0.7999288952093148, + "grad_norm": 1.5929034948349, + "learning_rate": 1.7541666638059323e-05, + "loss": 1.8375, + "step": 22500 + }, + { + "epoch": 0.8002844191627411, + "grad_norm": 1.5676238536834717, + "learning_rate": 1.753909244998488e-05, + "loss": 1.8694, + "step": 22510 + }, + { + "epoch": 0.8006399431161675, + "grad_norm": 1.7536896467208862, + "learning_rate": 1.7536517103953105e-05, + "loss": 1.8125, + "step": 22520 + }, + { + "epoch": 0.8009954670695938, + "grad_norm": 1.5093269348144531, + "learning_rate": 1.753394060035956e-05, + "loss": 1.7803, + "step": 22530 + }, + { + "epoch": 0.8013509910230202, + "grad_norm": 1.655174970626831, + "learning_rate": 1.7531362939599973e-05, + "loss": 1.7836, + "step": 22540 + }, + { + "epoch": 0.8017065149764465, + "grad_norm": 1.609383463859558, + "learning_rate": 1.7528784122070265e-05, + "loss": 1.8658, + "step": 22550 + }, + { + "epoch": 0.8020620389298729, + "grad_norm": 1.585519790649414, + "learning_rate": 1.7526204148166523e-05, + "loss": 1.8446, + "step": 22560 + }, + { + "epoch": 0.8024175628832992, + "grad_norm": 1.6932191848754883, + "learning_rate": 1.7523623018285008e-05, + "loss": 1.8236, + "step": 22570 + }, + { + "epoch": 0.8027730868367257, + "grad_norm": 1.6669684648513794, + "learning_rate": 1.7521040732822175e-05, + "loss": 1.7894, + "step": 22580 + }, + { + "epoch": 0.803128610790152, + "grad_norm": 1.7060233354568481, + "learning_rate": 1.751845729217464e-05, + "loss": 1.8038, + "step": 22590 + }, + { + "epoch": 0.8034841347435784, + "grad_norm": 1.5615458488464355, + "learning_rate": 1.751587269673921e-05, + "loss": 1.8553, + "step": 22600 + }, + { + "epoch": 0.8038396586970047, + "grad_norm": 1.607182502746582, + "learning_rate": 1.7513286946912852e-05, + "loss": 1.8425, + "step": 22610 + }, + { + "epoch": 0.8041951826504311, + "grad_norm": 1.6426219940185547, + "learning_rate": 1.7510700043092735e-05, + "loss": 1.8096, + "step": 22620 + }, + { + "epoch": 0.8045507066038574, + "grad_norm": 1.7619822025299072, + "learning_rate": 1.7508111985676177e-05, + "loss": 1.8166, + "step": 22630 + }, + { + "epoch": 0.8049062305572838, + "grad_norm": 1.593801498413086, + "learning_rate": 1.7505522775060697e-05, + "loss": 1.8535, + "step": 22640 + }, + { + "epoch": 0.8052617545107101, + "grad_norm": 1.6533886194229126, + "learning_rate": 1.750293241164398e-05, + "loss": 1.8856, + "step": 22650 + }, + { + "epoch": 0.8056172784641366, + "grad_norm": 1.7397966384887695, + "learning_rate": 1.750034089582389e-05, + "loss": 1.8097, + "step": 22660 + }, + { + "epoch": 0.8059728024175629, + "grad_norm": 1.6742297410964966, + "learning_rate": 1.7497748227998462e-05, + "loss": 1.8489, + "step": 22670 + }, + { + "epoch": 0.8063283263709893, + "grad_norm": 1.584993839263916, + "learning_rate": 1.749515440856592e-05, + "loss": 1.893, + "step": 22680 + }, + { + "epoch": 0.8066838503244156, + "grad_norm": 1.5936106443405151, + "learning_rate": 1.7492559437924654e-05, + "loss": 1.8456, + "step": 22690 + }, + { + "epoch": 0.807039374277842, + "grad_norm": 1.5889955759048462, + "learning_rate": 1.7489963316473236e-05, + "loss": 1.8055, + "step": 22700 + }, + { + "epoch": 0.8073948982312683, + "grad_norm": 1.6362390518188477, + "learning_rate": 1.7487366044610418e-05, + "loss": 1.8642, + "step": 22710 + }, + { + "epoch": 0.8077504221846947, + "grad_norm": 1.6781909465789795, + "learning_rate": 1.748476762273512e-05, + "loss": 1.8462, + "step": 22720 + }, + { + "epoch": 0.808105946138121, + "grad_norm": 1.6672321557998657, + "learning_rate": 1.7482168051246448e-05, + "loss": 1.8106, + "step": 22730 + }, + { + "epoch": 0.8084614700915475, + "grad_norm": 1.6293582916259766, + "learning_rate": 1.747956733054367e-05, + "loss": 1.7957, + "step": 22740 + }, + { + "epoch": 0.8088169940449738, + "grad_norm": 1.6819753646850586, + "learning_rate": 1.7476965461026253e-05, + "loss": 1.8337, + "step": 22750 + }, + { + "epoch": 0.8091725179984002, + "grad_norm": 1.6322966814041138, + "learning_rate": 1.7474362443093823e-05, + "loss": 1.856, + "step": 22760 + }, + { + "epoch": 0.8095280419518265, + "grad_norm": 1.5908470153808594, + "learning_rate": 1.747175827714618e-05, + "loss": 1.8528, + "step": 22770 + }, + { + "epoch": 0.8098835659052529, + "grad_norm": 1.7300654649734497, + "learning_rate": 1.7469152963583323e-05, + "loss": 1.7907, + "step": 22780 + }, + { + "epoch": 0.8102390898586792, + "grad_norm": 1.5681836605072021, + "learning_rate": 1.7466546502805397e-05, + "loss": 1.7838, + "step": 22790 + }, + { + "epoch": 0.8105946138121056, + "grad_norm": 1.7767341136932373, + "learning_rate": 1.7463938895212745e-05, + "loss": 1.8224, + "step": 22800 + }, + { + "epoch": 0.8109501377655319, + "grad_norm": 1.7716939449310303, + "learning_rate": 1.7461330141205878e-05, + "loss": 1.8844, + "step": 22810 + }, + { + "epoch": 0.8113056617189583, + "grad_norm": 1.6044155359268188, + "learning_rate": 1.745872024118548e-05, + "loss": 1.8127, + "step": 22820 + }, + { + "epoch": 0.8116611856723847, + "grad_norm": 1.6654382944107056, + "learning_rate": 1.7456109195552425e-05, + "loss": 1.8143, + "step": 22830 + }, + { + "epoch": 0.8120167096258111, + "grad_norm": 1.6145553588867188, + "learning_rate": 1.7453497004707748e-05, + "loss": 1.8573, + "step": 22840 + }, + { + "epoch": 0.8123722335792374, + "grad_norm": 1.6546883583068848, + "learning_rate": 1.745088366905266e-05, + "loss": 1.8458, + "step": 22850 + }, + { + "epoch": 0.8127277575326638, + "grad_norm": 1.6224075555801392, + "learning_rate": 1.744826918898856e-05, + "loss": 1.8157, + "step": 22860 + }, + { + "epoch": 0.8130832814860901, + "grad_norm": 1.6021678447723389, + "learning_rate": 1.7445653564917016e-05, + "loss": 1.8435, + "step": 22870 + }, + { + "epoch": 0.8134388054395165, + "grad_norm": 1.6641144752502441, + "learning_rate": 1.7443036797239767e-05, + "loss": 1.8098, + "step": 22880 + }, + { + "epoch": 0.8137943293929428, + "grad_norm": 1.5793033838272095, + "learning_rate": 1.7440418886358735e-05, + "loss": 1.8159, + "step": 22890 + }, + { + "epoch": 0.8141498533463692, + "grad_norm": 1.817555546760559, + "learning_rate": 1.743779983267601e-05, + "loss": 1.8254, + "step": 22900 + }, + { + "epoch": 0.8145053772997956, + "grad_norm": 1.6587536334991455, + "learning_rate": 1.7435179636593874e-05, + "loss": 1.8243, + "step": 22910 + }, + { + "epoch": 0.814860901253222, + "grad_norm": 1.5765866041183472, + "learning_rate": 1.7432558298514758e-05, + "loss": 1.7785, + "step": 22920 + }, + { + "epoch": 0.8152164252066483, + "grad_norm": 1.6833018064498901, + "learning_rate": 1.742993581884129e-05, + "loss": 1.8263, + "step": 22930 + }, + { + "epoch": 0.8155719491600747, + "grad_norm": 1.6549988985061646, + "learning_rate": 1.742731219797627e-05, + "loss": 1.8692, + "step": 22940 + }, + { + "epoch": 0.815927473113501, + "grad_norm": 1.6467914581298828, + "learning_rate": 1.7424687436322664e-05, + "loss": 1.8037, + "step": 22950 + }, + { + "epoch": 0.8162829970669274, + "grad_norm": 1.562791347503662, + "learning_rate": 1.742206153428362e-05, + "loss": 1.8544, + "step": 22960 + }, + { + "epoch": 0.8166385210203537, + "grad_norm": 1.605326771736145, + "learning_rate": 1.7419434492262465e-05, + "loss": 1.8014, + "step": 22970 + }, + { + "epoch": 0.8169940449737801, + "grad_norm": 1.6755915880203247, + "learning_rate": 1.7416806310662688e-05, + "loss": 1.7874, + "step": 22980 + }, + { + "epoch": 0.8173495689272064, + "grad_norm": 1.6122695207595825, + "learning_rate": 1.741417698988797e-05, + "loss": 1.8217, + "step": 22990 + }, + { + "epoch": 0.8177050928806329, + "grad_norm": 1.6413447856903076, + "learning_rate": 1.7411546530342148e-05, + "loss": 1.8296, + "step": 23000 + }, + { + "epoch": 0.8180606168340592, + "grad_norm": 1.6276954412460327, + "learning_rate": 1.7408914932429254e-05, + "loss": 1.8272, + "step": 23010 + }, + { + "epoch": 0.8184161407874856, + "grad_norm": 1.5439822673797607, + "learning_rate": 1.7406282196553477e-05, + "loss": 1.8266, + "step": 23020 + }, + { + "epoch": 0.8187716647409119, + "grad_norm": 1.7403044700622559, + "learning_rate": 1.7403648323119195e-05, + "loss": 1.8242, + "step": 23030 + }, + { + "epoch": 0.8191271886943383, + "grad_norm": 1.7068008184432983, + "learning_rate": 1.740101331253095e-05, + "loss": 1.8083, + "step": 23040 + }, + { + "epoch": 0.8194827126477646, + "grad_norm": 1.5520436763763428, + "learning_rate": 1.7398377165193464e-05, + "loss": 1.8036, + "step": 23050 + }, + { + "epoch": 0.819838236601191, + "grad_norm": 1.6793220043182373, + "learning_rate": 1.7395739881511637e-05, + "loss": 1.7867, + "step": 23060 + }, + { + "epoch": 0.8201937605546173, + "grad_norm": 1.6804542541503906, + "learning_rate": 1.7393101461890536e-05, + "loss": 1.8301, + "step": 23070 + }, + { + "epoch": 0.8205492845080438, + "grad_norm": 1.6650656461715698, + "learning_rate": 1.7390461906735403e-05, + "loss": 1.8543, + "step": 23080 + }, + { + "epoch": 0.8209048084614701, + "grad_norm": 1.6909743547439575, + "learning_rate": 1.738782121645166e-05, + "loss": 1.8101, + "step": 23090 + }, + { + "epoch": 0.8212603324148965, + "grad_norm": 1.7118736505508423, + "learning_rate": 1.7385179391444903e-05, + "loss": 1.7889, + "step": 23100 + }, + { + "epoch": 0.8216158563683228, + "grad_norm": 1.5747915506362915, + "learning_rate": 1.7382536432120892e-05, + "loss": 1.818, + "step": 23110 + }, + { + "epoch": 0.8219713803217492, + "grad_norm": 1.6179157495498657, + "learning_rate": 1.7379892338885577e-05, + "loss": 1.8661, + "step": 23120 + }, + { + "epoch": 0.8223269042751755, + "grad_norm": 1.6345182657241821, + "learning_rate": 1.737724711214507e-05, + "loss": 1.8063, + "step": 23130 + }, + { + "epoch": 0.8226824282286019, + "grad_norm": 1.6581584215164185, + "learning_rate": 1.7374600752305663e-05, + "loss": 1.801, + "step": 23140 + }, + { + "epoch": 0.8230379521820282, + "grad_norm": 1.647096872329712, + "learning_rate": 1.7371953259773818e-05, + "loss": 1.8225, + "step": 23150 + }, + { + "epoch": 0.8233934761354547, + "grad_norm": 1.7149620056152344, + "learning_rate": 1.7369304634956176e-05, + "loss": 1.8333, + "step": 23160 + }, + { + "epoch": 0.823749000088881, + "grad_norm": 1.6080249547958374, + "learning_rate": 1.7366654878259547e-05, + "loss": 1.7963, + "step": 23170 + }, + { + "epoch": 0.8241045240423074, + "grad_norm": 1.6695677042007446, + "learning_rate": 1.7364003990090923e-05, + "loss": 1.862, + "step": 23180 + }, + { + "epoch": 0.8244600479957337, + "grad_norm": 1.6274735927581787, + "learning_rate": 1.7361351970857454e-05, + "loss": 1.8066, + "step": 23190 + }, + { + "epoch": 0.8248155719491601, + "grad_norm": 1.7230697870254517, + "learning_rate": 1.735869882096648e-05, + "loss": 1.7928, + "step": 23200 + }, + { + "epoch": 0.8251710959025864, + "grad_norm": 1.6167101860046387, + "learning_rate": 1.7356044540825504e-05, + "loss": 1.8395, + "step": 23210 + }, + { + "epoch": 0.8255266198560128, + "grad_norm": 1.6141185760498047, + "learning_rate": 1.735338913084221e-05, + "loss": 1.8165, + "step": 23220 + }, + { + "epoch": 0.8258821438094391, + "grad_norm": 1.613447904586792, + "learning_rate": 1.7350732591424452e-05, + "loss": 1.8036, + "step": 23230 + }, + { + "epoch": 0.8262376677628656, + "grad_norm": 1.720850944519043, + "learning_rate": 1.734807492298026e-05, + "loss": 1.831, + "step": 23240 + }, + { + "epoch": 0.8265931917162919, + "grad_norm": 1.5681285858154297, + "learning_rate": 1.734541612591783e-05, + "loss": 1.805, + "step": 23250 + }, + { + "epoch": 0.8269487156697183, + "grad_norm": 1.7178831100463867, + "learning_rate": 1.734275620064554e-05, + "loss": 1.8113, + "step": 23260 + }, + { + "epoch": 0.8273042396231446, + "grad_norm": 1.5972732305526733, + "learning_rate": 1.7340095147571937e-05, + "loss": 1.8014, + "step": 23270 + }, + { + "epoch": 0.827659763576571, + "grad_norm": 1.5485990047454834, + "learning_rate": 1.733743296710574e-05, + "loss": 1.8566, + "step": 23280 + }, + { + "epoch": 0.8280152875299973, + "grad_norm": 1.5806162357330322, + "learning_rate": 1.7334769659655846e-05, + "loss": 1.8122, + "step": 23290 + }, + { + "epoch": 0.8283708114834237, + "grad_norm": 1.6596871614456177, + "learning_rate": 1.7332105225631325e-05, + "loss": 1.8234, + "step": 23300 + }, + { + "epoch": 0.82872633543685, + "grad_norm": 1.6023002862930298, + "learning_rate": 1.7329439665441413e-05, + "loss": 1.8423, + "step": 23310 + }, + { + "epoch": 0.8290818593902765, + "grad_norm": 1.663435697555542, + "learning_rate": 1.7326772979495522e-05, + "loss": 1.8248, + "step": 23320 + }, + { + "epoch": 0.8294373833437028, + "grad_norm": 1.6272053718566895, + "learning_rate": 1.7324105168203243e-05, + "loss": 1.827, + "step": 23330 + }, + { + "epoch": 0.8297929072971292, + "grad_norm": 1.6218414306640625, + "learning_rate": 1.732143623197433e-05, + "loss": 1.7947, + "step": 23340 + }, + { + "epoch": 0.8301484312505555, + "grad_norm": 1.7325440645217896, + "learning_rate": 1.731876617121872e-05, + "loss": 1.8324, + "step": 23350 + }, + { + "epoch": 0.8305039552039819, + "grad_norm": 1.6133369207382202, + "learning_rate": 1.731609498634651e-05, + "loss": 1.8411, + "step": 23360 + }, + { + "epoch": 0.8308594791574082, + "grad_norm": 1.5473828315734863, + "learning_rate": 1.731342267776799e-05, + "loss": 1.8226, + "step": 23370 + }, + { + "epoch": 0.8312150031108346, + "grad_norm": 1.601293683052063, + "learning_rate": 1.7310749245893598e-05, + "loss": 1.8239, + "step": 23380 + }, + { + "epoch": 0.8315705270642609, + "grad_norm": 1.6454339027404785, + "learning_rate": 1.7308074691133962e-05, + "loss": 1.7772, + "step": 23390 + }, + { + "epoch": 0.8319260510176874, + "grad_norm": 1.5938347578048706, + "learning_rate": 1.7305399013899874e-05, + "loss": 1.8488, + "step": 23400 + }, + { + "epoch": 0.8322815749711137, + "grad_norm": 1.5726723670959473, + "learning_rate": 1.7302722214602302e-05, + "loss": 1.7704, + "step": 23410 + }, + { + "epoch": 0.8326370989245401, + "grad_norm": 1.7508751153945923, + "learning_rate": 1.7300044293652388e-05, + "loss": 1.8709, + "step": 23420 + }, + { + "epoch": 0.8329926228779664, + "grad_norm": 1.6652165651321411, + "learning_rate": 1.7297365251461445e-05, + "loss": 1.8546, + "step": 23430 + }, + { + "epoch": 0.8333481468313928, + "grad_norm": 1.731452226638794, + "learning_rate": 1.7294685088440947e-05, + "loss": 1.8703, + "step": 23440 + }, + { + "epoch": 0.8337036707848191, + "grad_norm": 1.6252065896987915, + "learning_rate": 1.729200380500256e-05, + "loss": 1.7988, + "step": 23450 + }, + { + "epoch": 0.8340591947382455, + "grad_norm": 1.6331079006195068, + "learning_rate": 1.7289321401558115e-05, + "loss": 1.8475, + "step": 23460 + }, + { + "epoch": 0.8344147186916718, + "grad_norm": 1.627914547920227, + "learning_rate": 1.7286637878519604e-05, + "loss": 1.8565, + "step": 23470 + }, + { + "epoch": 0.8347702426450982, + "grad_norm": 1.6057466268539429, + "learning_rate": 1.7283953236299198e-05, + "loss": 1.8619, + "step": 23480 + }, + { + "epoch": 0.8351257665985246, + "grad_norm": 1.6166599988937378, + "learning_rate": 1.728126747530925e-05, + "loss": 1.8413, + "step": 23490 + }, + { + "epoch": 0.835481290551951, + "grad_norm": 1.684545636177063, + "learning_rate": 1.727858059596227e-05, + "loss": 1.8224, + "step": 23500 + }, + { + "epoch": 0.8358368145053773, + "grad_norm": 1.6563124656677246, + "learning_rate": 1.727589259867095e-05, + "loss": 1.8531, + "step": 23510 + }, + { + "epoch": 0.8361923384588037, + "grad_norm": 1.5739567279815674, + "learning_rate": 1.7273203483848148e-05, + "loss": 1.7771, + "step": 23520 + }, + { + "epoch": 0.83654786241223, + "grad_norm": 1.6655970811843872, + "learning_rate": 1.727051325190689e-05, + "loss": 1.8566, + "step": 23530 + }, + { + "epoch": 0.8369033863656564, + "grad_norm": 1.5226960182189941, + "learning_rate": 1.7267821903260382e-05, + "loss": 1.8377, + "step": 23540 + }, + { + "epoch": 0.8372589103190827, + "grad_norm": 1.6616383790969849, + "learning_rate": 1.7265129438322004e-05, + "loss": 1.8156, + "step": 23550 + }, + { + "epoch": 0.8376144342725091, + "grad_norm": 1.642339825630188, + "learning_rate": 1.7262435857505295e-05, + "loss": 1.7684, + "step": 23560 + }, + { + "epoch": 0.8379699582259355, + "grad_norm": 1.724085807800293, + "learning_rate": 1.7259741161223976e-05, + "loss": 1.818, + "step": 23570 + }, + { + "epoch": 0.8383254821793619, + "grad_norm": 1.6169931888580322, + "learning_rate": 1.725704534989193e-05, + "loss": 1.8103, + "step": 23580 + }, + { + "epoch": 0.8386810061327882, + "grad_norm": 1.6618402004241943, + "learning_rate": 1.7254348423923222e-05, + "loss": 1.7984, + "step": 23590 + }, + { + "epoch": 0.8390365300862146, + "grad_norm": 1.6586053371429443, + "learning_rate": 1.725165038373208e-05, + "loss": 1.7949, + "step": 23600 + }, + { + "epoch": 0.8393920540396409, + "grad_norm": 1.5934162139892578, + "learning_rate": 1.724895122973291e-05, + "loss": 1.8535, + "step": 23610 + }, + { + "epoch": 0.8397475779930673, + "grad_norm": 1.6691250801086426, + "learning_rate": 1.7246250962340282e-05, + "loss": 1.832, + "step": 23620 + }, + { + "epoch": 0.8401031019464936, + "grad_norm": 1.7262389659881592, + "learning_rate": 1.724354958196894e-05, + "loss": 1.7997, + "step": 23630 + }, + { + "epoch": 0.84045862589992, + "grad_norm": 1.5825564861297607, + "learning_rate": 1.7240847089033796e-05, + "loss": 1.8178, + "step": 23640 + }, + { + "epoch": 0.8408141498533463, + "grad_norm": 1.7624269723892212, + "learning_rate": 1.7238143483949945e-05, + "loss": 1.8354, + "step": 23650 + }, + { + "epoch": 0.8411696738067728, + "grad_norm": 1.574318528175354, + "learning_rate": 1.7235438767132633e-05, + "loss": 1.8421, + "step": 23660 + }, + { + "epoch": 0.8415251977601991, + "grad_norm": 1.6515188217163086, + "learning_rate": 1.7232732938997296e-05, + "loss": 1.8819, + "step": 23670 + }, + { + "epoch": 0.8418807217136255, + "grad_norm": 1.6864103078842163, + "learning_rate": 1.723002599995953e-05, + "loss": 1.796, + "step": 23680 + }, + { + "epoch": 0.8422362456670518, + "grad_norm": 1.6396405696868896, + "learning_rate": 1.72273179504351e-05, + "loss": 1.7978, + "step": 23690 + }, + { + "epoch": 0.8425917696204782, + "grad_norm": 1.7666069269180298, + "learning_rate": 1.722460879083995e-05, + "loss": 1.8018, + "step": 23700 + }, + { + "epoch": 0.8429472935739045, + "grad_norm": 1.7268292903900146, + "learning_rate": 1.722189852159019e-05, + "loss": 1.8005, + "step": 23710 + }, + { + "epoch": 0.8433028175273309, + "grad_norm": 1.5880026817321777, + "learning_rate": 1.7219187143102097e-05, + "loss": 1.7837, + "step": 23720 + }, + { + "epoch": 0.8436583414807572, + "grad_norm": 1.7330087423324585, + "learning_rate": 1.7216474655792124e-05, + "loss": 1.8226, + "step": 23730 + }, + { + "epoch": 0.8440138654341837, + "grad_norm": 1.6118395328521729, + "learning_rate": 1.7213761060076894e-05, + "loss": 1.8469, + "step": 23740 + }, + { + "epoch": 0.84436938938761, + "grad_norm": 1.6583901643753052, + "learning_rate": 1.7211046356373193e-05, + "loss": 1.8016, + "step": 23750 + }, + { + "epoch": 0.8447249133410364, + "grad_norm": 1.6012097597122192, + "learning_rate": 1.7208330545097985e-05, + "loss": 1.8326, + "step": 23760 + }, + { + "epoch": 0.8450804372944627, + "grad_norm": 1.5444685220718384, + "learning_rate": 1.7205613626668404e-05, + "loss": 1.8092, + "step": 23770 + }, + { + "epoch": 0.8454359612478891, + "grad_norm": 1.6934510469436646, + "learning_rate": 1.7202895601501746e-05, + "loss": 1.7749, + "step": 23780 + }, + { + "epoch": 0.8457914852013154, + "grad_norm": 1.6778422594070435, + "learning_rate": 1.720017647001549e-05, + "loss": 1.8355, + "step": 23790 + }, + { + "epoch": 0.8461470091547418, + "grad_norm": 1.6825305223464966, + "learning_rate": 1.719745623262727e-05, + "loss": 1.8183, + "step": 23800 + }, + { + "epoch": 0.8465025331081681, + "grad_norm": 1.6143616437911987, + "learning_rate": 1.7194734889754903e-05, + "loss": 1.832, + "step": 23810 + }, + { + "epoch": 0.8468580570615946, + "grad_norm": 1.676255226135254, + "learning_rate": 1.7192012441816367e-05, + "loss": 1.815, + "step": 23820 + }, + { + "epoch": 0.8472135810150209, + "grad_norm": 1.6026546955108643, + "learning_rate": 1.7189288889229817e-05, + "loss": 1.8267, + "step": 23830 + }, + { + "epoch": 0.8475691049684473, + "grad_norm": 1.616668462753296, + "learning_rate": 1.718656423241357e-05, + "loss": 1.8386, + "step": 23840 + }, + { + "epoch": 0.8479246289218736, + "grad_norm": 1.6833947896957397, + "learning_rate": 1.7183838471786114e-05, + "loss": 1.8074, + "step": 23850 + }, + { + "epoch": 0.8482801528753, + "grad_norm": 1.5751172304153442, + "learning_rate": 1.7181111607766113e-05, + "loss": 1.8259, + "step": 23860 + }, + { + "epoch": 0.8486356768287263, + "grad_norm": 1.5970380306243896, + "learning_rate": 1.7178383640772396e-05, + "loss": 1.8195, + "step": 23870 + }, + { + "epoch": 0.8489912007821527, + "grad_norm": 1.6919461488723755, + "learning_rate": 1.7175654571223962e-05, + "loss": 1.82, + "step": 23880 + }, + { + "epoch": 0.849346724735579, + "grad_norm": 1.6423909664154053, + "learning_rate": 1.7172924399539975e-05, + "loss": 1.8026, + "step": 23890 + }, + { + "epoch": 0.8497022486890055, + "grad_norm": 1.7619222402572632, + "learning_rate": 1.7170193126139775e-05, + "loss": 1.8738, + "step": 23900 + }, + { + "epoch": 0.8500577726424318, + "grad_norm": 1.6697863340377808, + "learning_rate": 1.7167460751442872e-05, + "loss": 1.7832, + "step": 23910 + }, + { + "epoch": 0.8504132965958582, + "grad_norm": 1.7078362703323364, + "learning_rate": 1.716472727586893e-05, + "loss": 1.7962, + "step": 23920 + }, + { + "epoch": 0.8507688205492845, + "grad_norm": 1.5840452909469604, + "learning_rate": 1.716199269983781e-05, + "loss": 1.8442, + "step": 23930 + }, + { + "epoch": 0.8511243445027109, + "grad_norm": 1.636093258857727, + "learning_rate": 1.7159257023769512e-05, + "loss": 1.8077, + "step": 23940 + }, + { + "epoch": 0.8514798684561372, + "grad_norm": 1.6575133800506592, + "learning_rate": 1.7156520248084226e-05, + "loss": 1.8022, + "step": 23950 + }, + { + "epoch": 0.8518353924095636, + "grad_norm": 1.7173900604248047, + "learning_rate": 1.7153782373202302e-05, + "loss": 1.7997, + "step": 23960 + }, + { + "epoch": 0.8521909163629899, + "grad_norm": 1.5716763734817505, + "learning_rate": 1.7151043399544262e-05, + "loss": 1.8387, + "step": 23970 + }, + { + "epoch": 0.8525464403164164, + "grad_norm": 1.622989296913147, + "learning_rate": 1.714830332753079e-05, + "loss": 1.8472, + "step": 23980 + }, + { + "epoch": 0.8529019642698427, + "grad_norm": 1.7212612628936768, + "learning_rate": 1.7145562157582748e-05, + "loss": 1.849, + "step": 23990 + }, + { + "epoch": 0.8532574882232691, + "grad_norm": 1.6640286445617676, + "learning_rate": 1.714281989012116e-05, + "loss": 1.8478, + "step": 24000 + }, + { + "epoch": 0.8536130121766954, + "grad_norm": 1.6266303062438965, + "learning_rate": 1.7140076525567223e-05, + "loss": 1.88, + "step": 24010 + }, + { + "epoch": 0.8539685361301218, + "grad_norm": 1.750396966934204, + "learning_rate": 1.7137332064342303e-05, + "loss": 1.8144, + "step": 24020 + }, + { + "epoch": 0.8543240600835481, + "grad_norm": 1.6980276107788086, + "learning_rate": 1.7134586506867926e-05, + "loss": 1.8469, + "step": 24030 + }, + { + "epoch": 0.8546795840369745, + "grad_norm": 1.5892568826675415, + "learning_rate": 1.7131839853565798e-05, + "loss": 1.8194, + "step": 24040 + }, + { + "epoch": 0.8550351079904008, + "grad_norm": 1.6542547941207886, + "learning_rate": 1.7129092104857786e-05, + "loss": 1.8202, + "step": 24050 + }, + { + "epoch": 0.8553906319438273, + "grad_norm": 1.5758212804794312, + "learning_rate": 1.7126343261165926e-05, + "loss": 1.7851, + "step": 24060 + }, + { + "epoch": 0.8557461558972536, + "grad_norm": 1.7901427745819092, + "learning_rate": 1.7123593322912423e-05, + "loss": 1.8496, + "step": 24070 + }, + { + "epoch": 0.85610167985068, + "grad_norm": 1.7235887050628662, + "learning_rate": 1.712084229051965e-05, + "loss": 1.8359, + "step": 24080 + }, + { + "epoch": 0.8564572038041063, + "grad_norm": 1.5175025463104248, + "learning_rate": 1.711809016441015e-05, + "loss": 1.8642, + "step": 24090 + }, + { + "epoch": 0.8568127277575327, + "grad_norm": 1.7177553176879883, + "learning_rate": 1.7115336945006633e-05, + "loss": 1.8374, + "step": 24100 + }, + { + "epoch": 0.857168251710959, + "grad_norm": 1.6444177627563477, + "learning_rate": 1.7112582632731972e-05, + "loss": 1.8006, + "step": 24110 + }, + { + "epoch": 0.8575237756643854, + "grad_norm": 1.6575236320495605, + "learning_rate": 1.710982722800922e-05, + "loss": 1.8396, + "step": 24120 + }, + { + "epoch": 0.8578792996178117, + "grad_norm": 1.6627295017242432, + "learning_rate": 1.710707073126158e-05, + "loss": 1.7414, + "step": 24130 + }, + { + "epoch": 0.8582348235712381, + "grad_norm": 1.7354328632354736, + "learning_rate": 1.7104313142912436e-05, + "loss": 1.8258, + "step": 24140 + }, + { + "epoch": 0.8585903475246645, + "grad_norm": 1.717443823814392, + "learning_rate": 1.7101554463385342e-05, + "loss": 1.8485, + "step": 24150 + }, + { + "epoch": 0.8589458714780909, + "grad_norm": 1.6752740144729614, + "learning_rate": 1.7098794693104008e-05, + "loss": 1.8267, + "step": 24160 + }, + { + "epoch": 0.8593013954315172, + "grad_norm": 1.7124459743499756, + "learning_rate": 1.709603383249232e-05, + "loss": 1.8576, + "step": 24170 + }, + { + "epoch": 0.8596569193849436, + "grad_norm": 1.6862879991531372, + "learning_rate": 1.7093271881974325e-05, + "loss": 1.8241, + "step": 24180 + }, + { + "epoch": 0.8600124433383699, + "grad_norm": 1.7252548933029175, + "learning_rate": 1.7090508841974243e-05, + "loss": 1.7997, + "step": 24190 + }, + { + "epoch": 0.8603679672917963, + "grad_norm": 1.6613682508468628, + "learning_rate": 1.7087744712916464e-05, + "loss": 1.8332, + "step": 24200 + }, + { + "epoch": 0.8607234912452226, + "grad_norm": 1.6545765399932861, + "learning_rate": 1.7084979495225537e-05, + "loss": 1.8372, + "step": 24210 + }, + { + "epoch": 0.861079015198649, + "grad_norm": 1.613324522972107, + "learning_rate": 1.708221318932618e-05, + "loss": 1.7797, + "step": 24220 + }, + { + "epoch": 0.8614345391520754, + "grad_norm": 1.7013195753097534, + "learning_rate": 1.7079445795643286e-05, + "loss": 1.7678, + "step": 24230 + }, + { + "epoch": 0.8617900631055018, + "grad_norm": 1.6265993118286133, + "learning_rate": 1.7076677314601907e-05, + "loss": 1.751, + "step": 24240 + }, + { + "epoch": 0.8621455870589281, + "grad_norm": 1.7178155183792114, + "learning_rate": 1.7073907746627263e-05, + "loss": 1.8304, + "step": 24250 + }, + { + "epoch": 0.8625011110123545, + "grad_norm": 1.7472935914993286, + "learning_rate": 1.707113709214474e-05, + "loss": 1.788, + "step": 24260 + }, + { + "epoch": 0.8628566349657808, + "grad_norm": 1.7131307125091553, + "learning_rate": 1.7068365351579902e-05, + "loss": 1.8344, + "step": 24270 + }, + { + "epoch": 0.8632121589192072, + "grad_norm": 1.7252979278564453, + "learning_rate": 1.706559252535846e-05, + "loss": 1.8178, + "step": 24280 + }, + { + "epoch": 0.8635676828726335, + "grad_norm": 1.554989218711853, + "learning_rate": 1.7062818613906307e-05, + "loss": 1.8321, + "step": 24290 + }, + { + "epoch": 0.8639232068260599, + "grad_norm": 1.7460143566131592, + "learning_rate": 1.7060043617649503e-05, + "loss": 1.7565, + "step": 24300 + }, + { + "epoch": 0.8642787307794862, + "grad_norm": 1.7684497833251953, + "learning_rate": 1.705726753701426e-05, + "loss": 1.8584, + "step": 24310 + }, + { + "epoch": 0.8646342547329127, + "grad_norm": 1.634946346282959, + "learning_rate": 1.705449037242698e-05, + "loss": 1.7973, + "step": 24320 + }, + { + "epoch": 0.864989778686339, + "grad_norm": 1.6846821308135986, + "learning_rate": 1.7051712124314205e-05, + "loss": 1.8134, + "step": 24330 + }, + { + "epoch": 0.8653453026397654, + "grad_norm": 1.6562668085098267, + "learning_rate": 1.7048932793102667e-05, + "loss": 1.8097, + "step": 24340 + }, + { + "epoch": 0.8657008265931917, + "grad_norm": 1.5983461141586304, + "learning_rate": 1.7046152379219247e-05, + "loss": 1.8386, + "step": 24350 + }, + { + "epoch": 0.8660563505466181, + "grad_norm": 1.7765803337097168, + "learning_rate": 1.7043370883091002e-05, + "loss": 1.8211, + "step": 24360 + }, + { + "epoch": 0.8664118745000444, + "grad_norm": 1.6079620122909546, + "learning_rate": 1.704058830514515e-05, + "loss": 1.8219, + "step": 24370 + }, + { + "epoch": 0.8667673984534708, + "grad_norm": 1.606056809425354, + "learning_rate": 1.703780464580908e-05, + "loss": 1.8003, + "step": 24380 + }, + { + "epoch": 0.8671229224068971, + "grad_norm": 1.673724889755249, + "learning_rate": 1.7035019905510344e-05, + "loss": 1.8186, + "step": 24390 + }, + { + "epoch": 0.8674784463603236, + "grad_norm": 1.5252655744552612, + "learning_rate": 1.703223408467666e-05, + "loss": 1.8745, + "step": 24400 + }, + { + "epoch": 0.8678339703137499, + "grad_norm": 1.60201895236969, + "learning_rate": 1.7029447183735915e-05, + "loss": 1.8072, + "step": 24410 + }, + { + "epoch": 0.8681894942671763, + "grad_norm": 1.7763891220092773, + "learning_rate": 1.7026659203116155e-05, + "loss": 1.7658, + "step": 24420 + }, + { + "epoch": 0.8685450182206026, + "grad_norm": 1.7289584875106812, + "learning_rate": 1.70238701432456e-05, + "loss": 1.8364, + "step": 24430 + }, + { + "epoch": 0.868900542174029, + "grad_norm": 1.692323088645935, + "learning_rate": 1.702108000455263e-05, + "loss": 1.81, + "step": 24440 + }, + { + "epoch": 0.8692560661274553, + "grad_norm": 1.7640386819839478, + "learning_rate": 1.7018288787465796e-05, + "loss": 1.8089, + "step": 24450 + }, + { + "epoch": 0.8696115900808817, + "grad_norm": 1.5894044637680054, + "learning_rate": 1.7015496492413807e-05, + "loss": 1.8043, + "step": 24460 + }, + { + "epoch": 0.869967114034308, + "grad_norm": 1.6370770931243896, + "learning_rate": 1.7012703119825542e-05, + "loss": 1.8437, + "step": 24470 + }, + { + "epoch": 0.8703226379877345, + "grad_norm": 1.8560885190963745, + "learning_rate": 1.700990867013005e-05, + "loss": 1.7915, + "step": 24480 + }, + { + "epoch": 0.8706781619411608, + "grad_norm": 1.6874840259552002, + "learning_rate": 1.7007113143756542e-05, + "loss": 1.7954, + "step": 24490 + }, + { + "epoch": 0.8710336858945872, + "grad_norm": 1.6154260635375977, + "learning_rate": 1.7004316541134387e-05, + "loss": 1.8019, + "step": 24500 + }, + { + "epoch": 0.8713892098480135, + "grad_norm": 1.7375465631484985, + "learning_rate": 1.7001518862693132e-05, + "loss": 1.7495, + "step": 24510 + }, + { + "epoch": 0.8717447338014399, + "grad_norm": 1.743693232536316, + "learning_rate": 1.6998720108862475e-05, + "loss": 1.8341, + "step": 24520 + }, + { + "epoch": 0.8721002577548662, + "grad_norm": 1.643489956855774, + "learning_rate": 1.6995920280072297e-05, + "loss": 1.859, + "step": 24530 + }, + { + "epoch": 0.8724557817082926, + "grad_norm": 1.6341086626052856, + "learning_rate": 1.6993119376752622e-05, + "loss": 1.7679, + "step": 24540 + }, + { + "epoch": 0.8728113056617189, + "grad_norm": 1.5878918170928955, + "learning_rate": 1.699031739933366e-05, + "loss": 1.825, + "step": 24550 + }, + { + "epoch": 0.8731668296151454, + "grad_norm": 1.651410460472107, + "learning_rate": 1.6987514348245776e-05, + "loss": 1.8001, + "step": 24560 + }, + { + "epoch": 0.8735223535685717, + "grad_norm": 1.656724214553833, + "learning_rate": 1.6984710223919503e-05, + "loss": 1.8573, + "step": 24570 + }, + { + "epoch": 0.8738778775219981, + "grad_norm": 1.650752067565918, + "learning_rate": 1.698190502678553e-05, + "loss": 1.8491, + "step": 24580 + }, + { + "epoch": 0.8742334014754244, + "grad_norm": 1.6090630292892456, + "learning_rate": 1.6979098757274725e-05, + "loss": 1.8489, + "step": 24590 + }, + { + "epoch": 0.8745889254288508, + "grad_norm": 1.8214991092681885, + "learning_rate": 1.6976291415818107e-05, + "loss": 1.8129, + "step": 24600 + }, + { + "epoch": 0.8749444493822771, + "grad_norm": 1.6846964359283447, + "learning_rate": 1.697348300284687e-05, + "loss": 1.8216, + "step": 24610 + }, + { + "epoch": 0.8752999733357035, + "grad_norm": 1.6882396936416626, + "learning_rate": 1.697067351879237e-05, + "loss": 1.8095, + "step": 24620 + }, + { + "epoch": 0.8756554972891298, + "grad_norm": 1.6624641418457031, + "learning_rate": 1.6967862964086124e-05, + "loss": 1.8097, + "step": 24630 + }, + { + "epoch": 0.8760110212425563, + "grad_norm": 1.591064453125, + "learning_rate": 1.6965051339159812e-05, + "loss": 1.7942, + "step": 24640 + }, + { + "epoch": 0.8763665451959826, + "grad_norm": 1.6376370191574097, + "learning_rate": 1.6962238644445288e-05, + "loss": 1.7863, + "step": 24650 + }, + { + "epoch": 0.876722069149409, + "grad_norm": 1.6289722919464111, + "learning_rate": 1.695942488037456e-05, + "loss": 1.7999, + "step": 24660 + }, + { + "epoch": 0.8770775931028353, + "grad_norm": 1.7085307836532593, + "learning_rate": 1.6956610047379808e-05, + "loss": 1.8486, + "step": 24670 + }, + { + "epoch": 0.8774331170562617, + "grad_norm": 1.6344826221466064, + "learning_rate": 1.6953794145893372e-05, + "loss": 1.8198, + "step": 24680 + }, + { + "epoch": 0.877788641009688, + "grad_norm": 1.6788198947906494, + "learning_rate": 1.6950977176347755e-05, + "loss": 1.7615, + "step": 24690 + }, + { + "epoch": 0.8781441649631144, + "grad_norm": 1.5258077383041382, + "learning_rate": 1.6948159139175624e-05, + "loss": 1.8519, + "step": 24700 + }, + { + "epoch": 0.8784996889165407, + "grad_norm": 1.676614761352539, + "learning_rate": 1.6945340034809816e-05, + "loss": 1.8018, + "step": 24710 + }, + { + "epoch": 0.8788552128699672, + "grad_norm": 1.624168872833252, + "learning_rate": 1.694251986368333e-05, + "loss": 1.8087, + "step": 24720 + }, + { + "epoch": 0.8792107368233935, + "grad_norm": 1.7765724658966064, + "learning_rate": 1.6939698626229318e-05, + "loss": 1.8441, + "step": 24730 + }, + { + "epoch": 0.8795662607768199, + "grad_norm": 1.6141785383224487, + "learning_rate": 1.693687632288111e-05, + "loss": 1.8544, + "step": 24740 + }, + { + "epoch": 0.8799217847302462, + "grad_norm": 1.7429922819137573, + "learning_rate": 1.6934052954072196e-05, + "loss": 1.8214, + "step": 24750 + }, + { + "epoch": 0.8802773086836726, + "grad_norm": 1.6700776815414429, + "learning_rate": 1.6931228520236223e-05, + "loss": 1.8609, + "step": 24760 + }, + { + "epoch": 0.8806328326370989, + "grad_norm": 1.5559288263320923, + "learning_rate": 1.692840302180701e-05, + "loss": 1.8422, + "step": 24770 + }, + { + "epoch": 0.8809883565905253, + "grad_norm": 1.7079417705535889, + "learning_rate": 1.692557645921853e-05, + "loss": 1.8145, + "step": 24780 + }, + { + "epoch": 0.8813438805439516, + "grad_norm": 1.636964201927185, + "learning_rate": 1.6922748832904937e-05, + "loss": 1.7924, + "step": 24790 + }, + { + "epoch": 0.881699404497378, + "grad_norm": 1.730451226234436, + "learning_rate": 1.6919920143300524e-05, + "loss": 1.7988, + "step": 24800 + }, + { + "epoch": 0.8820549284508044, + "grad_norm": 1.6825193166732788, + "learning_rate": 1.6917090390839766e-05, + "loss": 1.8124, + "step": 24810 + }, + { + "epoch": 0.8824104524042308, + "grad_norm": 1.5790181159973145, + "learning_rate": 1.6914259575957294e-05, + "loss": 1.7826, + "step": 24820 + }, + { + "epoch": 0.8827659763576571, + "grad_norm": 1.5225175619125366, + "learning_rate": 1.6911427699087902e-05, + "loss": 1.8169, + "step": 24830 + }, + { + "epoch": 0.8831215003110835, + "grad_norm": 1.7250087261199951, + "learning_rate": 1.6908594760666557e-05, + "loss": 1.8296, + "step": 24840 + }, + { + "epoch": 0.8834770242645098, + "grad_norm": 1.6308724880218506, + "learning_rate": 1.6905760761128367e-05, + "loss": 1.8544, + "step": 24850 + }, + { + "epoch": 0.8838325482179362, + "grad_norm": 1.5742348432540894, + "learning_rate": 1.690292570090863e-05, + "loss": 1.834, + "step": 24860 + }, + { + "epoch": 0.8841880721713625, + "grad_norm": 1.6509791612625122, + "learning_rate": 1.690008958044278e-05, + "loss": 1.8607, + "step": 24870 + }, + { + "epoch": 0.8845435961247889, + "grad_norm": 1.661698341369629, + "learning_rate": 1.689725240016644e-05, + "loss": 1.7723, + "step": 24880 + }, + { + "epoch": 0.8848991200782153, + "grad_norm": 1.709389090538025, + "learning_rate": 1.6894414160515373e-05, + "loss": 1.7786, + "step": 24890 + }, + { + "epoch": 0.8852546440316417, + "grad_norm": 1.6019915342330933, + "learning_rate": 1.6891574861925523e-05, + "loss": 1.819, + "step": 24900 + }, + { + "epoch": 0.885610167985068, + "grad_norm": 1.6010609865188599, + "learning_rate": 1.6888734504832984e-05, + "loss": 1.8405, + "step": 24910 + }, + { + "epoch": 0.8859656919384944, + "grad_norm": 1.6091557741165161, + "learning_rate": 1.6885893089674017e-05, + "loss": 1.8148, + "step": 24920 + }, + { + "epoch": 0.8863212158919207, + "grad_norm": 1.6396043300628662, + "learning_rate": 1.6883050616885043e-05, + "loss": 1.8317, + "step": 24930 + }, + { + "epoch": 0.8866767398453471, + "grad_norm": 1.6629292964935303, + "learning_rate": 1.6880207086902657e-05, + "loss": 1.8529, + "step": 24940 + }, + { + "epoch": 0.8870322637987734, + "grad_norm": 1.6890887022018433, + "learning_rate": 1.68773625001636e-05, + "loss": 1.7573, + "step": 24950 + }, + { + "epoch": 0.8873877877521998, + "grad_norm": 1.6500388383865356, + "learning_rate": 1.6874516857104782e-05, + "loss": 1.7858, + "step": 24960 + }, + { + "epoch": 0.8877433117056261, + "grad_norm": 1.7768943309783936, + "learning_rate": 1.6871670158163282e-05, + "loss": 1.8354, + "step": 24970 + }, + { + "epoch": 0.8880988356590526, + "grad_norm": 1.720981478691101, + "learning_rate": 1.6868822403776327e-05, + "loss": 1.7871, + "step": 24980 + }, + { + "epoch": 0.8884543596124789, + "grad_norm": 1.5580673217773438, + "learning_rate": 1.6865973594381322e-05, + "loss": 1.8176, + "step": 24990 + }, + { + "epoch": 0.8888098835659053, + "grad_norm": 1.6760094165802002, + "learning_rate": 1.6863123730415824e-05, + "loss": 1.846, + "step": 25000 + }, + { + "epoch": 0.8891654075193316, + "grad_norm": 1.5730007886886597, + "learning_rate": 1.686027281231755e-05, + "loss": 1.8024, + "step": 25010 + }, + { + "epoch": 0.889520931472758, + "grad_norm": 1.6264647245407104, + "learning_rate": 1.6857420840524387e-05, + "loss": 1.7785, + "step": 25020 + }, + { + "epoch": 0.8898764554261843, + "grad_norm": 1.6349519491195679, + "learning_rate": 1.685456781547438e-05, + "loss": 1.838, + "step": 25030 + }, + { + "epoch": 0.8902319793796107, + "grad_norm": 1.589114785194397, + "learning_rate": 1.6851713737605732e-05, + "loss": 1.8203, + "step": 25040 + }, + { + "epoch": 0.890587503333037, + "grad_norm": 1.6858025789260864, + "learning_rate": 1.684885860735682e-05, + "loss": 1.8338, + "step": 25050 + }, + { + "epoch": 0.8909430272864635, + "grad_norm": 1.7134677171707153, + "learning_rate": 1.6846002425166165e-05, + "loss": 1.8172, + "step": 25060 + }, + { + "epoch": 0.8912985512398898, + "grad_norm": 1.6317869424819946, + "learning_rate": 1.6843145191472463e-05, + "loss": 1.7995, + "step": 25070 + }, + { + "epoch": 0.8916540751933162, + "grad_norm": 1.6795090436935425, + "learning_rate": 1.6840286906714567e-05, + "loss": 1.8136, + "step": 25080 + }, + { + "epoch": 0.8920095991467425, + "grad_norm": 1.686813473701477, + "learning_rate": 1.6837427571331488e-05, + "loss": 1.7711, + "step": 25090 + }, + { + "epoch": 0.8923651231001689, + "grad_norm": 1.6473668813705444, + "learning_rate": 1.683456718576241e-05, + "loss": 1.807, + "step": 25100 + }, + { + "epoch": 0.8927206470535952, + "grad_norm": 1.6169072389602661, + "learning_rate": 1.683170575044666e-05, + "loss": 1.8179, + "step": 25110 + }, + { + "epoch": 0.8930761710070216, + "grad_norm": 1.5349276065826416, + "learning_rate": 1.6828843265823748e-05, + "loss": 1.8566, + "step": 25120 + }, + { + "epoch": 0.8934316949604479, + "grad_norm": 1.6595957279205322, + "learning_rate": 1.6825979732333323e-05, + "loss": 1.8345, + "step": 25130 + }, + { + "epoch": 0.8937872189138744, + "grad_norm": 1.6424028873443604, + "learning_rate": 1.6823115150415212e-05, + "loss": 1.8239, + "step": 25140 + }, + { + "epoch": 0.8941427428673007, + "grad_norm": 1.6583970785140991, + "learning_rate": 1.6820249520509392e-05, + "loss": 1.7771, + "step": 25150 + }, + { + "epoch": 0.8944982668207271, + "grad_norm": 1.641650676727295, + "learning_rate": 1.6817382843056012e-05, + "loss": 1.8091, + "step": 25160 + }, + { + "epoch": 0.8948537907741534, + "grad_norm": 1.6662938594818115, + "learning_rate": 1.6814515118495373e-05, + "loss": 1.84, + "step": 25170 + }, + { + "epoch": 0.8952093147275798, + "grad_norm": 1.6732330322265625, + "learning_rate": 1.681164634726794e-05, + "loss": 1.8464, + "step": 25180 + }, + { + "epoch": 0.8955648386810061, + "grad_norm": 1.6067739725112915, + "learning_rate": 1.6808776529814336e-05, + "loss": 1.8137, + "step": 25190 + }, + { + "epoch": 0.8959203626344325, + "grad_norm": 1.6159056425094604, + "learning_rate": 1.6805905666575347e-05, + "loss": 1.8179, + "step": 25200 + }, + { + "epoch": 0.8962758865878588, + "grad_norm": 1.5482304096221924, + "learning_rate": 1.680303375799192e-05, + "loss": 1.7954, + "step": 25210 + }, + { + "epoch": 0.8966314105412853, + "grad_norm": 1.6053767204284668, + "learning_rate": 1.6800160804505167e-05, + "loss": 1.8061, + "step": 25220 + }, + { + "epoch": 0.8969869344947116, + "grad_norm": 1.7497892379760742, + "learning_rate": 1.679728680655635e-05, + "loss": 1.8272, + "step": 25230 + }, + { + "epoch": 0.897342458448138, + "grad_norm": 1.6293673515319824, + "learning_rate": 1.67944117645869e-05, + "loss": 1.7566, + "step": 25240 + }, + { + "epoch": 0.8976979824015643, + "grad_norm": 1.6386243104934692, + "learning_rate": 1.6791535679038405e-05, + "loss": 1.8065, + "step": 25250 + }, + { + "epoch": 0.8980535063549907, + "grad_norm": 1.7207205295562744, + "learning_rate": 1.678865855035261e-05, + "loss": 1.7955, + "step": 25260 + }, + { + "epoch": 0.898409030308417, + "grad_norm": 1.705877661705017, + "learning_rate": 1.6785780378971427e-05, + "loss": 1.817, + "step": 25270 + }, + { + "epoch": 0.8987645542618434, + "grad_norm": 1.626386046409607, + "learning_rate": 1.6782901165336926e-05, + "loss": 1.8166, + "step": 25280 + }, + { + "epoch": 0.8991200782152697, + "grad_norm": 1.633610486984253, + "learning_rate": 1.6780020909891333e-05, + "loss": 1.8136, + "step": 25290 + }, + { + "epoch": 0.8994756021686962, + "grad_norm": 1.6760858297348022, + "learning_rate": 1.6777139613077046e-05, + "loss": 1.8296, + "step": 25300 + }, + { + "epoch": 0.8998311261221225, + "grad_norm": 1.6591938734054565, + "learning_rate": 1.67742572753366e-05, + "loss": 1.8122, + "step": 25310 + }, + { + "epoch": 0.9001866500755489, + "grad_norm": 1.5998114347457886, + "learning_rate": 1.6771373897112716e-05, + "loss": 1.8481, + "step": 25320 + }, + { + "epoch": 0.9005421740289752, + "grad_norm": 1.7435482740402222, + "learning_rate": 1.6768489478848254e-05, + "loss": 1.8044, + "step": 25330 + }, + { + "epoch": 0.9008976979824016, + "grad_norm": 1.637698769569397, + "learning_rate": 1.676560402098625e-05, + "loss": 1.816, + "step": 25340 + }, + { + "epoch": 0.9012532219358279, + "grad_norm": 1.6445133686065674, + "learning_rate": 1.676271752396989e-05, + "loss": 1.8083, + "step": 25350 + }, + { + "epoch": 0.9016087458892543, + "grad_norm": 1.590224027633667, + "learning_rate": 1.675982998824252e-05, + "loss": 1.8029, + "step": 25360 + }, + { + "epoch": 0.9019642698426806, + "grad_norm": 1.6143308877944946, + "learning_rate": 1.6756941414247644e-05, + "loss": 1.8448, + "step": 25370 + }, + { + "epoch": 0.902319793796107, + "grad_norm": 1.5782301425933838, + "learning_rate": 1.6754051802428936e-05, + "loss": 1.8168, + "step": 25380 + }, + { + "epoch": 0.9026753177495334, + "grad_norm": 1.6096141338348389, + "learning_rate": 1.6751161153230225e-05, + "loss": 1.8351, + "step": 25390 + }, + { + "epoch": 0.9030308417029598, + "grad_norm": 1.7121573686599731, + "learning_rate": 1.6748269467095484e-05, + "loss": 1.7669, + "step": 25400 + }, + { + "epoch": 0.9033863656563861, + "grad_norm": 1.7440303564071655, + "learning_rate": 1.6745376744468867e-05, + "loss": 1.8404, + "step": 25410 + }, + { + "epoch": 0.9037418896098125, + "grad_norm": 1.6387438774108887, + "learning_rate": 1.6742482985794676e-05, + "loss": 1.8323, + "step": 25420 + }, + { + "epoch": 0.9040974135632388, + "grad_norm": 1.6786017417907715, + "learning_rate": 1.673958819151737e-05, + "loss": 1.805, + "step": 25430 + }, + { + "epoch": 0.9044529375166652, + "grad_norm": 1.655165672302246, + "learning_rate": 1.673669236208158e-05, + "loss": 1.7973, + "step": 25440 + }, + { + "epoch": 0.9048084614700915, + "grad_norm": 1.6670957803726196, + "learning_rate": 1.673379549793208e-05, + "loss": 1.8664, + "step": 25450 + }, + { + "epoch": 0.905163985423518, + "grad_norm": 1.590448260307312, + "learning_rate": 1.6730897599513817e-05, + "loss": 1.8278, + "step": 25460 + }, + { + "epoch": 0.9055195093769443, + "grad_norm": 1.7210569381713867, + "learning_rate": 1.6727998667271882e-05, + "loss": 1.8282, + "step": 25470 + }, + { + "epoch": 0.9058750333303707, + "grad_norm": 1.5972177982330322, + "learning_rate": 1.672509870165154e-05, + "loss": 1.8379, + "step": 25480 + }, + { + "epoch": 0.906230557283797, + "grad_norm": 1.665274977684021, + "learning_rate": 1.6722197703098203e-05, + "loss": 1.7867, + "step": 25490 + }, + { + "epoch": 0.9065860812372234, + "grad_norm": 1.6389977931976318, + "learning_rate": 1.6719295672057445e-05, + "loss": 1.8531, + "step": 25500 + }, + { + "epoch": 0.9069416051906497, + "grad_norm": 1.698076605796814, + "learning_rate": 1.6716392608975004e-05, + "loss": 1.7753, + "step": 25510 + }, + { + "epoch": 0.9072971291440761, + "grad_norm": 1.7209333181381226, + "learning_rate": 1.6713488514296768e-05, + "loss": 1.8097, + "step": 25520 + }, + { + "epoch": 0.9076526530975024, + "grad_norm": 1.6083821058273315, + "learning_rate": 1.671058338846879e-05, + "loss": 1.8193, + "step": 25530 + }, + { + "epoch": 0.9080081770509288, + "grad_norm": 1.6375142335891724, + "learning_rate": 1.6707677231937282e-05, + "loss": 1.7958, + "step": 25540 + }, + { + "epoch": 0.9083637010043552, + "grad_norm": 1.6612564325332642, + "learning_rate": 1.6704770045148612e-05, + "loss": 1.8496, + "step": 25550 + }, + { + "epoch": 0.9087192249577816, + "grad_norm": 1.6492503881454468, + "learning_rate": 1.6701861828549296e-05, + "loss": 1.8113, + "step": 25560 + }, + { + "epoch": 0.9090747489112079, + "grad_norm": 1.566922903060913, + "learning_rate": 1.6698952582586025e-05, + "loss": 1.8328, + "step": 25570 + }, + { + "epoch": 0.9094302728646343, + "grad_norm": 1.5768183469772339, + "learning_rate": 1.6696042307705642e-05, + "loss": 1.7921, + "step": 25580 + }, + { + "epoch": 0.9097857968180606, + "grad_norm": 1.674768328666687, + "learning_rate": 1.6693131004355145e-05, + "loss": 1.8432, + "step": 25590 + }, + { + "epoch": 0.910141320771487, + "grad_norm": 1.531554937362671, + "learning_rate": 1.6690218672981687e-05, + "loss": 1.7815, + "step": 25600 + }, + { + "epoch": 0.9104968447249133, + "grad_norm": 1.5599174499511719, + "learning_rate": 1.6687305314032592e-05, + "loss": 1.7748, + "step": 25610 + }, + { + "epoch": 0.9108523686783397, + "grad_norm": 1.6537657976150513, + "learning_rate": 1.6684390927955333e-05, + "loss": 1.851, + "step": 25620 + }, + { + "epoch": 0.911207892631766, + "grad_norm": 1.6403638124465942, + "learning_rate": 1.668147551519754e-05, + "loss": 1.8258, + "step": 25630 + }, + { + "epoch": 0.9115634165851925, + "grad_norm": 1.6577059030532837, + "learning_rate": 1.6678559076206996e-05, + "loss": 1.7702, + "step": 25640 + }, + { + "epoch": 0.9119189405386188, + "grad_norm": 1.6512460708618164, + "learning_rate": 1.6675641611431657e-05, + "loss": 1.8846, + "step": 25650 + }, + { + "epoch": 0.9122744644920452, + "grad_norm": 1.6656643152236938, + "learning_rate": 1.667272312131962e-05, + "loss": 1.8001, + "step": 25660 + }, + { + "epoch": 0.9126299884454715, + "grad_norm": 1.7069002389907837, + "learning_rate": 1.6669803606319152e-05, + "loss": 1.8675, + "step": 25670 + }, + { + "epoch": 0.9129855123988979, + "grad_norm": 1.5266073942184448, + "learning_rate": 1.666688306687867e-05, + "loss": 1.7878, + "step": 25680 + }, + { + "epoch": 0.9133410363523242, + "grad_norm": 1.6533663272857666, + "learning_rate": 1.666396150344675e-05, + "loss": 1.8186, + "step": 25690 + }, + { + "epoch": 0.9136965603057506, + "grad_norm": 1.644303321838379, + "learning_rate": 1.6661038916472125e-05, + "loss": 1.7981, + "step": 25700 + }, + { + "epoch": 0.914052084259177, + "grad_norm": 1.7668614387512207, + "learning_rate": 1.665811530640369e-05, + "loss": 1.8579, + "step": 25710 + }, + { + "epoch": 0.9144076082126034, + "grad_norm": 1.591511845588684, + "learning_rate": 1.665519067369049e-05, + "loss": 1.8095, + "step": 25720 + }, + { + "epoch": 0.9147631321660297, + "grad_norm": 1.7172024250030518, + "learning_rate": 1.6652265018781726e-05, + "loss": 1.8013, + "step": 25730 + }, + { + "epoch": 0.9151186561194561, + "grad_norm": 1.6277397871017456, + "learning_rate": 1.6649338342126772e-05, + "loss": 1.7721, + "step": 25740 + }, + { + "epoch": 0.9154741800728824, + "grad_norm": 1.5798052549362183, + "learning_rate": 1.6646410644175137e-05, + "loss": 1.7835, + "step": 25750 + }, + { + "epoch": 0.9158297040263088, + "grad_norm": 1.698182225227356, + "learning_rate": 1.66434819253765e-05, + "loss": 1.8238, + "step": 25760 + }, + { + "epoch": 0.9161852279797351, + "grad_norm": 1.635740041732788, + "learning_rate": 1.6640552186180698e-05, + "loss": 1.801, + "step": 25770 + }, + { + "epoch": 0.9165407519331615, + "grad_norm": 1.6899878978729248, + "learning_rate": 1.6637621427037714e-05, + "loss": 1.8247, + "step": 25780 + }, + { + "epoch": 0.9168962758865878, + "grad_norm": 1.4938238859176636, + "learning_rate": 1.6634689648397695e-05, + "loss": 1.8334, + "step": 25790 + }, + { + "epoch": 0.9172517998400143, + "grad_norm": 1.6516492366790771, + "learning_rate": 1.663175685071095e-05, + "loss": 1.834, + "step": 25800 + }, + { + "epoch": 0.9176073237934406, + "grad_norm": 1.6564146280288696, + "learning_rate": 1.662882303442793e-05, + "loss": 1.786, + "step": 25810 + }, + { + "epoch": 0.917962847746867, + "grad_norm": 1.6790539026260376, + "learning_rate": 1.6625888199999258e-05, + "loss": 1.8361, + "step": 25820 + }, + { + "epoch": 0.9183183717002933, + "grad_norm": 1.7647924423217773, + "learning_rate": 1.66229523478757e-05, + "loss": 1.8182, + "step": 25830 + }, + { + "epoch": 0.9186738956537197, + "grad_norm": 1.6306095123291016, + "learning_rate": 1.662001547850819e-05, + "loss": 1.8168, + "step": 25840 + }, + { + "epoch": 0.919029419607146, + "grad_norm": 1.6261768341064453, + "learning_rate": 1.6617077592347813e-05, + "loss": 1.7882, + "step": 25850 + }, + { + "epoch": 0.9193849435605724, + "grad_norm": 1.7016527652740479, + "learning_rate": 1.6614138689845806e-05, + "loss": 1.7979, + "step": 25860 + }, + { + "epoch": 0.9197404675139987, + "grad_norm": 1.562110185623169, + "learning_rate": 1.6611198771453562e-05, + "loss": 1.8436, + "step": 25870 + }, + { + "epoch": 0.9200959914674252, + "grad_norm": 1.5601515769958496, + "learning_rate": 1.6608257837622646e-05, + "loss": 1.8073, + "step": 25880 + }, + { + "epoch": 0.9204515154208515, + "grad_norm": 1.5988566875457764, + "learning_rate": 1.6605315888804753e-05, + "loss": 1.7985, + "step": 25890 + }, + { + "epoch": 0.9208070393742779, + "grad_norm": 1.6320873498916626, + "learning_rate": 1.660237292545176e-05, + "loss": 1.8177, + "step": 25900 + }, + { + "epoch": 0.9211625633277042, + "grad_norm": 1.6602503061294556, + "learning_rate": 1.6599428948015682e-05, + "loss": 1.8008, + "step": 25910 + }, + { + "epoch": 0.9215180872811306, + "grad_norm": 1.6478127241134644, + "learning_rate": 1.6596483956948696e-05, + "loss": 1.8151, + "step": 25920 + }, + { + "epoch": 0.9218736112345569, + "grad_norm": 1.8068106174468994, + "learning_rate": 1.6593537952703137e-05, + "loss": 1.8107, + "step": 25930 + }, + { + "epoch": 0.9222291351879833, + "grad_norm": 1.6383720636367798, + "learning_rate": 1.659059093573149e-05, + "loss": 1.8039, + "step": 25940 + }, + { + "epoch": 0.9225846591414096, + "grad_norm": 1.669328212738037, + "learning_rate": 1.6587642906486395e-05, + "loss": 1.8336, + "step": 25950 + }, + { + "epoch": 0.922940183094836, + "grad_norm": 1.7093185186386108, + "learning_rate": 1.6584693865420655e-05, + "loss": 1.7985, + "step": 25960 + }, + { + "epoch": 0.9232957070482624, + "grad_norm": 1.5628633499145508, + "learning_rate": 1.6581743812987222e-05, + "loss": 1.7783, + "step": 25970 + }, + { + "epoch": 0.9236512310016888, + "grad_norm": 1.7095357179641724, + "learning_rate": 1.657879274963921e-05, + "loss": 1.8462, + "step": 25980 + }, + { + "epoch": 0.9240067549551151, + "grad_norm": 1.6446905136108398, + "learning_rate": 1.6575840675829883e-05, + "loss": 1.8119, + "step": 25990 + }, + { + "epoch": 0.9243622789085415, + "grad_norm": 1.7160478830337524, + "learning_rate": 1.6572887592012655e-05, + "loss": 1.7828, + "step": 26000 + }, + { + "epoch": 0.9247178028619678, + "grad_norm": 1.7009133100509644, + "learning_rate": 1.6569933498641105e-05, + "loss": 1.8093, + "step": 26010 + }, + { + "epoch": 0.9250733268153942, + "grad_norm": 1.7161458730697632, + "learning_rate": 1.656697839616897e-05, + "loss": 1.7835, + "step": 26020 + }, + { + "epoch": 0.9254288507688205, + "grad_norm": 1.5854977369308472, + "learning_rate": 1.6564022285050124e-05, + "loss": 1.8083, + "step": 26030 + }, + { + "epoch": 0.925784374722247, + "grad_norm": 1.7766863107681274, + "learning_rate": 1.656106516573861e-05, + "loss": 1.7877, + "step": 26040 + }, + { + "epoch": 0.9261398986756733, + "grad_norm": 1.7302604913711548, + "learning_rate": 1.6558107038688625e-05, + "loss": 1.8076, + "step": 26050 + }, + { + "epoch": 0.9264954226290997, + "grad_norm": 1.652533769607544, + "learning_rate": 1.655514790435452e-05, + "loss": 1.7882, + "step": 26060 + }, + { + "epoch": 0.926850946582526, + "grad_norm": 1.6904962062835693, + "learning_rate": 1.65521877631908e-05, + "loss": 1.7563, + "step": 26070 + }, + { + "epoch": 0.9272064705359524, + "grad_norm": 1.7011816501617432, + "learning_rate": 1.654922661565212e-05, + "loss": 1.8147, + "step": 26080 + }, + { + "epoch": 0.9275619944893787, + "grad_norm": 1.6210919618606567, + "learning_rate": 1.6546264462193295e-05, + "loss": 1.8108, + "step": 26090 + }, + { + "epoch": 0.9279175184428051, + "grad_norm": 1.7529455423355103, + "learning_rate": 1.6543301303269295e-05, + "loss": 1.8035, + "step": 26100 + }, + { + "epoch": 0.9282730423962314, + "grad_norm": 1.620786190032959, + "learning_rate": 1.6540337139335245e-05, + "loss": 1.8038, + "step": 26110 + }, + { + "epoch": 0.9286285663496578, + "grad_norm": 1.72477388381958, + "learning_rate": 1.6537371970846412e-05, + "loss": 1.8131, + "step": 26120 + }, + { + "epoch": 0.9289840903030842, + "grad_norm": 1.6291157007217407, + "learning_rate": 1.6534405798258238e-05, + "loss": 1.8056, + "step": 26130 + }, + { + "epoch": 0.9293396142565106, + "grad_norm": 1.6260063648223877, + "learning_rate": 1.6531438622026305e-05, + "loss": 1.8343, + "step": 26140 + }, + { + "epoch": 0.9296951382099369, + "grad_norm": 1.6301681995391846, + "learning_rate": 1.652847044260635e-05, + "loss": 1.769, + "step": 26150 + }, + { + "epoch": 0.9300506621633633, + "grad_norm": 1.6698664426803589, + "learning_rate": 1.652550126045427e-05, + "loss": 1.7997, + "step": 26160 + }, + { + "epoch": 0.9304061861167896, + "grad_norm": 1.7774896621704102, + "learning_rate": 1.652253107602611e-05, + "loss": 1.8359, + "step": 26170 + }, + { + "epoch": 0.930761710070216, + "grad_norm": 1.7517472505569458, + "learning_rate": 1.6519559889778077e-05, + "loss": 1.829, + "step": 26180 + }, + { + "epoch": 0.9311172340236423, + "grad_norm": 1.5541648864746094, + "learning_rate": 1.651658770216652e-05, + "loss": 1.7971, + "step": 26190 + }, + { + "epoch": 0.9314727579770687, + "grad_norm": 1.5613819360733032, + "learning_rate": 1.651361451364795e-05, + "loss": 1.8104, + "step": 26200 + }, + { + "epoch": 0.931828281930495, + "grad_norm": 1.6808326244354248, + "learning_rate": 1.651064032467903e-05, + "loss": 1.7883, + "step": 26210 + }, + { + "epoch": 0.9321838058839215, + "grad_norm": 1.6574971675872803, + "learning_rate": 1.6507665135716585e-05, + "loss": 1.7869, + "step": 26220 + }, + { + "epoch": 0.9325393298373478, + "grad_norm": 1.7454781532287598, + "learning_rate": 1.6504688947217573e-05, + "loss": 1.8373, + "step": 26230 + }, + { + "epoch": 0.9328948537907742, + "grad_norm": 1.7351772785186768, + "learning_rate": 1.650171175963913e-05, + "loss": 1.8216, + "step": 26240 + }, + { + "epoch": 0.9332503777442005, + "grad_norm": 1.5835143327713013, + "learning_rate": 1.649873357343852e-05, + "loss": 1.7755, + "step": 26250 + }, + { + "epoch": 0.9336059016976269, + "grad_norm": 1.6227622032165527, + "learning_rate": 1.6495754389073183e-05, + "loss": 1.788, + "step": 26260 + }, + { + "epoch": 0.9339614256510532, + "grad_norm": 1.6082923412322998, + "learning_rate": 1.6492774207000698e-05, + "loss": 1.7966, + "step": 26270 + }, + { + "epoch": 0.9343169496044796, + "grad_norm": 1.5585588216781616, + "learning_rate": 1.6489793027678807e-05, + "loss": 1.8497, + "step": 26280 + }, + { + "epoch": 0.934672473557906, + "grad_norm": 1.6597864627838135, + "learning_rate": 1.6486810851565397e-05, + "loss": 1.826, + "step": 26290 + }, + { + "epoch": 0.9350279975113324, + "grad_norm": 1.668641209602356, + "learning_rate": 1.6483827679118515e-05, + "loss": 1.825, + "step": 26300 + }, + { + "epoch": 0.9353835214647587, + "grad_norm": 1.6874868869781494, + "learning_rate": 1.6480843510796352e-05, + "loss": 1.8088, + "step": 26310 + }, + { + "epoch": 0.9357390454181851, + "grad_norm": 1.579192876815796, + "learning_rate": 1.6477858347057265e-05, + "loss": 1.8205, + "step": 26320 + }, + { + "epoch": 0.9360945693716114, + "grad_norm": 1.6124264001846313, + "learning_rate": 1.647487218835975e-05, + "loss": 1.8093, + "step": 26330 + }, + { + "epoch": 0.9364500933250378, + "grad_norm": 1.6060585975646973, + "learning_rate": 1.6471885035162465e-05, + "loss": 1.8507, + "step": 26340 + }, + { + "epoch": 0.9368056172784641, + "grad_norm": 1.5641084909439087, + "learning_rate": 1.6468896887924218e-05, + "loss": 1.8334, + "step": 26350 + }, + { + "epoch": 0.9371611412318905, + "grad_norm": 1.6883052587509155, + "learning_rate": 1.6465907747103968e-05, + "loss": 1.8284, + "step": 26360 + }, + { + "epoch": 0.9375166651853168, + "grad_norm": 1.6133944988250732, + "learning_rate": 1.6462917613160833e-05, + "loss": 1.7882, + "step": 26370 + }, + { + "epoch": 0.9378721891387433, + "grad_norm": 1.6845167875289917, + "learning_rate": 1.645992648655407e-05, + "loss": 1.8002, + "step": 26380 + }, + { + "epoch": 0.9382277130921696, + "grad_norm": 1.63066828250885, + "learning_rate": 1.6456934367743106e-05, + "loss": 1.8107, + "step": 26390 + }, + { + "epoch": 0.938583237045596, + "grad_norm": 1.61935555934906, + "learning_rate": 1.6453941257187508e-05, + "loss": 1.7801, + "step": 26400 + }, + { + "epoch": 0.9389387609990223, + "grad_norm": 1.6454601287841797, + "learning_rate": 1.6450947155347002e-05, + "loss": 1.7875, + "step": 26410 + }, + { + "epoch": 0.9392942849524487, + "grad_norm": 1.5497645139694214, + "learning_rate": 1.6447952062681456e-05, + "loss": 1.798, + "step": 26420 + }, + { + "epoch": 0.939649808905875, + "grad_norm": 1.6547778844833374, + "learning_rate": 1.6444955979650906e-05, + "loss": 1.8283, + "step": 26430 + }, + { + "epoch": 0.9400053328593014, + "grad_norm": 1.652701497077942, + "learning_rate": 1.6441958906715527e-05, + "loss": 1.7966, + "step": 26440 + }, + { + "epoch": 0.9403608568127277, + "grad_norm": 1.7806364297866821, + "learning_rate": 1.643896084433565e-05, + "loss": 1.801, + "step": 26450 + }, + { + "epoch": 0.9407163807661542, + "grad_norm": 1.5824679136276245, + "learning_rate": 1.643596179297176e-05, + "loss": 1.8239, + "step": 26460 + }, + { + "epoch": 0.9410719047195805, + "grad_norm": 1.7209099531173706, + "learning_rate": 1.6432961753084495e-05, + "loss": 1.8259, + "step": 26470 + }, + { + "epoch": 0.9414274286730069, + "grad_norm": 1.7159688472747803, + "learning_rate": 1.6429960725134634e-05, + "loss": 1.7624, + "step": 26480 + }, + { + "epoch": 0.9417829526264332, + "grad_norm": 1.6060218811035156, + "learning_rate": 1.6426958709583128e-05, + "loss": 1.8166, + "step": 26490 + }, + { + "epoch": 0.9421384765798596, + "grad_norm": 1.6274778842926025, + "learning_rate": 1.6423955706891056e-05, + "loss": 1.8193, + "step": 26500 + }, + { + "epoch": 0.9424940005332859, + "grad_norm": 1.705024003982544, + "learning_rate": 1.6420951717519672e-05, + "loss": 1.8024, + "step": 26510 + }, + { + "epoch": 0.9428495244867123, + "grad_norm": 1.687160611152649, + "learning_rate": 1.6417946741930358e-05, + "loss": 1.7968, + "step": 26520 + }, + { + "epoch": 0.9432050484401386, + "grad_norm": 1.6380635499954224, + "learning_rate": 1.641494078058467e-05, + "loss": 1.8179, + "step": 26530 + }, + { + "epoch": 0.9435605723935651, + "grad_norm": 1.748347282409668, + "learning_rate": 1.6411933833944294e-05, + "loss": 1.8204, + "step": 26540 + }, + { + "epoch": 0.9439160963469914, + "grad_norm": 1.6630667448043823, + "learning_rate": 1.640892590247109e-05, + "loss": 1.7902, + "step": 26550 + }, + { + "epoch": 0.9442716203004178, + "grad_norm": 1.6317050457000732, + "learning_rate": 1.6405916986627052e-05, + "loss": 1.8064, + "step": 26560 + }, + { + "epoch": 0.9446271442538441, + "grad_norm": 1.7291167974472046, + "learning_rate": 1.6402907086874326e-05, + "loss": 1.8267, + "step": 26570 + }, + { + "epoch": 0.9449826682072705, + "grad_norm": 1.6491247415542603, + "learning_rate": 1.6399896203675223e-05, + "loss": 1.8358, + "step": 26580 + }, + { + "epoch": 0.9453381921606968, + "grad_norm": 1.8621935844421387, + "learning_rate": 1.639688433749219e-05, + "loss": 1.8127, + "step": 26590 + }, + { + "epoch": 0.9456937161141232, + "grad_norm": 1.6204941272735596, + "learning_rate": 1.6393871488787826e-05, + "loss": 1.7839, + "step": 26600 + }, + { + "epoch": 0.9460492400675495, + "grad_norm": 1.6154526472091675, + "learning_rate": 1.6390857658024896e-05, + "loss": 1.8298, + "step": 26610 + }, + { + "epoch": 0.946404764020976, + "grad_norm": 1.6763432025909424, + "learning_rate": 1.6387842845666298e-05, + "loss": 1.8138, + "step": 26620 + }, + { + "epoch": 0.9467602879744023, + "grad_norm": 1.7188301086425781, + "learning_rate": 1.638482705217509e-05, + "loss": 1.8138, + "step": 26630 + }, + { + "epoch": 0.9471158119278287, + "grad_norm": 1.6560871601104736, + "learning_rate": 1.6381810278014486e-05, + "loss": 1.8101, + "step": 26640 + }, + { + "epoch": 0.947471335881255, + "grad_norm": 1.6502710580825806, + "learning_rate": 1.6378792523647834e-05, + "loss": 1.8108, + "step": 26650 + }, + { + "epoch": 0.9478268598346814, + "grad_norm": 1.8194656372070312, + "learning_rate": 1.6375773789538644e-05, + "loss": 1.773, + "step": 26660 + }, + { + "epoch": 0.9481823837881077, + "grad_norm": 1.7040694952011108, + "learning_rate": 1.637275407615058e-05, + "loss": 1.8403, + "step": 26670 + }, + { + "epoch": 0.9485379077415341, + "grad_norm": 1.646286964416504, + "learning_rate": 1.6369733383947445e-05, + "loss": 1.8213, + "step": 26680 + }, + { + "epoch": 0.9488934316949604, + "grad_norm": 1.5874391794204712, + "learning_rate": 1.63667117133932e-05, + "loss": 1.8257, + "step": 26690 + }, + { + "epoch": 0.9492489556483868, + "grad_norm": 1.6585197448730469, + "learning_rate": 1.6363689064951954e-05, + "loss": 1.7766, + "step": 26700 + }, + { + "epoch": 0.9496044796018132, + "grad_norm": 1.7515182495117188, + "learning_rate": 1.6360665439087973e-05, + "loss": 1.8237, + "step": 26710 + }, + { + "epoch": 0.9499600035552396, + "grad_norm": 1.6164888143539429, + "learning_rate": 1.635764083626566e-05, + "loss": 1.8089, + "step": 26720 + }, + { + "epoch": 0.9503155275086659, + "grad_norm": 1.596706748008728, + "learning_rate": 1.6354615256949578e-05, + "loss": 1.7892, + "step": 26730 + }, + { + "epoch": 0.9506710514620923, + "grad_norm": 2.4562809467315674, + "learning_rate": 1.6351588701604436e-05, + "loss": 1.7705, + "step": 26740 + }, + { + "epoch": 0.9510265754155186, + "grad_norm": 1.5344680547714233, + "learning_rate": 1.6348561170695094e-05, + "loss": 1.8005, + "step": 26750 + }, + { + "epoch": 0.951382099368945, + "grad_norm": 1.6693150997161865, + "learning_rate": 1.634553266468656e-05, + "loss": 1.8471, + "step": 26760 + }, + { + "epoch": 0.9517376233223713, + "grad_norm": 1.7177890539169312, + "learning_rate": 1.6342503184044e-05, + "loss": 1.7796, + "step": 26770 + }, + { + "epoch": 0.9520931472757977, + "grad_norm": 1.7595194578170776, + "learning_rate": 1.6339472729232716e-05, + "loss": 1.7738, + "step": 26780 + }, + { + "epoch": 0.952448671229224, + "grad_norm": 1.7894115447998047, + "learning_rate": 1.6336441300718167e-05, + "loss": 1.8033, + "step": 26790 + }, + { + "epoch": 0.9528041951826505, + "grad_norm": 1.5425448417663574, + "learning_rate": 1.6333408898965967e-05, + "loss": 1.8276, + "step": 26800 + }, + { + "epoch": 0.9531597191360768, + "grad_norm": 1.5785398483276367, + "learning_rate": 1.633037552444187e-05, + "loss": 1.778, + "step": 26810 + }, + { + "epoch": 0.9535152430895032, + "grad_norm": 1.6010870933532715, + "learning_rate": 1.6327341177611785e-05, + "loss": 1.7984, + "step": 26820 + }, + { + "epoch": 0.9538707670429295, + "grad_norm": 1.4971858263015747, + "learning_rate": 1.632430585894177e-05, + "loss": 1.8057, + "step": 26830 + }, + { + "epoch": 0.9542262909963559, + "grad_norm": 1.6898208856582642, + "learning_rate": 1.6321269568898025e-05, + "loss": 1.8113, + "step": 26840 + }, + { + "epoch": 0.9545818149497822, + "grad_norm": 1.6145386695861816, + "learning_rate": 1.6318232307946912e-05, + "loss": 1.8073, + "step": 26850 + }, + { + "epoch": 0.9549373389032086, + "grad_norm": 1.5379389524459839, + "learning_rate": 1.631519407655493e-05, + "loss": 1.8026, + "step": 26860 + }, + { + "epoch": 0.955292862856635, + "grad_norm": 1.575872540473938, + "learning_rate": 1.6312154875188733e-05, + "loss": 1.7668, + "step": 26870 + }, + { + "epoch": 0.9556483868100614, + "grad_norm": 1.650305151939392, + "learning_rate": 1.6309114704315127e-05, + "loss": 1.8223, + "step": 26880 + }, + { + "epoch": 0.9560039107634877, + "grad_norm": 1.562211275100708, + "learning_rate": 1.630607356440106e-05, + "loss": 1.8324, + "step": 26890 + }, + { + "epoch": 0.9563594347169141, + "grad_norm": 1.560794472694397, + "learning_rate": 1.630303145591363e-05, + "loss": 1.7921, + "step": 26900 + }, + { + "epoch": 0.9567149586703404, + "grad_norm": 1.6527752876281738, + "learning_rate": 1.6299988379320094e-05, + "loss": 1.755, + "step": 26910 + }, + { + "epoch": 0.9570704826237668, + "grad_norm": 1.6441879272460938, + "learning_rate": 1.6296944335087843e-05, + "loss": 1.8096, + "step": 26920 + }, + { + "epoch": 0.9574260065771931, + "grad_norm": 1.5838773250579834, + "learning_rate": 1.6293899323684422e-05, + "loss": 1.7996, + "step": 26930 + }, + { + "epoch": 0.9577815305306195, + "grad_norm": 1.5352705717086792, + "learning_rate": 1.629085334557753e-05, + "loss": 1.8115, + "step": 26940 + }, + { + "epoch": 0.9581370544840458, + "grad_norm": 1.7199256420135498, + "learning_rate": 1.6287806401235008e-05, + "loss": 1.7838, + "step": 26950 + }, + { + "epoch": 0.9584925784374723, + "grad_norm": 1.829298734664917, + "learning_rate": 1.6284758491124847e-05, + "loss": 1.7503, + "step": 26960 + }, + { + "epoch": 0.9588481023908986, + "grad_norm": 1.6695120334625244, + "learning_rate": 1.6281709615715186e-05, + "loss": 1.7866, + "step": 26970 + }, + { + "epoch": 0.959203626344325, + "grad_norm": 1.6769123077392578, + "learning_rate": 1.6278659775474318e-05, + "loss": 1.7842, + "step": 26980 + }, + { + "epoch": 0.9595591502977513, + "grad_norm": 1.6990450620651245, + "learning_rate": 1.6275608970870674e-05, + "loss": 1.7987, + "step": 26990 + }, + { + "epoch": 0.9599146742511777, + "grad_norm": 1.6821962594985962, + "learning_rate": 1.627255720237284e-05, + "loss": 1.8118, + "step": 27000 + }, + { + "epoch": 0.960270198204604, + "grad_norm": 1.6602524518966675, + "learning_rate": 1.6269504470449548e-05, + "loss": 1.8319, + "step": 27010 + }, + { + "epoch": 0.9606257221580304, + "grad_norm": 1.5797219276428223, + "learning_rate": 1.6266450775569683e-05, + "loss": 1.8308, + "step": 27020 + }, + { + "epoch": 0.9609812461114567, + "grad_norm": 1.637904405593872, + "learning_rate": 1.626339611820227e-05, + "loss": 1.7711, + "step": 27030 + }, + { + "epoch": 0.9613367700648832, + "grad_norm": 1.626842975616455, + "learning_rate": 1.626034049881648e-05, + "loss": 1.8084, + "step": 27040 + }, + { + "epoch": 0.9616922940183095, + "grad_norm": 1.7272893190383911, + "learning_rate": 1.6257283917881644e-05, + "loss": 1.7708, + "step": 27050 + }, + { + "epoch": 0.9620478179717359, + "grad_norm": 1.6516095399856567, + "learning_rate": 1.6254226375867234e-05, + "loss": 1.8228, + "step": 27060 + }, + { + "epoch": 0.9624033419251622, + "grad_norm": 1.5165112018585205, + "learning_rate": 1.6251167873242865e-05, + "loss": 1.7821, + "step": 27070 + }, + { + "epoch": 0.9627588658785886, + "grad_norm": 1.5773776769638062, + "learning_rate": 1.624810841047831e-05, + "loss": 1.8263, + "step": 27080 + }, + { + "epoch": 0.9631143898320149, + "grad_norm": 1.8211758136749268, + "learning_rate": 1.6245047988043472e-05, + "loss": 1.8452, + "step": 27090 + }, + { + "epoch": 0.9634699137854413, + "grad_norm": 1.6081113815307617, + "learning_rate": 1.6241986606408424e-05, + "loss": 1.8437, + "step": 27100 + }, + { + "epoch": 0.9638254377388676, + "grad_norm": 1.6721961498260498, + "learning_rate": 1.623892426604337e-05, + "loss": 1.7794, + "step": 27110 + }, + { + "epoch": 0.9641809616922941, + "grad_norm": 1.6835254430770874, + "learning_rate": 1.6235860967418666e-05, + "loss": 1.8349, + "step": 27120 + }, + { + "epoch": 0.9645364856457204, + "grad_norm": 1.625592589378357, + "learning_rate": 1.6232796711004817e-05, + "loss": 1.7948, + "step": 27130 + }, + { + "epoch": 0.9648920095991468, + "grad_norm": 1.7034497261047363, + "learning_rate": 1.6229731497272474e-05, + "loss": 1.7869, + "step": 27140 + }, + { + "epoch": 0.9652475335525731, + "grad_norm": 1.7472915649414062, + "learning_rate": 1.6226665326692435e-05, + "loss": 1.8121, + "step": 27150 + }, + { + "epoch": 0.9656030575059995, + "grad_norm": 1.6585270166397095, + "learning_rate": 1.622359819973564e-05, + "loss": 1.7968, + "step": 27160 + }, + { + "epoch": 0.9659585814594258, + "grad_norm": 1.6940916776657104, + "learning_rate": 1.6220530116873186e-05, + "loss": 1.8043, + "step": 27170 + }, + { + "epoch": 0.9663141054128522, + "grad_norm": 1.6721597909927368, + "learning_rate": 1.6217461078576307e-05, + "loss": 1.8089, + "step": 27180 + }, + { + "epoch": 0.9666696293662785, + "grad_norm": 1.5319979190826416, + "learning_rate": 1.6214391085316395e-05, + "loss": 1.8323, + "step": 27190 + }, + { + "epoch": 0.967025153319705, + "grad_norm": 1.6966089010238647, + "learning_rate": 1.621132013756497e-05, + "loss": 1.8028, + "step": 27200 + }, + { + "epoch": 0.9673806772731313, + "grad_norm": 1.677552580833435, + "learning_rate": 1.620824823579372e-05, + "loss": 1.7426, + "step": 27210 + }, + { + "epoch": 0.9677362012265577, + "grad_norm": 1.5825179815292358, + "learning_rate": 1.620517538047447e-05, + "loss": 1.794, + "step": 27220 + }, + { + "epoch": 0.968091725179984, + "grad_norm": 1.6787668466567993, + "learning_rate": 1.6202101572079186e-05, + "loss": 1.838, + "step": 27230 + }, + { + "epoch": 0.9684472491334104, + "grad_norm": 1.7288752794265747, + "learning_rate": 1.619902681107999e-05, + "loss": 1.8007, + "step": 27240 + }, + { + "epoch": 0.9688027730868367, + "grad_norm": 1.6401065587997437, + "learning_rate": 1.619595109794914e-05, + "loss": 1.811, + "step": 27250 + }, + { + "epoch": 0.9691582970402631, + "grad_norm": 1.6266028881072998, + "learning_rate": 1.6192874433159054e-05, + "loss": 1.7568, + "step": 27260 + }, + { + "epoch": 0.9695138209936894, + "grad_norm": 1.7548699378967285, + "learning_rate": 1.618979681718228e-05, + "loss": 1.872, + "step": 27270 + }, + { + "epoch": 0.9698693449471159, + "grad_norm": 1.5368551015853882, + "learning_rate": 1.618671825049153e-05, + "loss": 1.79, + "step": 27280 + }, + { + "epoch": 0.9702248689005422, + "grad_norm": 1.711429238319397, + "learning_rate": 1.6183638733559646e-05, + "loss": 1.8258, + "step": 27290 + }, + { + "epoch": 0.9705803928539686, + "grad_norm": 1.6672272682189941, + "learning_rate": 1.6180558266859625e-05, + "loss": 1.7776, + "step": 27300 + }, + { + "epoch": 0.9709359168073949, + "grad_norm": 1.6353920698165894, + "learning_rate": 1.6177476850864606e-05, + "loss": 1.8005, + "step": 27310 + }, + { + "epoch": 0.9712914407608213, + "grad_norm": 1.6440942287445068, + "learning_rate": 1.6174394486047874e-05, + "loss": 1.8168, + "step": 27320 + }, + { + "epoch": 0.9716469647142476, + "grad_norm": 1.7292405366897583, + "learning_rate": 1.6171311172882866e-05, + "loss": 1.806, + "step": 27330 + }, + { + "epoch": 0.972002488667674, + "grad_norm": 1.6018248796463013, + "learning_rate": 1.6168226911843155e-05, + "loss": 1.8135, + "step": 27340 + }, + { + "epoch": 0.9723580126211003, + "grad_norm": 1.6486098766326904, + "learning_rate": 1.6165141703402466e-05, + "loss": 1.8157, + "step": 27350 + }, + { + "epoch": 0.9727135365745267, + "grad_norm": 1.7306561470031738, + "learning_rate": 1.6162055548034663e-05, + "loss": 1.8098, + "step": 27360 + }, + { + "epoch": 0.9730690605279531, + "grad_norm": 1.6542526483535767, + "learning_rate": 1.6158968446213766e-05, + "loss": 1.7903, + "step": 27370 + }, + { + "epoch": 0.9734245844813795, + "grad_norm": 1.7179967164993286, + "learning_rate": 1.6155880398413938e-05, + "loss": 1.7832, + "step": 27380 + }, + { + "epoch": 0.9737801084348058, + "grad_norm": 1.8103654384613037, + "learning_rate": 1.6152791405109473e-05, + "loss": 1.8035, + "step": 27390 + }, + { + "epoch": 0.9741356323882322, + "grad_norm": 1.5433931350708008, + "learning_rate": 1.6149701466774827e-05, + "loss": 1.7961, + "step": 27400 + }, + { + "epoch": 0.9744911563416585, + "grad_norm": 1.764198899269104, + "learning_rate": 1.6146610583884598e-05, + "loss": 1.8266, + "step": 27410 + }, + { + "epoch": 0.9748466802950849, + "grad_norm": 1.7470903396606445, + "learning_rate": 1.614351875691352e-05, + "loss": 1.8265, + "step": 27420 + }, + { + "epoch": 0.9752022042485112, + "grad_norm": 1.6823208332061768, + "learning_rate": 1.614042598633648e-05, + "loss": 1.8151, + "step": 27430 + }, + { + "epoch": 0.9755577282019376, + "grad_norm": 1.5959479808807373, + "learning_rate": 1.613733227262851e-05, + "loss": 1.8363, + "step": 27440 + }, + { + "epoch": 0.975913252155364, + "grad_norm": 1.6768295764923096, + "learning_rate": 1.6134237616264784e-05, + "loss": 1.8201, + "step": 27450 + }, + { + "epoch": 0.9762687761087904, + "grad_norm": 1.6030958890914917, + "learning_rate": 1.6131142017720624e-05, + "loss": 1.7891, + "step": 27460 + }, + { + "epoch": 0.9766243000622167, + "grad_norm": 1.7325211763381958, + "learning_rate": 1.612804547747149e-05, + "loss": 1.8158, + "step": 27470 + }, + { + "epoch": 0.9769798240156431, + "grad_norm": 1.6103816032409668, + "learning_rate": 1.6124947995993e-05, + "loss": 1.7845, + "step": 27480 + }, + { + "epoch": 0.9773353479690694, + "grad_norm": 1.6940481662750244, + "learning_rate": 1.6121849573760897e-05, + "loss": 1.7905, + "step": 27490 + }, + { + "epoch": 0.9776908719224958, + "grad_norm": 1.5602481365203857, + "learning_rate": 1.6118750211251083e-05, + "loss": 1.8666, + "step": 27500 + }, + { + "epoch": 0.9780463958759221, + "grad_norm": 1.5811543464660645, + "learning_rate": 1.6115649908939603e-05, + "loss": 1.7462, + "step": 27510 + }, + { + "epoch": 0.9784019198293485, + "grad_norm": 1.6332805156707764, + "learning_rate": 1.6112548667302642e-05, + "loss": 1.8658, + "step": 27520 + }, + { + "epoch": 0.9787574437827748, + "grad_norm": 1.6110872030258179, + "learning_rate": 1.6109446486816528e-05, + "loss": 1.8511, + "step": 27530 + }, + { + "epoch": 0.9791129677362013, + "grad_norm": 1.5546345710754395, + "learning_rate": 1.6106343367957746e-05, + "loss": 1.8003, + "step": 27540 + }, + { + "epoch": 0.9794684916896276, + "grad_norm": 1.5880579948425293, + "learning_rate": 1.610323931120291e-05, + "loss": 1.7943, + "step": 27550 + }, + { + "epoch": 0.979824015643054, + "grad_norm": 1.6306613683700562, + "learning_rate": 1.610013431702878e-05, + "loss": 1.7862, + "step": 27560 + }, + { + "epoch": 0.9801795395964803, + "grad_norm": 1.6541390419006348, + "learning_rate": 1.6097028385912268e-05, + "loss": 1.7985, + "step": 27570 + }, + { + "epoch": 0.9805350635499067, + "grad_norm": 1.6070232391357422, + "learning_rate": 1.6093921518330424e-05, + "loss": 1.7954, + "step": 27580 + }, + { + "epoch": 0.980890587503333, + "grad_norm": 1.5823501348495483, + "learning_rate": 1.6090813714760442e-05, + "loss": 1.7746, + "step": 27590 + }, + { + "epoch": 0.9812461114567594, + "grad_norm": 1.5954720973968506, + "learning_rate": 1.6087704975679667e-05, + "loss": 1.8562, + "step": 27600 + }, + { + "epoch": 0.9816016354101857, + "grad_norm": 1.5298773050308228, + "learning_rate": 1.6084595301565574e-05, + "loss": 1.8005, + "step": 27610 + }, + { + "epoch": 0.9819571593636122, + "grad_norm": 1.7437852621078491, + "learning_rate": 1.60814846928958e-05, + "loss": 1.7952, + "step": 27620 + }, + { + "epoch": 0.9823126833170385, + "grad_norm": 1.7300840616226196, + "learning_rate": 1.6078373150148104e-05, + "loss": 1.844, + "step": 27630 + }, + { + "epoch": 0.9826682072704649, + "grad_norm": 1.554736852645874, + "learning_rate": 1.6075260673800404e-05, + "loss": 1.8119, + "step": 27640 + }, + { + "epoch": 0.9830237312238912, + "grad_norm": 1.6470636129379272, + "learning_rate": 1.6072147264330756e-05, + "loss": 1.8205, + "step": 27650 + }, + { + "epoch": 0.9833792551773176, + "grad_norm": 1.5953969955444336, + "learning_rate": 1.606903292221736e-05, + "loss": 1.7882, + "step": 27660 + }, + { + "epoch": 0.9837347791307439, + "grad_norm": 1.630172610282898, + "learning_rate": 1.6065917647938562e-05, + "loss": 1.8225, + "step": 27670 + }, + { + "epoch": 0.9840903030841703, + "grad_norm": 1.59041428565979, + "learning_rate": 1.6062801441972845e-05, + "loss": 1.7515, + "step": 27680 + }, + { + "epoch": 0.9844458270375966, + "grad_norm": 1.7602511644363403, + "learning_rate": 1.605968430479884e-05, + "loss": 1.7824, + "step": 27690 + }, + { + "epoch": 0.9848013509910231, + "grad_norm": 1.7128077745437622, + "learning_rate": 1.6056566236895327e-05, + "loss": 1.8333, + "step": 27700 + }, + { + "epoch": 0.9851568749444494, + "grad_norm": 1.7968146800994873, + "learning_rate": 1.605344723874121e-05, + "loss": 1.8075, + "step": 27710 + }, + { + "epoch": 0.9855123988978758, + "grad_norm": 1.613105058670044, + "learning_rate": 1.6050327310815553e-05, + "loss": 1.7623, + "step": 27720 + }, + { + "epoch": 0.9858679228513021, + "grad_norm": 1.74599289894104, + "learning_rate": 1.604720645359756e-05, + "loss": 1.7916, + "step": 27730 + }, + { + "epoch": 0.9862234468047285, + "grad_norm": 1.6392744779586792, + "learning_rate": 1.6044084667566565e-05, + "loss": 1.7953, + "step": 27740 + }, + { + "epoch": 0.9865789707581548, + "grad_norm": 1.6749186515808105, + "learning_rate": 1.6040961953202067e-05, + "loss": 1.7712, + "step": 27750 + }, + { + "epoch": 0.9869344947115812, + "grad_norm": 1.640090823173523, + "learning_rate": 1.603783831098369e-05, + "loss": 1.7474, + "step": 27760 + }, + { + "epoch": 0.9872900186650075, + "grad_norm": 1.628670334815979, + "learning_rate": 1.60347137413912e-05, + "loss": 1.7763, + "step": 27770 + }, + { + "epoch": 0.987645542618434, + "grad_norm": 1.6321513652801514, + "learning_rate": 1.6031588244904525e-05, + "loss": 1.8094, + "step": 27780 + }, + { + "epoch": 0.9880010665718603, + "grad_norm": 1.6589654684066772, + "learning_rate": 1.602846182200371e-05, + "loss": 1.8097, + "step": 27790 + }, + { + "epoch": 0.9883565905252867, + "grad_norm": 1.6615729331970215, + "learning_rate": 1.6025334473168962e-05, + "loss": 1.8256, + "step": 27800 + }, + { + "epoch": 0.988712114478713, + "grad_norm": 1.6079349517822266, + "learning_rate": 1.6022206198880616e-05, + "loss": 1.8137, + "step": 27810 + }, + { + "epoch": 0.9890676384321394, + "grad_norm": 1.6241466999053955, + "learning_rate": 1.6019076999619155e-05, + "loss": 1.7848, + "step": 27820 + }, + { + "epoch": 0.9894231623855657, + "grad_norm": 1.6167103052139282, + "learning_rate": 1.6015946875865206e-05, + "loss": 1.7953, + "step": 27830 + }, + { + "epoch": 0.9897786863389921, + "grad_norm": 1.5251038074493408, + "learning_rate": 1.601281582809954e-05, + "loss": 1.7324, + "step": 27840 + }, + { + "epoch": 0.9901342102924184, + "grad_norm": 1.5767289400100708, + "learning_rate": 1.6009683856803063e-05, + "loss": 1.7968, + "step": 27850 + }, + { + "epoch": 0.9904897342458449, + "grad_norm": 1.695135235786438, + "learning_rate": 1.6006550962456826e-05, + "loss": 1.8523, + "step": 27860 + }, + { + "epoch": 0.9908452581992712, + "grad_norm": 1.6537102460861206, + "learning_rate": 1.6003417145542025e-05, + "loss": 1.7757, + "step": 27870 + }, + { + "epoch": 0.9912007821526976, + "grad_norm": 1.7178086042404175, + "learning_rate": 1.600028240653999e-05, + "loss": 1.7893, + "step": 27880 + }, + { + "epoch": 0.9915563061061239, + "grad_norm": 1.6889991760253906, + "learning_rate": 1.5997146745932198e-05, + "loss": 1.7875, + "step": 27890 + }, + { + "epoch": 0.9919118300595503, + "grad_norm": 1.6263501644134521, + "learning_rate": 1.5994010164200268e-05, + "loss": 1.7295, + "step": 27900 + }, + { + "epoch": 0.9922673540129766, + "grad_norm": 1.5779887437820435, + "learning_rate": 1.599087266182596e-05, + "loss": 1.7868, + "step": 27910 + }, + { + "epoch": 0.992622877966403, + "grad_norm": 1.6806020736694336, + "learning_rate": 1.5987734239291177e-05, + "loss": 1.7987, + "step": 27920 + }, + { + "epoch": 0.9929784019198293, + "grad_norm": 1.634827971458435, + "learning_rate": 1.5984594897077957e-05, + "loss": 1.801, + "step": 27930 + }, + { + "epoch": 0.9933339258732558, + "grad_norm": 1.7131608724594116, + "learning_rate": 1.5981454635668483e-05, + "loss": 1.8069, + "step": 27940 + }, + { + "epoch": 0.9936894498266821, + "grad_norm": 1.5791804790496826, + "learning_rate": 1.5978313455545085e-05, + "loss": 1.7863, + "step": 27950 + }, + { + "epoch": 0.9940449737801085, + "grad_norm": 1.7399296760559082, + "learning_rate": 1.5975171357190223e-05, + "loss": 1.7426, + "step": 27960 + }, + { + "epoch": 0.9944004977335348, + "grad_norm": 1.627443552017212, + "learning_rate": 1.5972028341086502e-05, + "loss": 1.7914, + "step": 27970 + }, + { + "epoch": 0.9947560216869612, + "grad_norm": 1.7265781164169312, + "learning_rate": 1.5968884407716675e-05, + "loss": 1.7873, + "step": 27980 + }, + { + "epoch": 0.9951115456403875, + "grad_norm": 1.624820351600647, + "learning_rate": 1.5965739557563627e-05, + "loss": 1.7712, + "step": 27990 + }, + { + "epoch": 0.9954670695938139, + "grad_norm": 1.6075184345245361, + "learning_rate": 1.5962593791110388e-05, + "loss": 1.7635, + "step": 28000 + }, + { + "epoch": 0.9958225935472402, + "grad_norm": 1.6258301734924316, + "learning_rate": 1.595944710884013e-05, + "loss": 1.761, + "step": 28010 + }, + { + "epoch": 0.9961781175006666, + "grad_norm": 1.6871589422225952, + "learning_rate": 1.5956299511236163e-05, + "loss": 1.7997, + "step": 28020 + }, + { + "epoch": 0.996533641454093, + "grad_norm": 1.6195063591003418, + "learning_rate": 1.5953150998781937e-05, + "loss": 1.7717, + "step": 28030 + }, + { + "epoch": 0.9968891654075194, + "grad_norm": 1.7741018533706665, + "learning_rate": 1.595000157196104e-05, + "loss": 1.7863, + "step": 28040 + }, + { + "epoch": 0.9972446893609457, + "grad_norm": 1.6703013181686401, + "learning_rate": 1.5946851231257214e-05, + "loss": 1.7701, + "step": 28050 + }, + { + "epoch": 0.9976002133143721, + "grad_norm": 1.6615813970565796, + "learning_rate": 1.594369997715432e-05, + "loss": 1.8063, + "step": 28060 + }, + { + "epoch": 0.9979557372677984, + "grad_norm": 1.6776505708694458, + "learning_rate": 1.594054781013638e-05, + "loss": 1.7542, + "step": 28070 + }, + { + "epoch": 0.9983112612212248, + "grad_norm": 1.6818045377731323, + "learning_rate": 1.5937394730687545e-05, + "loss": 1.8273, + "step": 28080 + }, + { + "epoch": 0.9986667851746511, + "grad_norm": 1.69220769405365, + "learning_rate": 1.5934240739292105e-05, + "loss": 1.8174, + "step": 28090 + }, + { + "epoch": 0.9990223091280775, + "grad_norm": 1.7431423664093018, + "learning_rate": 1.5931085836434498e-05, + "loss": 1.8147, + "step": 28100 + }, + { + "epoch": 0.9993778330815039, + "grad_norm": 1.4685384035110474, + "learning_rate": 1.5927930022599296e-05, + "loss": 1.7891, + "step": 28110 + }, + { + "epoch": 0.9997333570349303, + "grad_norm": 1.696838617324829, + "learning_rate": 1.5924773298271207e-05, + "loss": 1.7995, + "step": 28120 + }, + { + "epoch": 0.9999822238023287, + "eval_loss": 1.8200198411941528, + "eval_runtime": 9.6453, + "eval_samples_per_second": 106.166, + "eval_steps_per_second": 1.659, + "step": 28127 + }, + { + "epoch": 1.0000888809883566, + "grad_norm": 1.7338460683822632, + "learning_rate": 1.5921615663935088e-05, + "loss": 1.7504, + "step": 28130 + }, + { + "epoch": 1.0004444049417829, + "grad_norm": 1.7272236347198486, + "learning_rate": 1.5918457120075935e-05, + "loss": 1.7509, + "step": 28140 + }, + { + "epoch": 1.0007999288952094, + "grad_norm": 1.6638456583023071, + "learning_rate": 1.5915297667178876e-05, + "loss": 1.7167, + "step": 28150 + }, + { + "epoch": 1.0011554528486357, + "grad_norm": 1.5815426111221313, + "learning_rate": 1.5912137305729184e-05, + "loss": 1.6781, + "step": 28160 + }, + { + "epoch": 1.001510976802062, + "grad_norm": 1.709189534187317, + "learning_rate": 1.590897603621227e-05, + "loss": 1.724, + "step": 28170 + }, + { + "epoch": 1.0018665007554883, + "grad_norm": 1.631792664527893, + "learning_rate": 1.5905813859113685e-05, + "loss": 1.7253, + "step": 28180 + }, + { + "epoch": 1.0022220247089149, + "grad_norm": 1.7928650379180908, + "learning_rate": 1.5902650774919126e-05, + "loss": 1.7281, + "step": 28190 + }, + { + "epoch": 1.0025775486623412, + "grad_norm": 1.7226221561431885, + "learning_rate": 1.5899486784114416e-05, + "loss": 1.7169, + "step": 28200 + }, + { + "epoch": 1.0029330726157675, + "grad_norm": 1.6352717876434326, + "learning_rate": 1.5896321887185524e-05, + "loss": 1.6984, + "step": 28210 + }, + { + "epoch": 1.0032885965691938, + "grad_norm": 1.6581904888153076, + "learning_rate": 1.5893156084618563e-05, + "loss": 1.6843, + "step": 28220 + }, + { + "epoch": 1.0036441205226203, + "grad_norm": 1.7965270280838013, + "learning_rate": 1.5889989376899777e-05, + "loss": 1.7198, + "step": 28230 + }, + { + "epoch": 1.0039996444760466, + "grad_norm": 1.7627277374267578, + "learning_rate": 1.5886821764515552e-05, + "loss": 1.7669, + "step": 28240 + }, + { + "epoch": 1.004355168429473, + "grad_norm": 1.6812225580215454, + "learning_rate": 1.5883653247952415e-05, + "loss": 1.6532, + "step": 28250 + }, + { + "epoch": 1.0047106923828992, + "grad_norm": 1.7469241619110107, + "learning_rate": 1.588048382769703e-05, + "loss": 1.7292, + "step": 28260 + }, + { + "epoch": 1.0050662163363258, + "grad_norm": 1.6404558420181274, + "learning_rate": 1.5877313504236203e-05, + "loss": 1.7052, + "step": 28270 + }, + { + "epoch": 1.005421740289752, + "grad_norm": 1.7563822269439697, + "learning_rate": 1.5874142278056867e-05, + "loss": 1.7044, + "step": 28280 + }, + { + "epoch": 1.0057772642431784, + "grad_norm": 1.6199296712875366, + "learning_rate": 1.5870970149646113e-05, + "loss": 1.7766, + "step": 28290 + }, + { + "epoch": 1.0061327881966047, + "grad_norm": 1.7421317100524902, + "learning_rate": 1.5867797119491154e-05, + "loss": 1.7037, + "step": 28300 + }, + { + "epoch": 1.0064883121500312, + "grad_norm": 1.7849583625793457, + "learning_rate": 1.586462318807935e-05, + "loss": 1.6933, + "step": 28310 + }, + { + "epoch": 1.0068438361034575, + "grad_norm": 1.7753171920776367, + "learning_rate": 1.586144835589819e-05, + "loss": 1.7077, + "step": 28320 + }, + { + "epoch": 1.0071993600568838, + "grad_norm": 1.9485249519348145, + "learning_rate": 1.585827262343532e-05, + "loss": 1.6975, + "step": 28330 + }, + { + "epoch": 1.0075548840103101, + "grad_norm": 1.6119439601898193, + "learning_rate": 1.5855095991178507e-05, + "loss": 1.691, + "step": 28340 + }, + { + "epoch": 1.0079104079637367, + "grad_norm": 1.7305185794830322, + "learning_rate": 1.5851918459615658e-05, + "loss": 1.7579, + "step": 28350 + }, + { + "epoch": 1.008265931917163, + "grad_norm": 1.776729702949524, + "learning_rate": 1.584874002923483e-05, + "loss": 1.6817, + "step": 28360 + }, + { + "epoch": 1.0086214558705893, + "grad_norm": 1.773091197013855, + "learning_rate": 1.58455607005242e-05, + "loss": 1.7401, + "step": 28370 + }, + { + "epoch": 1.0089769798240156, + "grad_norm": 1.6762112379074097, + "learning_rate": 1.5842380473972103e-05, + "loss": 1.6955, + "step": 28380 + }, + { + "epoch": 1.009332503777442, + "grad_norm": 1.6705354452133179, + "learning_rate": 1.5839199350066994e-05, + "loss": 1.7084, + "step": 28390 + }, + { + "epoch": 1.0096880277308684, + "grad_norm": 1.9015192985534668, + "learning_rate": 1.5836017329297477e-05, + "loss": 1.6882, + "step": 28400 + }, + { + "epoch": 1.0100435516842947, + "grad_norm": 1.7727024555206299, + "learning_rate": 1.583283441215229e-05, + "loss": 1.7069, + "step": 28410 + }, + { + "epoch": 1.010399075637721, + "grad_norm": 1.7414228916168213, + "learning_rate": 1.582965059912031e-05, + "loss": 1.6679, + "step": 28420 + }, + { + "epoch": 1.0107545995911476, + "grad_norm": 1.621151089668274, + "learning_rate": 1.5826465890690556e-05, + "loss": 1.6997, + "step": 28430 + }, + { + "epoch": 1.0111101235445739, + "grad_norm": 1.7509541511535645, + "learning_rate": 1.5823280287352167e-05, + "loss": 1.7534, + "step": 28440 + }, + { + "epoch": 1.0114656474980002, + "grad_norm": 1.6893500089645386, + "learning_rate": 1.5820093789594436e-05, + "loss": 1.7204, + "step": 28450 + }, + { + "epoch": 1.0118211714514265, + "grad_norm": 1.8166451454162598, + "learning_rate": 1.5816906397906796e-05, + "loss": 1.7273, + "step": 28460 + }, + { + "epoch": 1.012176695404853, + "grad_norm": 1.7543953657150269, + "learning_rate": 1.5813718112778805e-05, + "loss": 1.7063, + "step": 28470 + }, + { + "epoch": 1.0125322193582793, + "grad_norm": 1.7828388214111328, + "learning_rate": 1.5810528934700163e-05, + "loss": 1.6872, + "step": 28480 + }, + { + "epoch": 1.0128877433117056, + "grad_norm": 1.7891048192977905, + "learning_rate": 1.580733886416071e-05, + "loss": 1.6516, + "step": 28490 + }, + { + "epoch": 1.013243267265132, + "grad_norm": 1.7601685523986816, + "learning_rate": 1.5804147901650416e-05, + "loss": 1.7025, + "step": 28500 + }, + { + "epoch": 1.0135987912185584, + "grad_norm": 1.7422977685928345, + "learning_rate": 1.5800956047659403e-05, + "loss": 1.7617, + "step": 28510 + }, + { + "epoch": 1.0139543151719848, + "grad_norm": 1.6051067113876343, + "learning_rate": 1.5797763302677908e-05, + "loss": 1.7054, + "step": 28520 + }, + { + "epoch": 1.014309839125411, + "grad_norm": 1.8280378580093384, + "learning_rate": 1.5794569667196324e-05, + "loss": 1.7477, + "step": 28530 + }, + { + "epoch": 1.0146653630788374, + "grad_norm": 1.639920949935913, + "learning_rate": 1.579137514170517e-05, + "loss": 1.7057, + "step": 28540 + }, + { + "epoch": 1.015020887032264, + "grad_norm": 1.7240723371505737, + "learning_rate": 1.5788179726695107e-05, + "loss": 1.6872, + "step": 28550 + }, + { + "epoch": 1.0153764109856902, + "grad_norm": 1.7161904573440552, + "learning_rate": 1.578498342265693e-05, + "loss": 1.6863, + "step": 28560 + }, + { + "epoch": 1.0157319349391165, + "grad_norm": 1.6381793022155762, + "learning_rate": 1.5781786230081576e-05, + "loss": 1.7147, + "step": 28570 + }, + { + "epoch": 1.0160874588925428, + "grad_norm": 1.7279282808303833, + "learning_rate": 1.5778588149460104e-05, + "loss": 1.7304, + "step": 28580 + }, + { + "epoch": 1.0164429828459693, + "grad_norm": 1.7217353582382202, + "learning_rate": 1.5775389181283727e-05, + "loss": 1.7024, + "step": 28590 + }, + { + "epoch": 1.0167985067993957, + "grad_norm": 1.6571637392044067, + "learning_rate": 1.5772189326043782e-05, + "loss": 1.7066, + "step": 28600 + }, + { + "epoch": 1.017154030752822, + "grad_norm": 1.8319371938705444, + "learning_rate": 1.5768988584231748e-05, + "loss": 1.6951, + "step": 28610 + }, + { + "epoch": 1.0175095547062483, + "grad_norm": 1.726785659790039, + "learning_rate": 1.5765786956339238e-05, + "loss": 1.7702, + "step": 28620 + }, + { + "epoch": 1.0178650786596748, + "grad_norm": 1.6269569396972656, + "learning_rate": 1.576258444285801e-05, + "loss": 1.6715, + "step": 28630 + }, + { + "epoch": 1.018220602613101, + "grad_norm": 1.6807979345321655, + "learning_rate": 1.5759381044279936e-05, + "loss": 1.7301, + "step": 28640 + }, + { + "epoch": 1.0185761265665274, + "grad_norm": 1.6536914110183716, + "learning_rate": 1.5756176761097048e-05, + "loss": 1.7087, + "step": 28650 + }, + { + "epoch": 1.0189316505199537, + "grad_norm": 1.7635669708251953, + "learning_rate": 1.57529715938015e-05, + "loss": 1.6841, + "step": 28660 + }, + { + "epoch": 1.0192871744733802, + "grad_norm": 1.760072946548462, + "learning_rate": 1.574976554288559e-05, + "loss": 1.7422, + "step": 28670 + }, + { + "epoch": 1.0196426984268065, + "grad_norm": 1.6653392314910889, + "learning_rate": 1.574655860884174e-05, + "loss": 1.6776, + "step": 28680 + }, + { + "epoch": 1.0199982223802329, + "grad_norm": 1.7768166065216064, + "learning_rate": 1.574335079216252e-05, + "loss": 1.7074, + "step": 28690 + }, + { + "epoch": 1.0203537463336592, + "grad_norm": 1.742034912109375, + "learning_rate": 1.5740142093340632e-05, + "loss": 1.6839, + "step": 28700 + }, + { + "epoch": 1.0207092702870857, + "grad_norm": 1.6778193712234497, + "learning_rate": 1.5736932512868904e-05, + "loss": 1.6873, + "step": 28710 + }, + { + "epoch": 1.021064794240512, + "grad_norm": 1.7619653940200806, + "learning_rate": 1.5733722051240318e-05, + "loss": 1.7114, + "step": 28720 + }, + { + "epoch": 1.0214203181939383, + "grad_norm": 1.6607322692871094, + "learning_rate": 1.573051070894797e-05, + "loss": 1.6773, + "step": 28730 + }, + { + "epoch": 1.0217758421473646, + "grad_norm": 1.7014106512069702, + "learning_rate": 1.5727298486485112e-05, + "loss": 1.7189, + "step": 28740 + }, + { + "epoch": 1.0221313661007911, + "grad_norm": 1.695156216621399, + "learning_rate": 1.572408538434512e-05, + "loss": 1.7163, + "step": 28750 + }, + { + "epoch": 1.0224868900542174, + "grad_norm": 1.7573307752609253, + "learning_rate": 1.57208714030215e-05, + "loss": 1.7234, + "step": 28760 + }, + { + "epoch": 1.0228424140076438, + "grad_norm": 1.7744965553283691, + "learning_rate": 1.5717656543007896e-05, + "loss": 1.704, + "step": 28770 + }, + { + "epoch": 1.02319793796107, + "grad_norm": 1.8379814624786377, + "learning_rate": 1.5714440804798105e-05, + "loss": 1.7446, + "step": 28780 + }, + { + "epoch": 1.0235534619144966, + "grad_norm": 1.6961009502410889, + "learning_rate": 1.5711224188886035e-05, + "loss": 1.6822, + "step": 28790 + }, + { + "epoch": 1.023908985867923, + "grad_norm": 1.6482083797454834, + "learning_rate": 1.5708006695765737e-05, + "loss": 1.6785, + "step": 28800 + }, + { + "epoch": 1.0242645098213492, + "grad_norm": 1.6825973987579346, + "learning_rate": 1.5704788325931403e-05, + "loss": 1.7216, + "step": 28810 + }, + { + "epoch": 1.0246200337747755, + "grad_norm": 1.7570611238479614, + "learning_rate": 1.570156907987735e-05, + "loss": 1.6912, + "step": 28820 + }, + { + "epoch": 1.024975557728202, + "grad_norm": 1.7243115901947021, + "learning_rate": 1.5698348958098035e-05, + "loss": 1.7154, + "step": 28830 + }, + { + "epoch": 1.0253310816816283, + "grad_norm": 1.759553074836731, + "learning_rate": 1.569512796108805e-05, + "loss": 1.733, + "step": 28840 + }, + { + "epoch": 1.0256866056350546, + "grad_norm": 1.828395962715149, + "learning_rate": 1.569190608934212e-05, + "loss": 1.7296, + "step": 28850 + }, + { + "epoch": 1.026042129588481, + "grad_norm": 1.7379239797592163, + "learning_rate": 1.56886833433551e-05, + "loss": 1.6896, + "step": 28860 + }, + { + "epoch": 1.0263976535419075, + "grad_norm": 1.7324833869934082, + "learning_rate": 1.5685459723621987e-05, + "loss": 1.7296, + "step": 28870 + }, + { + "epoch": 1.0267531774953338, + "grad_norm": 1.8051363229751587, + "learning_rate": 1.5682235230637913e-05, + "loss": 1.6911, + "step": 28880 + }, + { + "epoch": 1.02710870144876, + "grad_norm": 1.824020266532898, + "learning_rate": 1.567900986489813e-05, + "loss": 1.6975, + "step": 28890 + }, + { + "epoch": 1.0274642254021864, + "grad_norm": 1.8758701086044312, + "learning_rate": 1.5675783626898043e-05, + "loss": 1.7233, + "step": 28900 + }, + { + "epoch": 1.027819749355613, + "grad_norm": 1.7881940603256226, + "learning_rate": 1.5672556517133177e-05, + "loss": 1.6468, + "step": 28910 + }, + { + "epoch": 1.0281752733090392, + "grad_norm": 1.7071408033370972, + "learning_rate": 1.5669328536099196e-05, + "loss": 1.7275, + "step": 28920 + }, + { + "epoch": 1.0285307972624655, + "grad_norm": 1.6805064678192139, + "learning_rate": 1.56660996842919e-05, + "loss": 1.7205, + "step": 28930 + }, + { + "epoch": 1.0288863212158919, + "grad_norm": 1.7511591911315918, + "learning_rate": 1.566286996220722e-05, + "loss": 1.7114, + "step": 28940 + }, + { + "epoch": 1.0292418451693184, + "grad_norm": 1.841711163520813, + "learning_rate": 1.565963937034122e-05, + "loss": 1.688, + "step": 28950 + }, + { + "epoch": 1.0295973691227447, + "grad_norm": 1.788758635520935, + "learning_rate": 1.5656407909190096e-05, + "loss": 1.701, + "step": 28960 + }, + { + "epoch": 1.029952893076171, + "grad_norm": 1.80824613571167, + "learning_rate": 1.5653175579250186e-05, + "loss": 1.7522, + "step": 28970 + }, + { + "epoch": 1.0303084170295973, + "grad_norm": 1.8207590579986572, + "learning_rate": 1.5649942381017953e-05, + "loss": 1.7089, + "step": 28980 + }, + { + "epoch": 1.0306639409830238, + "grad_norm": 1.8223505020141602, + "learning_rate": 1.5646708314989997e-05, + "loss": 1.7478, + "step": 28990 + }, + { + "epoch": 1.0310194649364501, + "grad_norm": 1.8172624111175537, + "learning_rate": 1.5643473381663047e-05, + "loss": 1.7021, + "step": 29000 + }, + { + "epoch": 1.0313749888898764, + "grad_norm": 1.7984023094177246, + "learning_rate": 1.5640237581533967e-05, + "loss": 1.7288, + "step": 29010 + }, + { + "epoch": 1.0317305128433027, + "grad_norm": 1.8102020025253296, + "learning_rate": 1.5637000915099766e-05, + "loss": 1.733, + "step": 29020 + }, + { + "epoch": 1.0320860367967293, + "grad_norm": 1.7115474939346313, + "learning_rate": 1.5633763382857562e-05, + "loss": 1.7229, + "step": 29030 + }, + { + "epoch": 1.0324415607501556, + "grad_norm": 1.827752709388733, + "learning_rate": 1.563052498530463e-05, + "loss": 1.6787, + "step": 29040 + }, + { + "epoch": 1.032797084703582, + "grad_norm": 1.7382855415344238, + "learning_rate": 1.5627285722938363e-05, + "loss": 1.7076, + "step": 29050 + }, + { + "epoch": 1.0331526086570082, + "grad_norm": 1.73077392578125, + "learning_rate": 1.562404559625629e-05, + "loss": 1.7124, + "step": 29060 + }, + { + "epoch": 1.0335081326104347, + "grad_norm": 1.7690300941467285, + "learning_rate": 1.5620804605756082e-05, + "loss": 1.7559, + "step": 29070 + }, + { + "epoch": 1.033863656563861, + "grad_norm": 1.8357259035110474, + "learning_rate": 1.5617562751935525e-05, + "loss": 1.6902, + "step": 29080 + }, + { + "epoch": 1.0342191805172873, + "grad_norm": 1.6743861436843872, + "learning_rate": 1.5614320035292555e-05, + "loss": 1.7236, + "step": 29090 + }, + { + "epoch": 1.0345747044707136, + "grad_norm": 1.7483441829681396, + "learning_rate": 1.5611076456325226e-05, + "loss": 1.711, + "step": 29100 + }, + { + "epoch": 1.0349302284241402, + "grad_norm": 1.6104241609573364, + "learning_rate": 1.5607832015531736e-05, + "loss": 1.7007, + "step": 29110 + }, + { + "epoch": 1.0352857523775665, + "grad_norm": 1.6402735710144043, + "learning_rate": 1.560458671341041e-05, + "loss": 1.7371, + "step": 29120 + }, + { + "epoch": 1.0356412763309928, + "grad_norm": 1.6589477062225342, + "learning_rate": 1.5601340550459708e-05, + "loss": 1.6802, + "step": 29130 + }, + { + "epoch": 1.035996800284419, + "grad_norm": 1.6863501071929932, + "learning_rate": 1.559809352717822e-05, + "loss": 1.7383, + "step": 29140 + }, + { + "epoch": 1.0363523242378456, + "grad_norm": 1.7066422700881958, + "learning_rate": 1.559484564406466e-05, + "loss": 1.7307, + "step": 29150 + }, + { + "epoch": 1.036707848191272, + "grad_norm": 1.7184172868728638, + "learning_rate": 1.5591596901617892e-05, + "loss": 1.6952, + "step": 29160 + }, + { + "epoch": 1.0370633721446982, + "grad_norm": 1.7480359077453613, + "learning_rate": 1.55883473003369e-05, + "loss": 1.7233, + "step": 29170 + }, + { + "epoch": 1.0374188960981245, + "grad_norm": 1.810819387435913, + "learning_rate": 1.55850968407208e-05, + "loss": 1.7218, + "step": 29180 + }, + { + "epoch": 1.037774420051551, + "grad_norm": 1.704298734664917, + "learning_rate": 1.5581845523268847e-05, + "loss": 1.6839, + "step": 29190 + }, + { + "epoch": 1.0381299440049774, + "grad_norm": 1.755265474319458, + "learning_rate": 1.557859334848042e-05, + "loss": 1.7393, + "step": 29200 + }, + { + "epoch": 1.0384854679584037, + "grad_norm": 1.7880268096923828, + "learning_rate": 1.557534031685503e-05, + "loss": 1.7197, + "step": 29210 + }, + { + "epoch": 1.03884099191183, + "grad_norm": 1.7886680364608765, + "learning_rate": 1.5572086428892325e-05, + "loss": 1.7318, + "step": 29220 + }, + { + "epoch": 1.0391965158652565, + "grad_norm": 1.8346315622329712, + "learning_rate": 1.5568831685092083e-05, + "loss": 1.7039, + "step": 29230 + }, + { + "epoch": 1.0395520398186828, + "grad_norm": 1.7319073677062988, + "learning_rate": 1.5565576085954213e-05, + "loss": 1.7023, + "step": 29240 + }, + { + "epoch": 1.0399075637721091, + "grad_norm": 1.759717583656311, + "learning_rate": 1.556231963197875e-05, + "loss": 1.6992, + "step": 29250 + }, + { + "epoch": 1.0402630877255354, + "grad_norm": 1.7966276407241821, + "learning_rate": 1.555906232366587e-05, + "loss": 1.7402, + "step": 29260 + }, + { + "epoch": 1.040618611678962, + "grad_norm": 1.7950040102005005, + "learning_rate": 1.555580416151587e-05, + "loss": 1.6918, + "step": 29270 + }, + { + "epoch": 1.0409741356323883, + "grad_norm": 1.7796049118041992, + "learning_rate": 1.555254514602919e-05, + "loss": 1.7275, + "step": 29280 + }, + { + "epoch": 1.0413296595858146, + "grad_norm": 1.8265540599822998, + "learning_rate": 1.554928527770638e-05, + "loss": 1.679, + "step": 29290 + }, + { + "epoch": 1.0416851835392409, + "grad_norm": 1.799001693725586, + "learning_rate": 1.5546024557048157e-05, + "loss": 1.6946, + "step": 29300 + }, + { + "epoch": 1.0420407074926674, + "grad_norm": 1.9293655157089233, + "learning_rate": 1.5542762984555332e-05, + "loss": 1.7243, + "step": 29310 + }, + { + "epoch": 1.0423962314460937, + "grad_norm": 1.7115957736968994, + "learning_rate": 1.5539500560728865e-05, + "loss": 1.6875, + "step": 29320 + }, + { + "epoch": 1.04275175539952, + "grad_norm": 1.7396397590637207, + "learning_rate": 1.5536237286069847e-05, + "loss": 1.7303, + "step": 29330 + }, + { + "epoch": 1.0431072793529463, + "grad_norm": 1.6194676160812378, + "learning_rate": 1.553297316107949e-05, + "loss": 1.6769, + "step": 29340 + }, + { + "epoch": 1.0434628033063729, + "grad_norm": 1.8141989707946777, + "learning_rate": 1.552970818625915e-05, + "loss": 1.7001, + "step": 29350 + }, + { + "epoch": 1.0438183272597992, + "grad_norm": 1.7746583223342896, + "learning_rate": 1.5526442362110304e-05, + "loss": 1.7168, + "step": 29360 + }, + { + "epoch": 1.0441738512132255, + "grad_norm": 1.7249982357025146, + "learning_rate": 1.5523175689134563e-05, + "loss": 1.7132, + "step": 29370 + }, + { + "epoch": 1.0445293751666518, + "grad_norm": 1.8156765699386597, + "learning_rate": 1.551990816783367e-05, + "loss": 1.681, + "step": 29380 + }, + { + "epoch": 1.0448848991200783, + "grad_norm": 1.779907464981079, + "learning_rate": 1.5516639798709484e-05, + "loss": 1.7307, + "step": 29390 + }, + { + "epoch": 1.0452404230735046, + "grad_norm": 1.8506804704666138, + "learning_rate": 1.551337058226402e-05, + "loss": 1.714, + "step": 29400 + }, + { + "epoch": 1.045595947026931, + "grad_norm": 1.7650202512741089, + "learning_rate": 1.5510100518999407e-05, + "loss": 1.6814, + "step": 29410 + }, + { + "epoch": 1.0459514709803572, + "grad_norm": 1.693860411643982, + "learning_rate": 1.5506829609417896e-05, + "loss": 1.6948, + "step": 29420 + }, + { + "epoch": 1.0463069949337838, + "grad_norm": 1.6548007726669312, + "learning_rate": 1.5503557854021888e-05, + "loss": 1.6587, + "step": 29430 + }, + { + "epoch": 1.04666251888721, + "grad_norm": 1.7331669330596924, + "learning_rate": 1.5500285253313904e-05, + "loss": 1.7542, + "step": 29440 + }, + { + "epoch": 1.0470180428406364, + "grad_norm": 1.8393986225128174, + "learning_rate": 1.5497011807796586e-05, + "loss": 1.7134, + "step": 29450 + }, + { + "epoch": 1.0473735667940627, + "grad_norm": 1.802408218383789, + "learning_rate": 1.5493737517972728e-05, + "loss": 1.6933, + "step": 29460 + }, + { + "epoch": 1.0477290907474892, + "grad_norm": 1.9663327932357788, + "learning_rate": 1.5490462384345228e-05, + "loss": 1.6986, + "step": 29470 + }, + { + "epoch": 1.0480846147009155, + "grad_norm": 1.8177143335342407, + "learning_rate": 1.5487186407417133e-05, + "loss": 1.7366, + "step": 29480 + }, + { + "epoch": 1.0484401386543418, + "grad_norm": 1.6974382400512695, + "learning_rate": 1.548390958769161e-05, + "loss": 1.7108, + "step": 29490 + }, + { + "epoch": 1.0487956626077681, + "grad_norm": 1.7218098640441895, + "learning_rate": 1.548063192567196e-05, + "loss": 1.6861, + "step": 29500 + }, + { + "epoch": 1.0491511865611947, + "grad_norm": 1.8506237268447876, + "learning_rate": 1.547735342186161e-05, + "loss": 1.7004, + "step": 29510 + }, + { + "epoch": 1.049506710514621, + "grad_norm": 1.6885756254196167, + "learning_rate": 1.5474074076764116e-05, + "loss": 1.6575, + "step": 29520 + }, + { + "epoch": 1.0498622344680473, + "grad_norm": 1.838579535484314, + "learning_rate": 1.5470793890883167e-05, + "loss": 1.6883, + "step": 29530 + }, + { + "epoch": 1.0502177584214736, + "grad_norm": 1.7549519538879395, + "learning_rate": 1.5467512864722576e-05, + "loss": 1.6986, + "step": 29540 + }, + { + "epoch": 1.0505732823749, + "grad_norm": 1.7127375602722168, + "learning_rate": 1.5464230998786295e-05, + "loss": 1.676, + "step": 29550 + }, + { + "epoch": 1.0509288063283264, + "grad_norm": 1.8736270666122437, + "learning_rate": 1.5460948293578395e-05, + "loss": 1.7349, + "step": 29560 + }, + { + "epoch": 1.0512843302817527, + "grad_norm": 1.725990653038025, + "learning_rate": 1.545766474960307e-05, + "loss": 1.6781, + "step": 29570 + }, + { + "epoch": 1.051639854235179, + "grad_norm": 1.744989275932312, + "learning_rate": 1.5454380367364668e-05, + "loss": 1.7184, + "step": 29580 + }, + { + "epoch": 1.0519953781886056, + "grad_norm": 1.7831934690475464, + "learning_rate": 1.5451095147367637e-05, + "loss": 1.7057, + "step": 29590 + }, + { + "epoch": 1.0523509021420319, + "grad_norm": 1.7298821210861206, + "learning_rate": 1.5447809090116566e-05, + "loss": 1.7336, + "step": 29600 + }, + { + "epoch": 1.0527064260954582, + "grad_norm": 1.6789156198501587, + "learning_rate": 1.5444522196116182e-05, + "loss": 1.7201, + "step": 29610 + }, + { + "epoch": 1.0530619500488845, + "grad_norm": 1.647962212562561, + "learning_rate": 1.5441234465871323e-05, + "loss": 1.7174, + "step": 29620 + }, + { + "epoch": 1.053417474002311, + "grad_norm": 1.9207396507263184, + "learning_rate": 1.543794589988697e-05, + "loss": 1.7256, + "step": 29630 + }, + { + "epoch": 1.0537729979557373, + "grad_norm": 1.7186084985733032, + "learning_rate": 1.543465649866822e-05, + "loss": 1.6727, + "step": 29640 + }, + { + "epoch": 1.0541285219091636, + "grad_norm": 1.6825263500213623, + "learning_rate": 1.5431366262720313e-05, + "loss": 1.7493, + "step": 29650 + }, + { + "epoch": 1.05448404586259, + "grad_norm": 1.9607890844345093, + "learning_rate": 1.5428075192548594e-05, + "loss": 1.6711, + "step": 29660 + }, + { + "epoch": 1.0548395698160165, + "grad_norm": 1.878871202468872, + "learning_rate": 1.5424783288658564e-05, + "loss": 1.7349, + "step": 29670 + }, + { + "epoch": 1.0551950937694428, + "grad_norm": 1.799309492111206, + "learning_rate": 1.5421490551555838e-05, + "loss": 1.6914, + "step": 29680 + }, + { + "epoch": 1.055550617722869, + "grad_norm": 1.80063796043396, + "learning_rate": 1.541819698174615e-05, + "loss": 1.7078, + "step": 29690 + }, + { + "epoch": 1.0559061416762954, + "grad_norm": 1.7400059700012207, + "learning_rate": 1.5414902579735383e-05, + "loss": 1.6727, + "step": 29700 + }, + { + "epoch": 1.056261665629722, + "grad_norm": 1.831801414489746, + "learning_rate": 1.541160734602953e-05, + "loss": 1.6939, + "step": 29710 + }, + { + "epoch": 1.0566171895831482, + "grad_norm": 1.7798649072647095, + "learning_rate": 1.540831128113472e-05, + "loss": 1.7225, + "step": 29720 + }, + { + "epoch": 1.0569727135365745, + "grad_norm": 1.744949221611023, + "learning_rate": 1.5405014385557208e-05, + "loss": 1.7369, + "step": 29730 + }, + { + "epoch": 1.0573282374900008, + "grad_norm": 1.7667673826217651, + "learning_rate": 1.540171665980337e-05, + "loss": 1.6965, + "step": 29740 + }, + { + "epoch": 1.0576837614434274, + "grad_norm": 1.7514196634292603, + "learning_rate": 1.539841810437973e-05, + "loss": 1.702, + "step": 29750 + }, + { + "epoch": 1.0580392853968537, + "grad_norm": 1.6818276643753052, + "learning_rate": 1.5395118719792915e-05, + "loss": 1.7067, + "step": 29760 + }, + { + "epoch": 1.05839480935028, + "grad_norm": 1.7409169673919678, + "learning_rate": 1.539181850654969e-05, + "loss": 1.667, + "step": 29770 + }, + { + "epoch": 1.0587503333037063, + "grad_norm": 1.6888810396194458, + "learning_rate": 1.5388517465156952e-05, + "loss": 1.6842, + "step": 29780 + }, + { + "epoch": 1.0591058572571328, + "grad_norm": 1.7124814987182617, + "learning_rate": 1.5385215596121718e-05, + "loss": 1.6986, + "step": 29790 + }, + { + "epoch": 1.059461381210559, + "grad_norm": 1.6616439819335938, + "learning_rate": 1.5381912899951133e-05, + "loss": 1.6701, + "step": 29800 + }, + { + "epoch": 1.0598169051639854, + "grad_norm": 1.7271385192871094, + "learning_rate": 1.5378609377152472e-05, + "loss": 1.7024, + "step": 29810 + }, + { + "epoch": 1.0601724291174117, + "grad_norm": 1.8895982503890991, + "learning_rate": 1.5375305028233135e-05, + "loss": 1.7085, + "step": 29820 + }, + { + "epoch": 1.0605279530708382, + "grad_norm": 1.7226389646530151, + "learning_rate": 1.5371999853700647e-05, + "loss": 1.7112, + "step": 29830 + }, + { + "epoch": 1.0608834770242646, + "grad_norm": 1.779766321182251, + "learning_rate": 1.5368693854062665e-05, + "loss": 1.6956, + "step": 29840 + }, + { + "epoch": 1.0612390009776909, + "grad_norm": 1.7279249429702759, + "learning_rate": 1.536538702982697e-05, + "loss": 1.7079, + "step": 29850 + }, + { + "epoch": 1.0615945249311172, + "grad_norm": 1.8688111305236816, + "learning_rate": 1.5362079381501467e-05, + "loss": 1.6813, + "step": 29860 + }, + { + "epoch": 1.0619500488845437, + "grad_norm": 1.7290921211242676, + "learning_rate": 1.5358770909594188e-05, + "loss": 1.7075, + "step": 29870 + }, + { + "epoch": 1.06230557283797, + "grad_norm": 1.8950884342193604, + "learning_rate": 1.5355461614613306e-05, + "loss": 1.6859, + "step": 29880 + }, + { + "epoch": 1.0626610967913963, + "grad_norm": 1.83281672000885, + "learning_rate": 1.5352151497067093e-05, + "loss": 1.7043, + "step": 29890 + }, + { + "epoch": 1.0630166207448226, + "grad_norm": 1.6436896324157715, + "learning_rate": 1.534884055746397e-05, + "loss": 1.7091, + "step": 29900 + }, + { + "epoch": 1.0633721446982491, + "grad_norm": 1.8379966020584106, + "learning_rate": 1.5345528796312473e-05, + "loss": 1.703, + "step": 29910 + }, + { + "epoch": 1.0637276686516755, + "grad_norm": 1.8118281364440918, + "learning_rate": 1.5342216214121273e-05, + "loss": 1.7074, + "step": 29920 + }, + { + "epoch": 1.0640831926051018, + "grad_norm": 1.7613012790679932, + "learning_rate": 1.5338902811399154e-05, + "loss": 1.6633, + "step": 29930 + }, + { + "epoch": 1.064438716558528, + "grad_norm": 1.8082212209701538, + "learning_rate": 1.5335588588655043e-05, + "loss": 1.7131, + "step": 29940 + }, + { + "epoch": 1.0647942405119546, + "grad_norm": 1.8668187856674194, + "learning_rate": 1.5332273546397978e-05, + "loss": 1.6984, + "step": 29950 + }, + { + "epoch": 1.065149764465381, + "grad_norm": 1.8108575344085693, + "learning_rate": 1.532895768513713e-05, + "loss": 1.7179, + "step": 29960 + }, + { + "epoch": 1.0655052884188072, + "grad_norm": 1.815421462059021, + "learning_rate": 1.5325641005381793e-05, + "loss": 1.697, + "step": 29970 + }, + { + "epoch": 1.0658608123722335, + "grad_norm": 1.6751563549041748, + "learning_rate": 1.5322323507641387e-05, + "loss": 1.6947, + "step": 29980 + }, + { + "epoch": 1.06621633632566, + "grad_norm": 1.7803194522857666, + "learning_rate": 1.5319005192425466e-05, + "loss": 1.6813, + "step": 29990 + }, + { + "epoch": 1.0665718602790863, + "grad_norm": 1.685044527053833, + "learning_rate": 1.5315686060243695e-05, + "loss": 1.6922, + "step": 30000 + }, + { + "epoch": 1.0669273842325127, + "grad_norm": 1.7172998189926147, + "learning_rate": 1.5312366111605877e-05, + "loss": 1.6736, + "step": 30010 + }, + { + "epoch": 1.067282908185939, + "grad_norm": 1.677639126777649, + "learning_rate": 1.5309045347021933e-05, + "loss": 1.6962, + "step": 30020 + }, + { + "epoch": 1.0676384321393655, + "grad_norm": 1.7140579223632812, + "learning_rate": 1.530572376700191e-05, + "loss": 1.6954, + "step": 30030 + }, + { + "epoch": 1.0679939560927918, + "grad_norm": 1.7826284170150757, + "learning_rate": 1.5302401372055987e-05, + "loss": 1.7092, + "step": 30040 + }, + { + "epoch": 1.068349480046218, + "grad_norm": 1.8108514547348022, + "learning_rate": 1.5299078162694453e-05, + "loss": 1.7132, + "step": 30050 + }, + { + "epoch": 1.0687050039996444, + "grad_norm": 1.9069411754608154, + "learning_rate": 1.5295754139427743e-05, + "loss": 1.6956, + "step": 30060 + }, + { + "epoch": 1.069060527953071, + "grad_norm": 1.8205689191818237, + "learning_rate": 1.5292429302766403e-05, + "loss": 1.6801, + "step": 30070 + }, + { + "epoch": 1.0694160519064972, + "grad_norm": 1.7045793533325195, + "learning_rate": 1.5289103653221103e-05, + "loss": 1.6697, + "step": 30080 + }, + { + "epoch": 1.0697715758599236, + "grad_norm": 1.7002933025360107, + "learning_rate": 1.5285777191302648e-05, + "loss": 1.7206, + "step": 30090 + }, + { + "epoch": 1.0701270998133499, + "grad_norm": 1.7778072357177734, + "learning_rate": 1.5282449917521957e-05, + "loss": 1.7148, + "step": 30100 + }, + { + "epoch": 1.0704826237667764, + "grad_norm": 1.7735117673873901, + "learning_rate": 1.5279121832390077e-05, + "loss": 1.6865, + "step": 30110 + }, + { + "epoch": 1.0708381477202027, + "grad_norm": 1.7426199913024902, + "learning_rate": 1.5275792936418188e-05, + "loss": 1.6838, + "step": 30120 + }, + { + "epoch": 1.071193671673629, + "grad_norm": 1.8336448669433594, + "learning_rate": 1.5272463230117583e-05, + "loss": 1.7238, + "step": 30130 + }, + { + "epoch": 1.0715491956270553, + "grad_norm": 1.7430644035339355, + "learning_rate": 1.526913271399968e-05, + "loss": 1.7086, + "step": 30140 + }, + { + "epoch": 1.0719047195804818, + "grad_norm": 1.8755711317062378, + "learning_rate": 1.5265801388576034e-05, + "loss": 1.713, + "step": 30150 + }, + { + "epoch": 1.0722602435339081, + "grad_norm": 1.8830538988113403, + "learning_rate": 1.526246925435831e-05, + "loss": 1.6825, + "step": 30160 + }, + { + "epoch": 1.0726157674873344, + "grad_norm": 1.708674669265747, + "learning_rate": 1.5259136311858306e-05, + "loss": 1.6879, + "step": 30170 + }, + { + "epoch": 1.0729712914407608, + "grad_norm": 1.7258754968643188, + "learning_rate": 1.5255802561587936e-05, + "loss": 1.6937, + "step": 30180 + }, + { + "epoch": 1.0733268153941873, + "grad_norm": 1.6590732336044312, + "learning_rate": 1.525246800405925e-05, + "loss": 1.6877, + "step": 30190 + }, + { + "epoch": 1.0736823393476136, + "grad_norm": 1.7467076778411865, + "learning_rate": 1.5249132639784414e-05, + "loss": 1.7035, + "step": 30200 + }, + { + "epoch": 1.07403786330104, + "grad_norm": 1.836143136024475, + "learning_rate": 1.5245796469275714e-05, + "loss": 1.6815, + "step": 30210 + }, + { + "epoch": 1.0743933872544662, + "grad_norm": 1.9502182006835938, + "learning_rate": 1.5242459493045564e-05, + "loss": 1.7107, + "step": 30220 + }, + { + "epoch": 1.0747489112078927, + "grad_norm": 1.838701844215393, + "learning_rate": 1.5239121711606513e-05, + "loss": 1.6712, + "step": 30230 + }, + { + "epoch": 1.075104435161319, + "grad_norm": 1.7862120866775513, + "learning_rate": 1.5235783125471213e-05, + "loss": 1.7075, + "step": 30240 + }, + { + "epoch": 1.0754599591147453, + "grad_norm": 1.7898902893066406, + "learning_rate": 1.5232443735152456e-05, + "loss": 1.7522, + "step": 30250 + }, + { + "epoch": 1.0758154830681717, + "grad_norm": 1.9410507678985596, + "learning_rate": 1.5229103541163146e-05, + "loss": 1.6943, + "step": 30260 + }, + { + "epoch": 1.0761710070215982, + "grad_norm": 1.6454249620437622, + "learning_rate": 1.5225762544016318e-05, + "loss": 1.7182, + "step": 30270 + }, + { + "epoch": 1.0765265309750245, + "grad_norm": 1.7281266450881958, + "learning_rate": 1.5222420744225133e-05, + "loss": 1.7008, + "step": 30280 + }, + { + "epoch": 1.0768820549284508, + "grad_norm": 1.7373607158660889, + "learning_rate": 1.5219078142302863e-05, + "loss": 1.685, + "step": 30290 + }, + { + "epoch": 1.077237578881877, + "grad_norm": 1.750491738319397, + "learning_rate": 1.5215734738762918e-05, + "loss": 1.6883, + "step": 30300 + }, + { + "epoch": 1.0775931028353036, + "grad_norm": 1.8068252801895142, + "learning_rate": 1.5212390534118815e-05, + "loss": 1.7167, + "step": 30310 + }, + { + "epoch": 1.07794862678873, + "grad_norm": 1.6495577096939087, + "learning_rate": 1.5209045528884212e-05, + "loss": 1.6949, + "step": 30320 + }, + { + "epoch": 1.0783041507421562, + "grad_norm": 1.734872817993164, + "learning_rate": 1.5205699723572874e-05, + "loss": 1.7253, + "step": 30330 + }, + { + "epoch": 1.0786596746955825, + "grad_norm": 1.672032117843628, + "learning_rate": 1.5202353118698701e-05, + "loss": 1.7135, + "step": 30340 + }, + { + "epoch": 1.079015198649009, + "grad_norm": 1.6830453872680664, + "learning_rate": 1.5199005714775705e-05, + "loss": 1.6763, + "step": 30350 + }, + { + "epoch": 1.0793707226024354, + "grad_norm": 1.9315227270126343, + "learning_rate": 1.5195657512318032e-05, + "loss": 1.6966, + "step": 30360 + }, + { + "epoch": 1.0797262465558617, + "grad_norm": 1.7042914628982544, + "learning_rate": 1.5192308511839942e-05, + "loss": 1.7142, + "step": 30370 + }, + { + "epoch": 1.080081770509288, + "grad_norm": 1.8929035663604736, + "learning_rate": 1.5188958713855822e-05, + "loss": 1.7066, + "step": 30380 + }, + { + "epoch": 1.0804372944627145, + "grad_norm": 1.83518385887146, + "learning_rate": 1.5185608118880172e-05, + "loss": 1.7335, + "step": 30390 + }, + { + "epoch": 1.0807928184161408, + "grad_norm": 1.8305349349975586, + "learning_rate": 1.5182256727427636e-05, + "loss": 1.6995, + "step": 30400 + }, + { + "epoch": 1.0811483423695671, + "grad_norm": 1.8416990041732788, + "learning_rate": 1.5178904540012956e-05, + "loss": 1.6695, + "step": 30410 + }, + { + "epoch": 1.0815038663229934, + "grad_norm": 1.7948909997940063, + "learning_rate": 1.5175551557151012e-05, + "loss": 1.6918, + "step": 30420 + }, + { + "epoch": 1.08185939027642, + "grad_norm": 1.766680121421814, + "learning_rate": 1.5172197779356799e-05, + "loss": 1.7204, + "step": 30430 + }, + { + "epoch": 1.0822149142298463, + "grad_norm": 1.8292341232299805, + "learning_rate": 1.5168843207145436e-05, + "loss": 1.7263, + "step": 30440 + }, + { + "epoch": 1.0825704381832726, + "grad_norm": 1.79512619972229, + "learning_rate": 1.516548784103217e-05, + "loss": 1.7272, + "step": 30450 + }, + { + "epoch": 1.082925962136699, + "grad_norm": 1.7170366048812866, + "learning_rate": 1.5162131681532355e-05, + "loss": 1.7457, + "step": 30460 + }, + { + "epoch": 1.0832814860901254, + "grad_norm": 1.8364603519439697, + "learning_rate": 1.515877472916148e-05, + "loss": 1.6868, + "step": 30470 + }, + { + "epoch": 1.0836370100435517, + "grad_norm": 1.7465304136276245, + "learning_rate": 1.5155416984435153e-05, + "loss": 1.6837, + "step": 30480 + }, + { + "epoch": 1.083992533996978, + "grad_norm": 1.655500888824463, + "learning_rate": 1.5152058447869103e-05, + "loss": 1.6866, + "step": 30490 + }, + { + "epoch": 1.0843480579504043, + "grad_norm": 1.9954131841659546, + "learning_rate": 1.5148699119979183e-05, + "loss": 1.6422, + "step": 30500 + }, + { + "epoch": 1.0847035819038309, + "grad_norm": 1.7527703046798706, + "learning_rate": 1.5145339001281355e-05, + "loss": 1.6732, + "step": 30510 + }, + { + "epoch": 1.0850591058572572, + "grad_norm": 1.789578914642334, + "learning_rate": 1.514197809229172e-05, + "loss": 1.6732, + "step": 30520 + }, + { + "epoch": 1.0854146298106835, + "grad_norm": 1.6960538625717163, + "learning_rate": 1.5138616393526491e-05, + "loss": 1.6768, + "step": 30530 + }, + { + "epoch": 1.0857701537641098, + "grad_norm": 1.7201929092407227, + "learning_rate": 1.5135253905502e-05, + "loss": 1.7124, + "step": 30540 + }, + { + "epoch": 1.0861256777175363, + "grad_norm": 1.8330485820770264, + "learning_rate": 1.513189062873471e-05, + "loss": 1.6952, + "step": 30550 + }, + { + "epoch": 1.0864812016709626, + "grad_norm": 1.7012553215026855, + "learning_rate": 1.5128526563741198e-05, + "loss": 1.7371, + "step": 30560 + }, + { + "epoch": 1.086836725624389, + "grad_norm": 1.8821933269500732, + "learning_rate": 1.5125161711038159e-05, + "loss": 1.7069, + "step": 30570 + }, + { + "epoch": 1.0871922495778152, + "grad_norm": 1.7384915351867676, + "learning_rate": 1.5121796071142418e-05, + "loss": 1.7436, + "step": 30580 + }, + { + "epoch": 1.0875477735312418, + "grad_norm": 1.8113093376159668, + "learning_rate": 1.5118429644570914e-05, + "loss": 1.6728, + "step": 30590 + }, + { + "epoch": 1.087903297484668, + "grad_norm": 1.7170952558517456, + "learning_rate": 1.511506243184071e-05, + "loss": 1.7562, + "step": 30600 + }, + { + "epoch": 1.0882588214380944, + "grad_norm": 1.7559735774993896, + "learning_rate": 1.5111694433468987e-05, + "loss": 1.7138, + "step": 30610 + }, + { + "epoch": 1.0886143453915207, + "grad_norm": 1.8377747535705566, + "learning_rate": 1.510832564997305e-05, + "loss": 1.745, + "step": 30620 + }, + { + "epoch": 1.0889698693449472, + "grad_norm": 1.7283035516738892, + "learning_rate": 1.5104956081870325e-05, + "loss": 1.6916, + "step": 30630 + }, + { + "epoch": 1.0893253932983735, + "grad_norm": 1.6927980184555054, + "learning_rate": 1.5101585729678352e-05, + "loss": 1.6908, + "step": 30640 + }, + { + "epoch": 1.0896809172517998, + "grad_norm": 1.7789459228515625, + "learning_rate": 1.5098214593914797e-05, + "loss": 1.6767, + "step": 30650 + }, + { + "epoch": 1.0900364412052261, + "grad_norm": 1.7156596183776855, + "learning_rate": 1.5094842675097448e-05, + "loss": 1.7332, + "step": 30660 + }, + { + "epoch": 1.0903919651586527, + "grad_norm": 1.7196613550186157, + "learning_rate": 1.5091469973744205e-05, + "loss": 1.7253, + "step": 30670 + }, + { + "epoch": 1.090747489112079, + "grad_norm": 1.7847352027893066, + "learning_rate": 1.5088096490373106e-05, + "loss": 1.73, + "step": 30680 + }, + { + "epoch": 1.0911030130655053, + "grad_norm": 1.7828787565231323, + "learning_rate": 1.5084722225502285e-05, + "loss": 1.6746, + "step": 30690 + }, + { + "epoch": 1.0914585370189316, + "grad_norm": 1.8798949718475342, + "learning_rate": 1.508134717965001e-05, + "loss": 1.6783, + "step": 30700 + }, + { + "epoch": 1.0918140609723581, + "grad_norm": 1.7675645351409912, + "learning_rate": 1.5077971353334669e-05, + "loss": 1.6952, + "step": 30710 + }, + { + "epoch": 1.0921695849257844, + "grad_norm": 1.8203387260437012, + "learning_rate": 1.5074594747074765e-05, + "loss": 1.7111, + "step": 30720 + }, + { + "epoch": 1.0925251088792107, + "grad_norm": 1.7350901365280151, + "learning_rate": 1.5071217361388928e-05, + "loss": 1.7125, + "step": 30730 + }, + { + "epoch": 1.092880632832637, + "grad_norm": 1.7595888376235962, + "learning_rate": 1.50678391967959e-05, + "loss": 1.7157, + "step": 30740 + }, + { + "epoch": 1.0932361567860636, + "grad_norm": 1.805652141571045, + "learning_rate": 1.506446025381455e-05, + "loss": 1.7066, + "step": 30750 + }, + { + "epoch": 1.0935916807394899, + "grad_norm": 1.7728995084762573, + "learning_rate": 1.5061080532963858e-05, + "loss": 1.7122, + "step": 30760 + }, + { + "epoch": 1.0939472046929162, + "grad_norm": 1.8523882627487183, + "learning_rate": 1.505770003476293e-05, + "loss": 1.704, + "step": 30770 + }, + { + "epoch": 1.0943027286463425, + "grad_norm": 1.8022390604019165, + "learning_rate": 1.5054318759730988e-05, + "loss": 1.6893, + "step": 30780 + }, + { + "epoch": 1.094658252599769, + "grad_norm": 1.7755857706069946, + "learning_rate": 1.5050936708387371e-05, + "loss": 1.7057, + "step": 30790 + }, + { + "epoch": 1.0950137765531953, + "grad_norm": 1.7422014474868774, + "learning_rate": 1.5047553881251551e-05, + "loss": 1.6995, + "step": 30800 + }, + { + "epoch": 1.0953693005066216, + "grad_norm": 1.6298151016235352, + "learning_rate": 1.5044170278843103e-05, + "loss": 1.7035, + "step": 30810 + }, + { + "epoch": 1.095724824460048, + "grad_norm": 1.7250490188598633, + "learning_rate": 1.5040785901681725e-05, + "loss": 1.6674, + "step": 30820 + }, + { + "epoch": 1.0960803484134745, + "grad_norm": 1.759215235710144, + "learning_rate": 1.5037400750287239e-05, + "loss": 1.6645, + "step": 30830 + }, + { + "epoch": 1.0964358723669008, + "grad_norm": 1.6420128345489502, + "learning_rate": 1.5034014825179584e-05, + "loss": 1.673, + "step": 30840 + }, + { + "epoch": 1.096791396320327, + "grad_norm": 1.8458141088485718, + "learning_rate": 1.5030628126878815e-05, + "loss": 1.7032, + "step": 30850 + }, + { + "epoch": 1.0971469202737534, + "grad_norm": 1.7907358407974243, + "learning_rate": 1.5027240655905106e-05, + "loss": 1.7256, + "step": 30860 + }, + { + "epoch": 1.09750244422718, + "grad_norm": 1.746254563331604, + "learning_rate": 1.5023852412778754e-05, + "loss": 1.7219, + "step": 30870 + }, + { + "epoch": 1.0978579681806062, + "grad_norm": 1.6866568326950073, + "learning_rate": 1.5020463398020174e-05, + "loss": 1.6771, + "step": 30880 + }, + { + "epoch": 1.0982134921340325, + "grad_norm": 1.8206251859664917, + "learning_rate": 1.5017073612149888e-05, + "loss": 1.6914, + "step": 30890 + }, + { + "epoch": 1.0985690160874588, + "grad_norm": 1.6670016050338745, + "learning_rate": 1.5013683055688559e-05, + "loss": 1.7025, + "step": 30900 + }, + { + "epoch": 1.0989245400408854, + "grad_norm": 1.6651729345321655, + "learning_rate": 1.5010291729156945e-05, + "loss": 1.7279, + "step": 30910 + }, + { + "epoch": 1.0992800639943117, + "grad_norm": 1.7703865766525269, + "learning_rate": 1.5006899633075937e-05, + "loss": 1.7285, + "step": 30920 + }, + { + "epoch": 1.099635587947738, + "grad_norm": 1.6907395124435425, + "learning_rate": 1.5003506767966541e-05, + "loss": 1.73, + "step": 30930 + }, + { + "epoch": 1.0999911119011643, + "grad_norm": 1.7951709032058716, + "learning_rate": 1.5000113134349876e-05, + "loss": 1.6835, + "step": 30940 + }, + { + "epoch": 1.1003466358545908, + "grad_norm": 1.7016987800598145, + "learning_rate": 1.4996718732747187e-05, + "loss": 1.7348, + "step": 30950 + }, + { + "epoch": 1.100702159808017, + "grad_norm": 1.8535315990447998, + "learning_rate": 1.4993323563679827e-05, + "loss": 1.6841, + "step": 30960 + }, + { + "epoch": 1.1010576837614434, + "grad_norm": 1.7085849046707153, + "learning_rate": 1.4989927627669274e-05, + "loss": 1.7157, + "step": 30970 + }, + { + "epoch": 1.1014132077148697, + "grad_norm": 1.7176827192306519, + "learning_rate": 1.4986530925237128e-05, + "loss": 1.6792, + "step": 30980 + }, + { + "epoch": 1.1017687316682963, + "grad_norm": 1.768995761871338, + "learning_rate": 1.4983133456905099e-05, + "loss": 1.7004, + "step": 30990 + }, + { + "epoch": 1.1021242556217226, + "grad_norm": 1.8381508588790894, + "learning_rate": 1.4979735223195015e-05, + "loss": 1.718, + "step": 31000 + }, + { + "epoch": 1.1024797795751489, + "grad_norm": 1.8250529766082764, + "learning_rate": 1.4976336224628822e-05, + "loss": 1.6902, + "step": 31010 + }, + { + "epoch": 1.1028353035285752, + "grad_norm": 1.8430213928222656, + "learning_rate": 1.4972936461728587e-05, + "loss": 1.7276, + "step": 31020 + }, + { + "epoch": 1.1031908274820017, + "grad_norm": 1.8098722696304321, + "learning_rate": 1.4969535935016491e-05, + "loss": 1.7129, + "step": 31030 + }, + { + "epoch": 1.103546351435428, + "grad_norm": 1.816544771194458, + "learning_rate": 1.4966134645014836e-05, + "loss": 1.7628, + "step": 31040 + }, + { + "epoch": 1.1039018753888543, + "grad_norm": 1.8728526830673218, + "learning_rate": 1.4962732592246037e-05, + "loss": 1.7386, + "step": 31050 + }, + { + "epoch": 1.1042573993422806, + "grad_norm": 1.7653038501739502, + "learning_rate": 1.495932977723263e-05, + "loss": 1.7164, + "step": 31060 + }, + { + "epoch": 1.1046129232957071, + "grad_norm": 1.8237130641937256, + "learning_rate": 1.4955926200497262e-05, + "loss": 1.6975, + "step": 31070 + }, + { + "epoch": 1.1049684472491335, + "grad_norm": 1.7338460683822632, + "learning_rate": 1.4952521862562705e-05, + "loss": 1.675, + "step": 31080 + }, + { + "epoch": 1.1053239712025598, + "grad_norm": 1.8000022172927856, + "learning_rate": 1.4949116763951844e-05, + "loss": 1.6761, + "step": 31090 + }, + { + "epoch": 1.105679495155986, + "grad_norm": 1.7959693670272827, + "learning_rate": 1.4945710905187675e-05, + "loss": 1.7275, + "step": 31100 + }, + { + "epoch": 1.1060350191094126, + "grad_norm": 1.7559113502502441, + "learning_rate": 1.4942304286793323e-05, + "loss": 1.7033, + "step": 31110 + }, + { + "epoch": 1.106390543062839, + "grad_norm": 1.808515191078186, + "learning_rate": 1.4938896909292023e-05, + "loss": 1.7032, + "step": 31120 + }, + { + "epoch": 1.1067460670162652, + "grad_norm": 1.6466903686523438, + "learning_rate": 1.4935488773207123e-05, + "loss": 1.6773, + "step": 31130 + }, + { + "epoch": 1.1071015909696915, + "grad_norm": 1.7822074890136719, + "learning_rate": 1.4932079879062094e-05, + "loss": 1.7081, + "step": 31140 + }, + { + "epoch": 1.107457114923118, + "grad_norm": 1.7436120510101318, + "learning_rate": 1.4928670227380517e-05, + "loss": 1.6902, + "step": 31150 + }, + { + "epoch": 1.1078126388765444, + "grad_norm": 1.8608921766281128, + "learning_rate": 1.4925259818686099e-05, + "loss": 1.654, + "step": 31160 + }, + { + "epoch": 1.1081681628299707, + "grad_norm": 1.692667841911316, + "learning_rate": 1.4921848653502652e-05, + "loss": 1.6963, + "step": 31170 + }, + { + "epoch": 1.108523686783397, + "grad_norm": 1.9093800783157349, + "learning_rate": 1.4918436732354117e-05, + "loss": 1.6653, + "step": 31180 + }, + { + "epoch": 1.1088792107368235, + "grad_norm": 1.6940561532974243, + "learning_rate": 1.4915024055764535e-05, + "loss": 1.7274, + "step": 31190 + }, + { + "epoch": 1.1092347346902498, + "grad_norm": 1.655745267868042, + "learning_rate": 1.4911610624258077e-05, + "loss": 1.6956, + "step": 31200 + }, + { + "epoch": 1.109590258643676, + "grad_norm": 1.78951096534729, + "learning_rate": 1.4908196438359022e-05, + "loss": 1.7363, + "step": 31210 + }, + { + "epoch": 1.1099457825971024, + "grad_norm": 1.7750624418258667, + "learning_rate": 1.4904781498591766e-05, + "loss": 1.6992, + "step": 31220 + }, + { + "epoch": 1.110301306550529, + "grad_norm": 1.730631709098816, + "learning_rate": 1.4901365805480828e-05, + "loss": 1.6887, + "step": 31230 + }, + { + "epoch": 1.1106568305039553, + "grad_norm": 1.865522027015686, + "learning_rate": 1.4897949359550837e-05, + "loss": 1.7311, + "step": 31240 + }, + { + "epoch": 1.1110123544573816, + "grad_norm": 1.821334719657898, + "learning_rate": 1.489453216132653e-05, + "loss": 1.6791, + "step": 31250 + }, + { + "epoch": 1.1113678784108079, + "grad_norm": 1.8130940198898315, + "learning_rate": 1.4891114211332776e-05, + "loss": 1.7056, + "step": 31260 + }, + { + "epoch": 1.1117234023642344, + "grad_norm": 1.850980281829834, + "learning_rate": 1.4887695510094545e-05, + "loss": 1.7552, + "step": 31270 + }, + { + "epoch": 1.1120789263176607, + "grad_norm": 1.7095885276794434, + "learning_rate": 1.4884276058136928e-05, + "loss": 1.6868, + "step": 31280 + }, + { + "epoch": 1.112434450271087, + "grad_norm": 1.7346407175064087, + "learning_rate": 1.4880855855985132e-05, + "loss": 1.6652, + "step": 31290 + }, + { + "epoch": 1.1127899742245133, + "grad_norm": 1.8015046119689941, + "learning_rate": 1.4877434904164485e-05, + "loss": 1.6931, + "step": 31300 + }, + { + "epoch": 1.1131454981779398, + "grad_norm": 1.7510502338409424, + "learning_rate": 1.4874013203200415e-05, + "loss": 1.6761, + "step": 31310 + }, + { + "epoch": 1.1135010221313661, + "grad_norm": 1.7510874271392822, + "learning_rate": 1.4870590753618478e-05, + "loss": 1.7106, + "step": 31320 + }, + { + "epoch": 1.1138565460847925, + "grad_norm": 1.8281718492507935, + "learning_rate": 1.4867167555944339e-05, + "loss": 1.6621, + "step": 31330 + }, + { + "epoch": 1.1142120700382188, + "grad_norm": 1.7633572816848755, + "learning_rate": 1.4863743610703783e-05, + "loss": 1.6881, + "step": 31340 + }, + { + "epoch": 1.1145675939916453, + "grad_norm": 1.7464405298233032, + "learning_rate": 1.48603189184227e-05, + "loss": 1.7036, + "step": 31350 + }, + { + "epoch": 1.1149231179450716, + "grad_norm": 1.7094883918762207, + "learning_rate": 1.485689347962711e-05, + "loss": 1.7104, + "step": 31360 + }, + { + "epoch": 1.115278641898498, + "grad_norm": 1.8825170993804932, + "learning_rate": 1.4853467294843134e-05, + "loss": 1.6911, + "step": 31370 + }, + { + "epoch": 1.1156341658519242, + "grad_norm": 1.8182456493377686, + "learning_rate": 1.4850040364597012e-05, + "loss": 1.7122, + "step": 31380 + }, + { + "epoch": 1.1159896898053507, + "grad_norm": 1.7755396366119385, + "learning_rate": 1.4846612689415099e-05, + "loss": 1.7314, + "step": 31390 + }, + { + "epoch": 1.116345213758777, + "grad_norm": 1.7010819911956787, + "learning_rate": 1.4843184269823867e-05, + "loss": 1.7456, + "step": 31400 + }, + { + "epoch": 1.1167007377122034, + "grad_norm": 1.8768155574798584, + "learning_rate": 1.4839755106349898e-05, + "loss": 1.7313, + "step": 31410 + }, + { + "epoch": 1.1170562616656297, + "grad_norm": 1.764375925064087, + "learning_rate": 1.4836325199519887e-05, + "loss": 1.6891, + "step": 31420 + }, + { + "epoch": 1.1174117856190562, + "grad_norm": 1.729925274848938, + "learning_rate": 1.4832894549860655e-05, + "loss": 1.723, + "step": 31430 + }, + { + "epoch": 1.1177673095724825, + "grad_norm": 1.8742669820785522, + "learning_rate": 1.4829463157899118e-05, + "loss": 1.7135, + "step": 31440 + }, + { + "epoch": 1.1181228335259088, + "grad_norm": 1.7695032358169556, + "learning_rate": 1.4826031024162321e-05, + "loss": 1.6789, + "step": 31450 + }, + { + "epoch": 1.118478357479335, + "grad_norm": 1.727602243423462, + "learning_rate": 1.482259814917742e-05, + "loss": 1.6971, + "step": 31460 + }, + { + "epoch": 1.1188338814327616, + "grad_norm": 1.7881238460540771, + "learning_rate": 1.481916453347168e-05, + "loss": 1.7014, + "step": 31470 + }, + { + "epoch": 1.119189405386188, + "grad_norm": 1.6665465831756592, + "learning_rate": 1.4815730177572487e-05, + "loss": 1.6626, + "step": 31480 + }, + { + "epoch": 1.1195449293396142, + "grad_norm": 1.7676050662994385, + "learning_rate": 1.4812295082007331e-05, + "loss": 1.6791, + "step": 31490 + }, + { + "epoch": 1.1199004532930406, + "grad_norm": 1.8006497621536255, + "learning_rate": 1.4808859247303826e-05, + "loss": 1.7094, + "step": 31500 + }, + { + "epoch": 1.120255977246467, + "grad_norm": 1.8111299276351929, + "learning_rate": 1.480542267398969e-05, + "loss": 1.7503, + "step": 31510 + }, + { + "epoch": 1.1206115011998934, + "grad_norm": 1.8131684064865112, + "learning_rate": 1.4801985362592764e-05, + "loss": 1.6797, + "step": 31520 + }, + { + "epoch": 1.1209670251533197, + "grad_norm": 1.7535158395767212, + "learning_rate": 1.4798547313640992e-05, + "loss": 1.6686, + "step": 31530 + }, + { + "epoch": 1.121322549106746, + "grad_norm": 1.8932629823684692, + "learning_rate": 1.479510852766244e-05, + "loss": 1.7397, + "step": 31540 + }, + { + "epoch": 1.1216780730601725, + "grad_norm": 1.7336300611495972, + "learning_rate": 1.4791669005185285e-05, + "loss": 1.7013, + "step": 31550 + }, + { + "epoch": 1.1220335970135988, + "grad_norm": 1.7552558183670044, + "learning_rate": 1.4788228746737816e-05, + "loss": 1.7705, + "step": 31560 + }, + { + "epoch": 1.1223891209670251, + "grad_norm": 1.7884132862091064, + "learning_rate": 1.4784787752848432e-05, + "loss": 1.684, + "step": 31570 + }, + { + "epoch": 1.1227446449204515, + "grad_norm": 1.9435439109802246, + "learning_rate": 1.478134602404565e-05, + "loss": 1.7101, + "step": 31580 + }, + { + "epoch": 1.123100168873878, + "grad_norm": 1.905207633972168, + "learning_rate": 1.4777903560858098e-05, + "loss": 1.7137, + "step": 31590 + }, + { + "epoch": 1.1234556928273043, + "grad_norm": 1.7854448556900024, + "learning_rate": 1.4774460363814518e-05, + "loss": 1.6594, + "step": 31600 + }, + { + "epoch": 1.1238112167807306, + "grad_norm": 1.77256178855896, + "learning_rate": 1.4771016433443761e-05, + "loss": 1.6195, + "step": 31610 + }, + { + "epoch": 1.124166740734157, + "grad_norm": 1.6947712898254395, + "learning_rate": 1.4767571770274796e-05, + "loss": 1.659, + "step": 31620 + }, + { + "epoch": 1.1245222646875834, + "grad_norm": 1.7803505659103394, + "learning_rate": 1.4764126374836698e-05, + "loss": 1.6957, + "step": 31630 + }, + { + "epoch": 1.1248777886410097, + "grad_norm": 1.828350305557251, + "learning_rate": 1.476068024765866e-05, + "loss": 1.7002, + "step": 31640 + }, + { + "epoch": 1.125233312594436, + "grad_norm": 1.7589242458343506, + "learning_rate": 1.4757233389269986e-05, + "loss": 1.6817, + "step": 31650 + }, + { + "epoch": 1.1255888365478623, + "grad_norm": 1.739592432975769, + "learning_rate": 1.475378580020009e-05, + "loss": 1.6981, + "step": 31660 + }, + { + "epoch": 1.1259443605012889, + "grad_norm": 1.8724303245544434, + "learning_rate": 1.4750337480978506e-05, + "loss": 1.6967, + "step": 31670 + }, + { + "epoch": 1.1262998844547152, + "grad_norm": 1.8594884872436523, + "learning_rate": 1.4746888432134868e-05, + "loss": 1.6761, + "step": 31680 + }, + { + "epoch": 1.1266554084081415, + "grad_norm": 1.7723699808120728, + "learning_rate": 1.474343865419893e-05, + "loss": 1.6856, + "step": 31690 + }, + { + "epoch": 1.1270109323615678, + "grad_norm": 1.7817522287368774, + "learning_rate": 1.4739988147700555e-05, + "loss": 1.7061, + "step": 31700 + }, + { + "epoch": 1.127366456314994, + "grad_norm": 1.6874125003814697, + "learning_rate": 1.4736536913169719e-05, + "loss": 1.7159, + "step": 31710 + }, + { + "epoch": 1.1277219802684206, + "grad_norm": 1.8645119667053223, + "learning_rate": 1.4733084951136516e-05, + "loss": 1.6798, + "step": 31720 + }, + { + "epoch": 1.128077504221847, + "grad_norm": 1.8583474159240723, + "learning_rate": 1.4729632262131137e-05, + "loss": 1.6438, + "step": 31730 + }, + { + "epoch": 1.1284330281752732, + "grad_norm": 1.8013262748718262, + "learning_rate": 1.4726178846683901e-05, + "loss": 1.7132, + "step": 31740 + }, + { + "epoch": 1.1287885521286998, + "grad_norm": 1.6753315925598145, + "learning_rate": 1.4722724705325226e-05, + "loss": 1.7321, + "step": 31750 + }, + { + "epoch": 1.129144076082126, + "grad_norm": 1.7268919944763184, + "learning_rate": 1.4719269838585645e-05, + "loss": 1.7509, + "step": 31760 + }, + { + "epoch": 1.1294996000355524, + "grad_norm": 1.669714093208313, + "learning_rate": 1.471581424699581e-05, + "loss": 1.7306, + "step": 31770 + }, + { + "epoch": 1.1298551239889787, + "grad_norm": 1.8650387525558472, + "learning_rate": 1.4712357931086474e-05, + "loss": 1.7027, + "step": 31780 + }, + { + "epoch": 1.130210647942405, + "grad_norm": 1.7554460763931274, + "learning_rate": 1.4708900891388506e-05, + "loss": 1.7183, + "step": 31790 + }, + { + "epoch": 1.1305661718958315, + "grad_norm": 1.8117382526397705, + "learning_rate": 1.4705443128432891e-05, + "loss": 1.6967, + "step": 31800 + }, + { + "epoch": 1.1309216958492578, + "grad_norm": 1.660362720489502, + "learning_rate": 1.470198464275071e-05, + "loss": 1.6875, + "step": 31810 + }, + { + "epoch": 1.1312772198026841, + "grad_norm": 1.798330545425415, + "learning_rate": 1.4698525434873173e-05, + "loss": 1.6976, + "step": 31820 + }, + { + "epoch": 1.1316327437561107, + "grad_norm": 1.8113417625427246, + "learning_rate": 1.4695065505331584e-05, + "loss": 1.6929, + "step": 31830 + }, + { + "epoch": 1.131988267709537, + "grad_norm": 1.7701830863952637, + "learning_rate": 1.4691604854657375e-05, + "loss": 1.6945, + "step": 31840 + }, + { + "epoch": 1.1323437916629633, + "grad_norm": 1.686472773551941, + "learning_rate": 1.4688143483382076e-05, + "loss": 1.7623, + "step": 31850 + }, + { + "epoch": 1.1326993156163896, + "grad_norm": 1.8108429908752441, + "learning_rate": 1.4684681392037334e-05, + "loss": 1.7211, + "step": 31860 + }, + { + "epoch": 1.133054839569816, + "grad_norm": 1.7331719398498535, + "learning_rate": 1.4681218581154904e-05, + "loss": 1.6902, + "step": 31870 + }, + { + "epoch": 1.1334103635232424, + "grad_norm": 1.9255976676940918, + "learning_rate": 1.4677755051266651e-05, + "loss": 1.7045, + "step": 31880 + }, + { + "epoch": 1.1337658874766687, + "grad_norm": 1.7227295637130737, + "learning_rate": 1.4674290802904549e-05, + "loss": 1.6928, + "step": 31890 + }, + { + "epoch": 1.134121411430095, + "grad_norm": 1.915678858757019, + "learning_rate": 1.4670825836600688e-05, + "loss": 1.6951, + "step": 31900 + }, + { + "epoch": 1.1344769353835216, + "grad_norm": 1.7096298933029175, + "learning_rate": 1.4667360152887267e-05, + "loss": 1.6935, + "step": 31910 + }, + { + "epoch": 1.1348324593369479, + "grad_norm": 1.732093334197998, + "learning_rate": 1.4663893752296589e-05, + "loss": 1.7173, + "step": 31920 + }, + { + "epoch": 1.1351879832903742, + "grad_norm": 1.7549011707305908, + "learning_rate": 1.4660426635361078e-05, + "loss": 1.6538, + "step": 31930 + }, + { + "epoch": 1.1355435072438005, + "grad_norm": 1.7300089597702026, + "learning_rate": 1.4656958802613253e-05, + "loss": 1.6919, + "step": 31940 + }, + { + "epoch": 1.1358990311972268, + "grad_norm": 1.8303147554397583, + "learning_rate": 1.4653490254585756e-05, + "loss": 1.6438, + "step": 31950 + }, + { + "epoch": 1.1362545551506533, + "grad_norm": 1.86250638961792, + "learning_rate": 1.4650020991811334e-05, + "loss": 1.6658, + "step": 31960 + }, + { + "epoch": 1.1366100791040796, + "grad_norm": 1.7613438367843628, + "learning_rate": 1.4646551014822843e-05, + "loss": 1.7075, + "step": 31970 + }, + { + "epoch": 1.136965603057506, + "grad_norm": 2.0326385498046875, + "learning_rate": 1.464308032415325e-05, + "loss": 1.6903, + "step": 31980 + }, + { + "epoch": 1.1373211270109325, + "grad_norm": 1.7846877574920654, + "learning_rate": 1.4639608920335632e-05, + "loss": 1.6944, + "step": 31990 + }, + { + "epoch": 1.1376766509643588, + "grad_norm": 1.8086317777633667, + "learning_rate": 1.4636136803903175e-05, + "loss": 1.6527, + "step": 32000 + }, + { + "epoch": 1.138032174917785, + "grad_norm": 1.8257627487182617, + "learning_rate": 1.4632663975389173e-05, + "loss": 1.7188, + "step": 32010 + }, + { + "epoch": 1.1383876988712114, + "grad_norm": 1.6944304704666138, + "learning_rate": 1.4629190435327032e-05, + "loss": 1.6477, + "step": 32020 + }, + { + "epoch": 1.1387432228246377, + "grad_norm": 1.7246496677398682, + "learning_rate": 1.4625716184250262e-05, + "loss": 1.6891, + "step": 32030 + }, + { + "epoch": 1.1390987467780642, + "grad_norm": 1.7539235353469849, + "learning_rate": 1.4622241222692495e-05, + "loss": 1.6655, + "step": 32040 + }, + { + "epoch": 1.1394542707314905, + "grad_norm": 1.825313687324524, + "learning_rate": 1.4618765551187457e-05, + "loss": 1.6999, + "step": 32050 + }, + { + "epoch": 1.1398097946849168, + "grad_norm": 1.8041131496429443, + "learning_rate": 1.4615289170268986e-05, + "loss": 1.7108, + "step": 32060 + }, + { + "epoch": 1.1401653186383434, + "grad_norm": 1.7969400882720947, + "learning_rate": 1.461181208047104e-05, + "loss": 1.6538, + "step": 32070 + }, + { + "epoch": 1.1405208425917697, + "grad_norm": 1.6379525661468506, + "learning_rate": 1.4608334282327672e-05, + "loss": 1.7106, + "step": 32080 + }, + { + "epoch": 1.140876366545196, + "grad_norm": 1.7600675821304321, + "learning_rate": 1.4604855776373056e-05, + "loss": 1.6834, + "step": 32090 + }, + { + "epoch": 1.1412318904986223, + "grad_norm": 1.7814898490905762, + "learning_rate": 1.4601376563141462e-05, + "loss": 1.7124, + "step": 32100 + }, + { + "epoch": 1.1415874144520486, + "grad_norm": 1.7427417039871216, + "learning_rate": 1.4597896643167282e-05, + "loss": 1.6646, + "step": 32110 + }, + { + "epoch": 1.1419429384054751, + "grad_norm": 1.666539192199707, + "learning_rate": 1.4594416016985005e-05, + "loss": 1.7046, + "step": 32120 + }, + { + "epoch": 1.1422984623589014, + "grad_norm": 1.7348567247390747, + "learning_rate": 1.4590934685129236e-05, + "loss": 1.6734, + "step": 32130 + }, + { + "epoch": 1.1426539863123277, + "grad_norm": 1.6932833194732666, + "learning_rate": 1.4587452648134686e-05, + "loss": 1.7141, + "step": 32140 + }, + { + "epoch": 1.1430095102657543, + "grad_norm": 1.7642148733139038, + "learning_rate": 1.4583969906536168e-05, + "loss": 1.7083, + "step": 32150 + }, + { + "epoch": 1.1433650342191806, + "grad_norm": 1.7768909931182861, + "learning_rate": 1.4580486460868616e-05, + "loss": 1.7396, + "step": 32160 + }, + { + "epoch": 1.1437205581726069, + "grad_norm": 1.8518837690353394, + "learning_rate": 1.4577002311667067e-05, + "loss": 1.7454, + "step": 32170 + }, + { + "epoch": 1.1440760821260332, + "grad_norm": 1.8241009712219238, + "learning_rate": 1.457351745946666e-05, + "loss": 1.7187, + "step": 32180 + }, + { + "epoch": 1.1444316060794595, + "grad_norm": 1.8031975030899048, + "learning_rate": 1.4570031904802643e-05, + "loss": 1.6595, + "step": 32190 + }, + { + "epoch": 1.144787130032886, + "grad_norm": 1.8305586576461792, + "learning_rate": 1.4566545648210382e-05, + "loss": 1.7109, + "step": 32200 + }, + { + "epoch": 1.1451426539863123, + "grad_norm": 1.798540711402893, + "learning_rate": 1.4563058690225344e-05, + "loss": 1.6938, + "step": 32210 + }, + { + "epoch": 1.1454981779397386, + "grad_norm": 1.7197037935256958, + "learning_rate": 1.45595710313831e-05, + "loss": 1.7193, + "step": 32220 + }, + { + "epoch": 1.1458537018931652, + "grad_norm": 1.7387158870697021, + "learning_rate": 1.4556082672219333e-05, + "loss": 1.6565, + "step": 32230 + }, + { + "epoch": 1.1462092258465915, + "grad_norm": 1.7779680490493774, + "learning_rate": 1.4552593613269839e-05, + "loss": 1.6162, + "step": 32240 + }, + { + "epoch": 1.1465647498000178, + "grad_norm": 1.738901138305664, + "learning_rate": 1.4549103855070508e-05, + "loss": 1.7132, + "step": 32250 + }, + { + "epoch": 1.146920273753444, + "grad_norm": 1.907164454460144, + "learning_rate": 1.4545613398157346e-05, + "loss": 1.7251, + "step": 32260 + }, + { + "epoch": 1.1472757977068704, + "grad_norm": 1.7884010076522827, + "learning_rate": 1.4542122243066468e-05, + "loss": 1.6939, + "step": 32270 + }, + { + "epoch": 1.147631321660297, + "grad_norm": 1.812320590019226, + "learning_rate": 1.4538630390334094e-05, + "loss": 1.6683, + "step": 32280 + }, + { + "epoch": 1.1479868456137232, + "grad_norm": 1.7077202796936035, + "learning_rate": 1.4535137840496552e-05, + "loss": 1.6924, + "step": 32290 + }, + { + "epoch": 1.1483423695671495, + "grad_norm": 1.8116544485092163, + "learning_rate": 1.4531644594090271e-05, + "loss": 1.7156, + "step": 32300 + }, + { + "epoch": 1.148697893520576, + "grad_norm": 1.7970168590545654, + "learning_rate": 1.4528150651651793e-05, + "loss": 1.6802, + "step": 32310 + }, + { + "epoch": 1.1490534174740024, + "grad_norm": 1.7727383375167847, + "learning_rate": 1.4524656013717766e-05, + "loss": 1.7092, + "step": 32320 + }, + { + "epoch": 1.1494089414274287, + "grad_norm": 1.8042786121368408, + "learning_rate": 1.4521160680824945e-05, + "loss": 1.6947, + "step": 32330 + }, + { + "epoch": 1.149764465380855, + "grad_norm": 1.8238158226013184, + "learning_rate": 1.4517664653510193e-05, + "loss": 1.6564, + "step": 32340 + }, + { + "epoch": 1.1501199893342813, + "grad_norm": 1.8085988759994507, + "learning_rate": 1.4514167932310477e-05, + "loss": 1.7017, + "step": 32350 + }, + { + "epoch": 1.1504755132877078, + "grad_norm": 1.7852269411087036, + "learning_rate": 1.451067051776287e-05, + "loss": 1.7279, + "step": 32360 + }, + { + "epoch": 1.1508310372411341, + "grad_norm": 1.7860541343688965, + "learning_rate": 1.4507172410404553e-05, + "loss": 1.7226, + "step": 32370 + }, + { + "epoch": 1.1511865611945604, + "grad_norm": 1.9089226722717285, + "learning_rate": 1.4503673610772815e-05, + "loss": 1.6818, + "step": 32380 + }, + { + "epoch": 1.151542085147987, + "grad_norm": 1.7474654912948608, + "learning_rate": 1.4500174119405046e-05, + "loss": 1.6729, + "step": 32390 + }, + { + "epoch": 1.1518976091014133, + "grad_norm": 1.7584805488586426, + "learning_rate": 1.449667393683875e-05, + "loss": 1.6835, + "step": 32400 + }, + { + "epoch": 1.1522531330548396, + "grad_norm": 1.6684837341308594, + "learning_rate": 1.4493173063611532e-05, + "loss": 1.6795, + "step": 32410 + }, + { + "epoch": 1.1526086570082659, + "grad_norm": 1.8473206758499146, + "learning_rate": 1.4489671500261105e-05, + "loss": 1.6893, + "step": 32420 + }, + { + "epoch": 1.1529641809616922, + "grad_norm": 1.8049877882003784, + "learning_rate": 1.4486169247325283e-05, + "loss": 1.6762, + "step": 32430 + }, + { + "epoch": 1.1533197049151187, + "grad_norm": 1.6902273893356323, + "learning_rate": 1.4482666305341994e-05, + "loss": 1.6834, + "step": 32440 + }, + { + "epoch": 1.153675228868545, + "grad_norm": 1.811184287071228, + "learning_rate": 1.447916267484927e-05, + "loss": 1.6907, + "step": 32450 + }, + { + "epoch": 1.1540307528219713, + "grad_norm": 1.7988183498382568, + "learning_rate": 1.4475658356385243e-05, + "loss": 1.6991, + "step": 32460 + }, + { + "epoch": 1.1543862767753978, + "grad_norm": 1.783347487449646, + "learning_rate": 1.4472153350488152e-05, + "loss": 1.6674, + "step": 32470 + }, + { + "epoch": 1.1547418007288242, + "grad_norm": 1.907630205154419, + "learning_rate": 1.4468647657696351e-05, + "loss": 1.6922, + "step": 32480 + }, + { + "epoch": 1.1550973246822505, + "grad_norm": 1.874877691268921, + "learning_rate": 1.4465141278548284e-05, + "loss": 1.7199, + "step": 32490 + }, + { + "epoch": 1.1554528486356768, + "grad_norm": 1.783103108406067, + "learning_rate": 1.4461634213582516e-05, + "loss": 1.6528, + "step": 32500 + }, + { + "epoch": 1.155808372589103, + "grad_norm": 1.8129419088363647, + "learning_rate": 1.4458126463337707e-05, + "loss": 1.7292, + "step": 32510 + }, + { + "epoch": 1.1561638965425296, + "grad_norm": 1.7467507123947144, + "learning_rate": 1.4454618028352623e-05, + "loss": 1.6573, + "step": 32520 + }, + { + "epoch": 1.156519420495956, + "grad_norm": 1.7516041994094849, + "learning_rate": 1.4451108909166144e-05, + "loss": 1.6471, + "step": 32530 + }, + { + "epoch": 1.1568749444493822, + "grad_norm": 1.7575526237487793, + "learning_rate": 1.4447599106317245e-05, + "loss": 1.7071, + "step": 32540 + }, + { + "epoch": 1.1572304684028087, + "grad_norm": 1.7154433727264404, + "learning_rate": 1.4444088620345011e-05, + "loss": 1.6896, + "step": 32550 + }, + { + "epoch": 1.157585992356235, + "grad_norm": 1.7868378162384033, + "learning_rate": 1.4440577451788627e-05, + "loss": 1.6911, + "step": 32560 + }, + { + "epoch": 1.1579415163096614, + "grad_norm": 1.8467434644699097, + "learning_rate": 1.443706560118739e-05, + "loss": 1.6923, + "step": 32570 + }, + { + "epoch": 1.1582970402630877, + "grad_norm": 1.8598535060882568, + "learning_rate": 1.4433553069080697e-05, + "loss": 1.6951, + "step": 32580 + }, + { + "epoch": 1.158652564216514, + "grad_norm": 1.8078055381774902, + "learning_rate": 1.4430039856008052e-05, + "loss": 1.7245, + "step": 32590 + }, + { + "epoch": 1.1590080881699405, + "grad_norm": 1.7683792114257812, + "learning_rate": 1.442652596250906e-05, + "loss": 1.7172, + "step": 32600 + }, + { + "epoch": 1.1593636121233668, + "grad_norm": 1.7138365507125854, + "learning_rate": 1.4423011389123438e-05, + "loss": 1.6751, + "step": 32610 + }, + { + "epoch": 1.1597191360767931, + "grad_norm": 1.7290624380111694, + "learning_rate": 1.4419496136390997e-05, + "loss": 1.696, + "step": 32620 + }, + { + "epoch": 1.1600746600302196, + "grad_norm": 1.7704051733016968, + "learning_rate": 1.4415980204851661e-05, + "loss": 1.6506, + "step": 32630 + }, + { + "epoch": 1.160430183983646, + "grad_norm": 1.7916181087493896, + "learning_rate": 1.441246359504545e-05, + "loss": 1.6791, + "step": 32640 + }, + { + "epoch": 1.1607857079370723, + "grad_norm": 1.8721511363983154, + "learning_rate": 1.4408946307512502e-05, + "loss": 1.7063, + "step": 32650 + }, + { + "epoch": 1.1611412318904986, + "grad_norm": 1.9711841344833374, + "learning_rate": 1.4405428342793042e-05, + "loss": 1.7405, + "step": 32660 + }, + { + "epoch": 1.1614967558439249, + "grad_norm": 1.9350999593734741, + "learning_rate": 1.4401909701427412e-05, + "loss": 1.7047, + "step": 32670 + }, + { + "epoch": 1.1618522797973514, + "grad_norm": 1.8050715923309326, + "learning_rate": 1.439839038395605e-05, + "loss": 1.7093, + "step": 32680 + }, + { + "epoch": 1.1622078037507777, + "grad_norm": 1.8852133750915527, + "learning_rate": 1.4394870390919508e-05, + "loss": 1.642, + "step": 32690 + }, + { + "epoch": 1.162563327704204, + "grad_norm": 1.8427294492721558, + "learning_rate": 1.4391349722858428e-05, + "loss": 1.6879, + "step": 32700 + }, + { + "epoch": 1.1629188516576305, + "grad_norm": 1.7559659481048584, + "learning_rate": 1.4387828380313565e-05, + "loss": 1.6581, + "step": 32710 + }, + { + "epoch": 1.1632743756110568, + "grad_norm": 1.7213876247406006, + "learning_rate": 1.4384306363825772e-05, + "loss": 1.6987, + "step": 32720 + }, + { + "epoch": 1.1636298995644831, + "grad_norm": 1.7299596071243286, + "learning_rate": 1.4380783673936015e-05, + "loss": 1.6892, + "step": 32730 + }, + { + "epoch": 1.1639854235179095, + "grad_norm": 1.7985115051269531, + "learning_rate": 1.437726031118535e-05, + "loss": 1.7113, + "step": 32740 + }, + { + "epoch": 1.1643409474713358, + "grad_norm": 1.757331371307373, + "learning_rate": 1.4373736276114947e-05, + "loss": 1.7089, + "step": 32750 + }, + { + "epoch": 1.1646964714247623, + "grad_norm": 1.818609356880188, + "learning_rate": 1.4370211569266077e-05, + "loss": 1.6683, + "step": 32760 + }, + { + "epoch": 1.1650519953781886, + "grad_norm": 1.9024662971496582, + "learning_rate": 1.4366686191180113e-05, + "loss": 1.7072, + "step": 32770 + }, + { + "epoch": 1.165407519331615, + "grad_norm": 1.7936043739318848, + "learning_rate": 1.4363160142398526e-05, + "loss": 1.7066, + "step": 32780 + }, + { + "epoch": 1.1657630432850414, + "grad_norm": 1.6970324516296387, + "learning_rate": 1.4359633423462901e-05, + "loss": 1.6845, + "step": 32790 + }, + { + "epoch": 1.1661185672384677, + "grad_norm": 1.8118795156478882, + "learning_rate": 1.4356106034914916e-05, + "loss": 1.7008, + "step": 32800 + }, + { + "epoch": 1.166474091191894, + "grad_norm": 1.7801399230957031, + "learning_rate": 1.4352577977296358e-05, + "loss": 1.6738, + "step": 32810 + }, + { + "epoch": 1.1668296151453204, + "grad_norm": 1.745764136314392, + "learning_rate": 1.434904925114911e-05, + "loss": 1.6884, + "step": 32820 + }, + { + "epoch": 1.1671851390987467, + "grad_norm": 1.8358365297317505, + "learning_rate": 1.4345519857015168e-05, + "loss": 1.707, + "step": 32830 + }, + { + "epoch": 1.1675406630521732, + "grad_norm": 1.7131662368774414, + "learning_rate": 1.4341989795436624e-05, + "loss": 1.6752, + "step": 32840 + }, + { + "epoch": 1.1678961870055995, + "grad_norm": 1.749521017074585, + "learning_rate": 1.4338459066955672e-05, + "loss": 1.6865, + "step": 32850 + }, + { + "epoch": 1.1682517109590258, + "grad_norm": 1.747633934020996, + "learning_rate": 1.4334927672114609e-05, + "loss": 1.6788, + "step": 32860 + }, + { + "epoch": 1.1686072349124523, + "grad_norm": 1.7349754571914673, + "learning_rate": 1.4331395611455837e-05, + "loss": 1.7481, + "step": 32870 + }, + { + "epoch": 1.1689627588658786, + "grad_norm": 1.705256700515747, + "learning_rate": 1.4327862885521855e-05, + "loss": 1.6782, + "step": 32880 + }, + { + "epoch": 1.169318282819305, + "grad_norm": 1.8096204996109009, + "learning_rate": 1.432432949485527e-05, + "loss": 1.6504, + "step": 32890 + }, + { + "epoch": 1.1696738067727313, + "grad_norm": 1.8315744400024414, + "learning_rate": 1.4320795439998788e-05, + "loss": 1.7399, + "step": 32900 + }, + { + "epoch": 1.1700293307261576, + "grad_norm": 1.6125462055206299, + "learning_rate": 1.4317260721495219e-05, + "loss": 1.687, + "step": 32910 + }, + { + "epoch": 1.170384854679584, + "grad_norm": 1.7470366954803467, + "learning_rate": 1.4313725339887472e-05, + "loss": 1.6939, + "step": 32920 + }, + { + "epoch": 1.1707403786330104, + "grad_norm": 1.6082324981689453, + "learning_rate": 1.4310189295718562e-05, + "loss": 1.6882, + "step": 32930 + }, + { + "epoch": 1.1710959025864367, + "grad_norm": 1.7428748607635498, + "learning_rate": 1.4306652589531597e-05, + "loss": 1.6608, + "step": 32940 + }, + { + "epoch": 1.1714514265398632, + "grad_norm": 1.682776927947998, + "learning_rate": 1.43031152218698e-05, + "loss": 1.7281, + "step": 32950 + }, + { + "epoch": 1.1718069504932895, + "grad_norm": 1.8763965368270874, + "learning_rate": 1.4299577193276486e-05, + "loss": 1.7494, + "step": 32960 + }, + { + "epoch": 1.1721624744467158, + "grad_norm": 1.809517741203308, + "learning_rate": 1.429603850429507e-05, + "loss": 1.7082, + "step": 32970 + }, + { + "epoch": 1.1725179984001421, + "grad_norm": 1.7316521406173706, + "learning_rate": 1.4292499155469082e-05, + "loss": 1.6768, + "step": 32980 + }, + { + "epoch": 1.1728735223535685, + "grad_norm": 1.684985637664795, + "learning_rate": 1.4288959147342136e-05, + "loss": 1.7036, + "step": 32990 + }, + { + "epoch": 1.173229046306995, + "grad_norm": 1.8130797147750854, + "learning_rate": 1.4285418480457955e-05, + "loss": 1.6871, + "step": 33000 + }, + { + "epoch": 1.1735845702604213, + "grad_norm": 1.7257000207901, + "learning_rate": 1.4281877155360366e-05, + "loss": 1.6493, + "step": 33010 + }, + { + "epoch": 1.1739400942138476, + "grad_norm": 1.8485682010650635, + "learning_rate": 1.4278335172593294e-05, + "loss": 1.7094, + "step": 33020 + }, + { + "epoch": 1.1742956181672741, + "grad_norm": 1.8036755323410034, + "learning_rate": 1.4274792532700764e-05, + "loss": 1.6827, + "step": 33030 + }, + { + "epoch": 1.1746511421207004, + "grad_norm": 1.7438093423843384, + "learning_rate": 1.4271249236226907e-05, + "loss": 1.7338, + "step": 33040 + }, + { + "epoch": 1.1750066660741267, + "grad_norm": 1.8052996397018433, + "learning_rate": 1.4267705283715945e-05, + "loss": 1.66, + "step": 33050 + }, + { + "epoch": 1.175362190027553, + "grad_norm": 1.700236439704895, + "learning_rate": 1.4264160675712211e-05, + "loss": 1.6961, + "step": 33060 + }, + { + "epoch": 1.1757177139809794, + "grad_norm": 1.7625881433486938, + "learning_rate": 1.4260615412760132e-05, + "loss": 1.6751, + "step": 33070 + }, + { + "epoch": 1.1760732379344059, + "grad_norm": 1.8093489408493042, + "learning_rate": 1.425706949540424e-05, + "loss": 1.6533, + "step": 33080 + }, + { + "epoch": 1.1764287618878322, + "grad_norm": 1.8260650634765625, + "learning_rate": 1.4253522924189172e-05, + "loss": 1.6723, + "step": 33090 + }, + { + "epoch": 1.1767842858412585, + "grad_norm": 1.8158904314041138, + "learning_rate": 1.4249975699659646e-05, + "loss": 1.7197, + "step": 33100 + }, + { + "epoch": 1.177139809794685, + "grad_norm": 1.719862937927246, + "learning_rate": 1.4246427822360502e-05, + "loss": 1.7096, + "step": 33110 + }, + { + "epoch": 1.1774953337481113, + "grad_norm": 1.8489018678665161, + "learning_rate": 1.424287929283667e-05, + "loss": 1.6783, + "step": 33120 + }, + { + "epoch": 1.1778508577015376, + "grad_norm": 1.748596429824829, + "learning_rate": 1.4239330111633182e-05, + "loss": 1.7238, + "step": 33130 + }, + { + "epoch": 1.178206381654964, + "grad_norm": 1.8104336261749268, + "learning_rate": 1.4235780279295168e-05, + "loss": 1.6868, + "step": 33140 + }, + { + "epoch": 1.1785619056083902, + "grad_norm": 1.733096957206726, + "learning_rate": 1.4232229796367863e-05, + "loss": 1.6899, + "step": 33150 + }, + { + "epoch": 1.1789174295618168, + "grad_norm": 1.818442940711975, + "learning_rate": 1.4228678663396599e-05, + "loss": 1.741, + "step": 33160 + }, + { + "epoch": 1.179272953515243, + "grad_norm": 1.8043625354766846, + "learning_rate": 1.4225126880926804e-05, + "loss": 1.6767, + "step": 33170 + }, + { + "epoch": 1.1796284774686694, + "grad_norm": 1.9023876190185547, + "learning_rate": 1.4221574449504014e-05, + "loss": 1.6426, + "step": 33180 + }, + { + "epoch": 1.179984001422096, + "grad_norm": 1.7926504611968994, + "learning_rate": 1.4218021369673856e-05, + "loss": 1.7023, + "step": 33190 + }, + { + "epoch": 1.1803395253755222, + "grad_norm": 1.7633355855941772, + "learning_rate": 1.4214467641982062e-05, + "loss": 1.7391, + "step": 33200 + }, + { + "epoch": 1.1806950493289485, + "grad_norm": 1.8657419681549072, + "learning_rate": 1.4210913266974465e-05, + "loss": 1.7346, + "step": 33210 + }, + { + "epoch": 1.1810505732823748, + "grad_norm": 1.7032383680343628, + "learning_rate": 1.420735824519699e-05, + "loss": 1.6766, + "step": 33220 + }, + { + "epoch": 1.1814060972358011, + "grad_norm": 1.809301733970642, + "learning_rate": 1.4203802577195674e-05, + "loss": 1.7146, + "step": 33230 + }, + { + "epoch": 1.1817616211892277, + "grad_norm": 1.860851764678955, + "learning_rate": 1.4200246263516635e-05, + "loss": 1.7256, + "step": 33240 + }, + { + "epoch": 1.182117145142654, + "grad_norm": 1.9773718118667603, + "learning_rate": 1.419668930470611e-05, + "loss": 1.7246, + "step": 33250 + }, + { + "epoch": 1.1824726690960803, + "grad_norm": 1.7748452425003052, + "learning_rate": 1.4193131701310418e-05, + "loss": 1.682, + "step": 33260 + }, + { + "epoch": 1.1828281930495068, + "grad_norm": 1.7930796146392822, + "learning_rate": 1.418957345387599e-05, + "loss": 1.7387, + "step": 33270 + }, + { + "epoch": 1.1831837170029331, + "grad_norm": 1.67182195186615, + "learning_rate": 1.4186014562949346e-05, + "loss": 1.6883, + "step": 33280 + }, + { + "epoch": 1.1835392409563594, + "grad_norm": 1.7434818744659424, + "learning_rate": 1.4182455029077113e-05, + "loss": 1.6746, + "step": 33290 + }, + { + "epoch": 1.1838947649097857, + "grad_norm": 1.7789760828018188, + "learning_rate": 1.4178894852806013e-05, + "loss": 1.6776, + "step": 33300 + }, + { + "epoch": 1.184250288863212, + "grad_norm": 1.6772903203964233, + "learning_rate": 1.4175334034682864e-05, + "loss": 1.7219, + "step": 33310 + }, + { + "epoch": 1.1846058128166386, + "grad_norm": 1.7272416353225708, + "learning_rate": 1.4171772575254585e-05, + "loss": 1.665, + "step": 33320 + }, + { + "epoch": 1.1849613367700649, + "grad_norm": 1.8471630811691284, + "learning_rate": 1.41682104750682e-05, + "loss": 1.7101, + "step": 33330 + }, + { + "epoch": 1.1853168607234912, + "grad_norm": 1.7045025825500488, + "learning_rate": 1.4164647734670818e-05, + "loss": 1.7075, + "step": 33340 + }, + { + "epoch": 1.1856723846769177, + "grad_norm": 1.8781580924987793, + "learning_rate": 1.4161084354609657e-05, + "loss": 1.6975, + "step": 33350 + }, + { + "epoch": 1.186027908630344, + "grad_norm": 1.8737696409225464, + "learning_rate": 1.415752033543203e-05, + "loss": 1.7216, + "step": 33360 + }, + { + "epoch": 1.1863834325837703, + "grad_norm": 1.8052611351013184, + "learning_rate": 1.4153955677685347e-05, + "loss": 1.6939, + "step": 33370 + }, + { + "epoch": 1.1867389565371966, + "grad_norm": 1.688539981842041, + "learning_rate": 1.4150390381917115e-05, + "loss": 1.677, + "step": 33380 + }, + { + "epoch": 1.187094480490623, + "grad_norm": 1.8261302709579468, + "learning_rate": 1.4146824448674945e-05, + "loss": 1.6752, + "step": 33390 + }, + { + "epoch": 1.1874500044440495, + "grad_norm": 1.8232522010803223, + "learning_rate": 1.4143257878506541e-05, + "loss": 1.7094, + "step": 33400 + }, + { + "epoch": 1.1878055283974758, + "grad_norm": 1.7392261028289795, + "learning_rate": 1.4139690671959708e-05, + "loss": 1.6903, + "step": 33410 + }, + { + "epoch": 1.188161052350902, + "grad_norm": 1.805082082748413, + "learning_rate": 1.413612282958234e-05, + "loss": 1.6825, + "step": 33420 + }, + { + "epoch": 1.1885165763043286, + "grad_norm": 1.7442923784255981, + "learning_rate": 1.4132554351922444e-05, + "loss": 1.6796, + "step": 33430 + }, + { + "epoch": 1.188872100257755, + "grad_norm": 1.7606703042984009, + "learning_rate": 1.4128985239528104e-05, + "loss": 1.7187, + "step": 33440 + }, + { + "epoch": 1.1892276242111812, + "grad_norm": 1.6859506368637085, + "learning_rate": 1.4125415492947523e-05, + "loss": 1.6878, + "step": 33450 + }, + { + "epoch": 1.1895831481646075, + "grad_norm": 1.7987312078475952, + "learning_rate": 1.412184511272899e-05, + "loss": 1.6963, + "step": 33460 + }, + { + "epoch": 1.1899386721180338, + "grad_norm": 1.8057392835617065, + "learning_rate": 1.4118274099420893e-05, + "loss": 1.7036, + "step": 33470 + }, + { + "epoch": 1.1902941960714604, + "grad_norm": 1.7178395986557007, + "learning_rate": 1.4114702453571711e-05, + "loss": 1.6923, + "step": 33480 + }, + { + "epoch": 1.1906497200248867, + "grad_norm": 1.8137010335922241, + "learning_rate": 1.4111130175730038e-05, + "loss": 1.7145, + "step": 33490 + }, + { + "epoch": 1.191005243978313, + "grad_norm": 1.8016866445541382, + "learning_rate": 1.4107557266444543e-05, + "loss": 1.656, + "step": 33500 + }, + { + "epoch": 1.1913607679317395, + "grad_norm": 1.936647891998291, + "learning_rate": 1.4103983726264005e-05, + "loss": 1.6887, + "step": 33510 + }, + { + "epoch": 1.1917162918851658, + "grad_norm": 1.8246115446090698, + "learning_rate": 1.41004095557373e-05, + "loss": 1.6944, + "step": 33520 + }, + { + "epoch": 1.1920718158385921, + "grad_norm": 1.708095908164978, + "learning_rate": 1.40968347554134e-05, + "loss": 1.6923, + "step": 33530 + }, + { + "epoch": 1.1924273397920184, + "grad_norm": 1.852388620376587, + "learning_rate": 1.4093259325841366e-05, + "loss": 1.7221, + "step": 33540 + }, + { + "epoch": 1.1927828637454447, + "grad_norm": 1.8310407400131226, + "learning_rate": 1.4089683267570366e-05, + "loss": 1.6939, + "step": 33550 + }, + { + "epoch": 1.1931383876988713, + "grad_norm": 1.8411918878555298, + "learning_rate": 1.4086106581149656e-05, + "loss": 1.6536, + "step": 33560 + }, + { + "epoch": 1.1934939116522976, + "grad_norm": 1.9055877923965454, + "learning_rate": 1.40825292671286e-05, + "loss": 1.7192, + "step": 33570 + }, + { + "epoch": 1.1938494356057239, + "grad_norm": 1.8024219274520874, + "learning_rate": 1.4078951326056642e-05, + "loss": 1.7258, + "step": 33580 + }, + { + "epoch": 1.1942049595591504, + "grad_norm": 1.8253586292266846, + "learning_rate": 1.4075372758483336e-05, + "loss": 1.7017, + "step": 33590 + }, + { + "epoch": 1.1945604835125767, + "grad_norm": 1.7866944074630737, + "learning_rate": 1.4071793564958332e-05, + "loss": 1.7364, + "step": 33600 + }, + { + "epoch": 1.194916007466003, + "grad_norm": 1.7497050762176514, + "learning_rate": 1.406821374603136e-05, + "loss": 1.7286, + "step": 33610 + }, + { + "epoch": 1.1952715314194293, + "grad_norm": 1.8349730968475342, + "learning_rate": 1.4064633302252268e-05, + "loss": 1.7375, + "step": 33620 + }, + { + "epoch": 1.1956270553728556, + "grad_norm": 1.8275095224380493, + "learning_rate": 1.4061052234170985e-05, + "loss": 1.7206, + "step": 33630 + }, + { + "epoch": 1.1959825793262822, + "grad_norm": 1.7886320352554321, + "learning_rate": 1.405747054233754e-05, + "loss": 1.6903, + "step": 33640 + }, + { + "epoch": 1.1963381032797085, + "grad_norm": 1.6897066831588745, + "learning_rate": 1.4053888227302065e-05, + "loss": 1.684, + "step": 33650 + }, + { + "epoch": 1.1966936272331348, + "grad_norm": 1.7241648435592651, + "learning_rate": 1.4050305289614777e-05, + "loss": 1.7033, + "step": 33660 + }, + { + "epoch": 1.1970491511865613, + "grad_norm": 1.8050079345703125, + "learning_rate": 1.4046721729825988e-05, + "loss": 1.7016, + "step": 33670 + }, + { + "epoch": 1.1974046751399876, + "grad_norm": 1.7987489700317383, + "learning_rate": 1.4043137548486114e-05, + "loss": 1.7137, + "step": 33680 + }, + { + "epoch": 1.197760199093414, + "grad_norm": 1.7785160541534424, + "learning_rate": 1.4039552746145664e-05, + "loss": 1.7021, + "step": 33690 + }, + { + "epoch": 1.1981157230468402, + "grad_norm": 1.8410561084747314, + "learning_rate": 1.4035967323355241e-05, + "loss": 1.6731, + "step": 33700 + }, + { + "epoch": 1.1984712470002665, + "grad_norm": 1.703798770904541, + "learning_rate": 1.4032381280665544e-05, + "loss": 1.7263, + "step": 33710 + }, + { + "epoch": 1.198826770953693, + "grad_norm": 1.783011555671692, + "learning_rate": 1.4028794618627364e-05, + "loss": 1.7026, + "step": 33720 + }, + { + "epoch": 1.1991822949071194, + "grad_norm": 1.7746280431747437, + "learning_rate": 1.4025207337791593e-05, + "loss": 1.6962, + "step": 33730 + }, + { + "epoch": 1.1995378188605457, + "grad_norm": 1.820473313331604, + "learning_rate": 1.402161943870921e-05, + "loss": 1.7047, + "step": 33740 + }, + { + "epoch": 1.1998933428139722, + "grad_norm": 1.762587308883667, + "learning_rate": 1.4018030921931301e-05, + "loss": 1.6823, + "step": 33750 + }, + { + "epoch": 1.2002488667673985, + "grad_norm": 1.772749423980713, + "learning_rate": 1.4014441788009031e-05, + "loss": 1.698, + "step": 33760 + }, + { + "epoch": 1.2006043907208248, + "grad_norm": 1.96071457862854, + "learning_rate": 1.4010852037493677e-05, + "loss": 1.6903, + "step": 33770 + }, + { + "epoch": 1.2009599146742511, + "grad_norm": 1.803092122077942, + "learning_rate": 1.4007261670936599e-05, + "loss": 1.709, + "step": 33780 + }, + { + "epoch": 1.2013154386276774, + "grad_norm": 1.7847096920013428, + "learning_rate": 1.400367068888925e-05, + "loss": 1.6616, + "step": 33790 + }, + { + "epoch": 1.201670962581104, + "grad_norm": 1.7999202013015747, + "learning_rate": 1.4000079091903187e-05, + "loss": 1.7288, + "step": 33800 + }, + { + "epoch": 1.2020264865345303, + "grad_norm": 1.7760398387908936, + "learning_rate": 1.399648688053006e-05, + "loss": 1.7137, + "step": 33810 + }, + { + "epoch": 1.2023820104879566, + "grad_norm": 1.9159443378448486, + "learning_rate": 1.3992894055321604e-05, + "loss": 1.7019, + "step": 33820 + }, + { + "epoch": 1.202737534441383, + "grad_norm": 1.7627336978912354, + "learning_rate": 1.3989300616829655e-05, + "loss": 1.7146, + "step": 33830 + }, + { + "epoch": 1.2030930583948094, + "grad_norm": 1.7011263370513916, + "learning_rate": 1.3985706565606146e-05, + "loss": 1.6801, + "step": 33840 + }, + { + "epoch": 1.2034485823482357, + "grad_norm": 1.8217169046401978, + "learning_rate": 1.3982111902203101e-05, + "loss": 1.6935, + "step": 33850 + }, + { + "epoch": 1.203804106301662, + "grad_norm": 1.8142902851104736, + "learning_rate": 1.397851662717263e-05, + "loss": 1.7484, + "step": 33860 + }, + { + "epoch": 1.2041596302550883, + "grad_norm": 1.8981884717941284, + "learning_rate": 1.3974920741066958e-05, + "loss": 1.7154, + "step": 33870 + }, + { + "epoch": 1.2045151542085148, + "grad_norm": 1.6906952857971191, + "learning_rate": 1.3971324244438377e-05, + "loss": 1.6937, + "step": 33880 + }, + { + "epoch": 1.2048706781619412, + "grad_norm": 1.6254799365997314, + "learning_rate": 1.3967727137839297e-05, + "loss": 1.7114, + "step": 33890 + }, + { + "epoch": 1.2052262021153675, + "grad_norm": 1.7021291255950928, + "learning_rate": 1.3964129421822203e-05, + "loss": 1.6794, + "step": 33900 + }, + { + "epoch": 1.205581726068794, + "grad_norm": 1.829317569732666, + "learning_rate": 1.396053109693969e-05, + "loss": 1.691, + "step": 33910 + }, + { + "epoch": 1.2059372500222203, + "grad_norm": 1.8096377849578857, + "learning_rate": 1.395693216374443e-05, + "loss": 1.6996, + "step": 33920 + }, + { + "epoch": 1.2062927739756466, + "grad_norm": 1.5986207723617554, + "learning_rate": 1.3953332622789197e-05, + "loss": 1.714, + "step": 33930 + }, + { + "epoch": 1.206648297929073, + "grad_norm": 1.7374706268310547, + "learning_rate": 1.3949732474626862e-05, + "loss": 1.6701, + "step": 33940 + }, + { + "epoch": 1.2070038218824992, + "grad_norm": 1.7917215824127197, + "learning_rate": 1.3946131719810386e-05, + "loss": 1.7326, + "step": 33950 + }, + { + "epoch": 1.2073593458359257, + "grad_norm": 1.7431756258010864, + "learning_rate": 1.3942530358892821e-05, + "loss": 1.7173, + "step": 33960 + }, + { + "epoch": 1.207714869789352, + "grad_norm": 1.858600378036499, + "learning_rate": 1.3938928392427313e-05, + "loss": 1.7309, + "step": 33970 + }, + { + "epoch": 1.2080703937427784, + "grad_norm": 1.8365451097488403, + "learning_rate": 1.3935325820967098e-05, + "loss": 1.6759, + "step": 33980 + }, + { + "epoch": 1.2084259176962049, + "grad_norm": 1.8952609300613403, + "learning_rate": 1.3931722645065513e-05, + "loss": 1.6591, + "step": 33990 + }, + { + "epoch": 1.2087814416496312, + "grad_norm": 1.8191903829574585, + "learning_rate": 1.3928118865275981e-05, + "loss": 1.7209, + "step": 34000 + }, + { + "epoch": 1.2091369656030575, + "grad_norm": 1.7062873840332031, + "learning_rate": 1.3924514482152023e-05, + "loss": 1.751, + "step": 34010 + }, + { + "epoch": 1.2094924895564838, + "grad_norm": 2.049180030822754, + "learning_rate": 1.3920909496247243e-05, + "loss": 1.6886, + "step": 34020 + }, + { + "epoch": 1.2098480135099101, + "grad_norm": 1.9235094785690308, + "learning_rate": 1.3917303908115356e-05, + "loss": 1.6867, + "step": 34030 + }, + { + "epoch": 1.2102035374633366, + "grad_norm": 1.7062004804611206, + "learning_rate": 1.3913697718310144e-05, + "loss": 1.674, + "step": 34040 + }, + { + "epoch": 1.210559061416763, + "grad_norm": 1.7808175086975098, + "learning_rate": 1.3910090927385507e-05, + "loss": 1.7013, + "step": 34050 + }, + { + "epoch": 1.2109145853701893, + "grad_norm": 1.811781644821167, + "learning_rate": 1.3906483535895414e-05, + "loss": 1.6935, + "step": 34060 + }, + { + "epoch": 1.2112701093236158, + "grad_norm": 1.755676507949829, + "learning_rate": 1.3902875544393947e-05, + "loss": 1.6571, + "step": 34070 + }, + { + "epoch": 1.211625633277042, + "grad_norm": 1.848569631576538, + "learning_rate": 1.3899266953435266e-05, + "loss": 1.7062, + "step": 34080 + }, + { + "epoch": 1.2119811572304684, + "grad_norm": 1.7534435987472534, + "learning_rate": 1.3895657763573631e-05, + "loss": 1.7107, + "step": 34090 + }, + { + "epoch": 1.2123366811838947, + "grad_norm": 1.728899359703064, + "learning_rate": 1.389204797536339e-05, + "loss": 1.6623, + "step": 34100 + }, + { + "epoch": 1.212692205137321, + "grad_norm": 1.7825510501861572, + "learning_rate": 1.3888437589358982e-05, + "loss": 1.6603, + "step": 34110 + }, + { + "epoch": 1.2130477290907475, + "grad_norm": 1.7410731315612793, + "learning_rate": 1.3884826606114941e-05, + "loss": 1.6778, + "step": 34120 + }, + { + "epoch": 1.2134032530441738, + "grad_norm": 1.7989734411239624, + "learning_rate": 1.3881215026185893e-05, + "loss": 1.6652, + "step": 34130 + }, + { + "epoch": 1.2137587769976002, + "grad_norm": 1.8484128713607788, + "learning_rate": 1.387760285012655e-05, + "loss": 1.6831, + "step": 34140 + }, + { + "epoch": 1.2141143009510267, + "grad_norm": 1.8029778003692627, + "learning_rate": 1.3873990078491723e-05, + "loss": 1.6536, + "step": 34150 + }, + { + "epoch": 1.214469824904453, + "grad_norm": 1.713118553161621, + "learning_rate": 1.387037671183631e-05, + "loss": 1.68, + "step": 34160 + }, + { + "epoch": 1.2148253488578793, + "grad_norm": 1.7993403673171997, + "learning_rate": 1.3866762750715303e-05, + "loss": 1.7248, + "step": 34170 + }, + { + "epoch": 1.2151808728113056, + "grad_norm": 1.9654228687286377, + "learning_rate": 1.386314819568378e-05, + "loss": 1.7362, + "step": 34180 + }, + { + "epoch": 1.215536396764732, + "grad_norm": 1.859349012374878, + "learning_rate": 1.3859533047296916e-05, + "loss": 1.6999, + "step": 34190 + }, + { + "epoch": 1.2158919207181584, + "grad_norm": 1.712769627571106, + "learning_rate": 1.3855917306109976e-05, + "loss": 1.689, + "step": 34200 + }, + { + "epoch": 1.2162474446715847, + "grad_norm": 1.7936985492706299, + "learning_rate": 1.3852300972678316e-05, + "loss": 1.6774, + "step": 34210 + }, + { + "epoch": 1.216602968625011, + "grad_norm": 1.7569528818130493, + "learning_rate": 1.3848684047557384e-05, + "loss": 1.6438, + "step": 34220 + }, + { + "epoch": 1.2169584925784376, + "grad_norm": 1.7265123128890991, + "learning_rate": 1.3845066531302708e-05, + "loss": 1.7195, + "step": 34230 + }, + { + "epoch": 1.2173140165318639, + "grad_norm": 1.661879062652588, + "learning_rate": 1.3841448424469923e-05, + "loss": 1.7101, + "step": 34240 + }, + { + "epoch": 1.2176695404852902, + "grad_norm": 1.9059056043624878, + "learning_rate": 1.3837829727614745e-05, + "loss": 1.6865, + "step": 34250 + }, + { + "epoch": 1.2180250644387165, + "grad_norm": 1.8204132318496704, + "learning_rate": 1.3834210441292986e-05, + "loss": 1.6746, + "step": 34260 + }, + { + "epoch": 1.2183805883921428, + "grad_norm": 1.8100894689559937, + "learning_rate": 1.3830590566060545e-05, + "loss": 1.6981, + "step": 34270 + }, + { + "epoch": 1.2187361123455693, + "grad_norm": 1.8906034231185913, + "learning_rate": 1.3826970102473407e-05, + "loss": 1.6893, + "step": 34280 + }, + { + "epoch": 1.2190916362989956, + "grad_norm": 1.7277697324752808, + "learning_rate": 1.382334905108766e-05, + "loss": 1.6698, + "step": 34290 + }, + { + "epoch": 1.219447160252422, + "grad_norm": 1.8846702575683594, + "learning_rate": 1.3819727412459471e-05, + "loss": 1.715, + "step": 34300 + }, + { + "epoch": 1.2198026842058485, + "grad_norm": 1.9642268419265747, + "learning_rate": 1.38161051871451e-05, + "loss": 1.6501, + "step": 34310 + }, + { + "epoch": 1.2201582081592748, + "grad_norm": 1.7906246185302734, + "learning_rate": 1.3812482375700899e-05, + "loss": 1.7008, + "step": 34320 + }, + { + "epoch": 1.220513732112701, + "grad_norm": 1.8596678972244263, + "learning_rate": 1.380885897868331e-05, + "loss": 1.711, + "step": 34330 + }, + { + "epoch": 1.2208692560661274, + "grad_norm": 1.7478065490722656, + "learning_rate": 1.3805234996648867e-05, + "loss": 1.7013, + "step": 34340 + }, + { + "epoch": 1.2212247800195537, + "grad_norm": 1.812665343284607, + "learning_rate": 1.3801610430154182e-05, + "loss": 1.6807, + "step": 34350 + }, + { + "epoch": 1.2215803039729802, + "grad_norm": 1.7723488807678223, + "learning_rate": 1.3797985279755975e-05, + "loss": 1.6957, + "step": 34360 + }, + { + "epoch": 1.2219358279264065, + "grad_norm": 1.8480596542358398, + "learning_rate": 1.3794359546011042e-05, + "loss": 1.7231, + "step": 34370 + }, + { + "epoch": 1.2222913518798328, + "grad_norm": 1.8381192684173584, + "learning_rate": 1.3790733229476272e-05, + "loss": 1.6965, + "step": 34380 + }, + { + "epoch": 1.2226468758332594, + "grad_norm": 1.8478515148162842, + "learning_rate": 1.3787106330708646e-05, + "loss": 1.6672, + "step": 34390 + }, + { + "epoch": 1.2230023997866857, + "grad_norm": 1.7459863424301147, + "learning_rate": 1.3783478850265238e-05, + "loss": 1.6625, + "step": 34400 + }, + { + "epoch": 1.223357923740112, + "grad_norm": 1.8091344833374023, + "learning_rate": 1.3779850788703196e-05, + "loss": 1.6789, + "step": 34410 + }, + { + "epoch": 1.2237134476935383, + "grad_norm": 1.9412217140197754, + "learning_rate": 1.3776222146579772e-05, + "loss": 1.6679, + "step": 34420 + }, + { + "epoch": 1.2240689716469646, + "grad_norm": 1.6879805326461792, + "learning_rate": 1.3772592924452307e-05, + "loss": 1.7144, + "step": 34430 + }, + { + "epoch": 1.2244244956003911, + "grad_norm": 1.8009696006774902, + "learning_rate": 1.3768963122878218e-05, + "loss": 1.7023, + "step": 34440 + }, + { + "epoch": 1.2247800195538174, + "grad_norm": 1.8283684253692627, + "learning_rate": 1.3765332742415031e-05, + "loss": 1.6903, + "step": 34450 + }, + { + "epoch": 1.2251355435072437, + "grad_norm": 1.8491322994232178, + "learning_rate": 1.376170178362034e-05, + "loss": 1.7263, + "step": 34460 + }, + { + "epoch": 1.2254910674606703, + "grad_norm": 1.8187154531478882, + "learning_rate": 1.3758070247051844e-05, + "loss": 1.677, + "step": 34470 + }, + { + "epoch": 1.2258465914140966, + "grad_norm": 1.9667563438415527, + "learning_rate": 1.3754438133267318e-05, + "loss": 1.7341, + "step": 34480 + }, + { + "epoch": 1.2262021153675229, + "grad_norm": 1.81467866897583, + "learning_rate": 1.3750805442824638e-05, + "loss": 1.697, + "step": 34490 + }, + { + "epoch": 1.2265576393209492, + "grad_norm": 1.808092713356018, + "learning_rate": 1.3747172176281755e-05, + "loss": 1.6521, + "step": 34500 + }, + { + "epoch": 1.2269131632743755, + "grad_norm": 1.8158385753631592, + "learning_rate": 1.3743538334196724e-05, + "loss": 1.6757, + "step": 34510 + }, + { + "epoch": 1.227268687227802, + "grad_norm": 1.8510080575942993, + "learning_rate": 1.373990391712768e-05, + "loss": 1.6837, + "step": 34520 + }, + { + "epoch": 1.2276242111812283, + "grad_norm": 1.7220662832260132, + "learning_rate": 1.3736268925632841e-05, + "loss": 1.6921, + "step": 34530 + }, + { + "epoch": 1.2279797351346546, + "grad_norm": 1.6918028593063354, + "learning_rate": 1.3732633360270522e-05, + "loss": 1.7004, + "step": 34540 + }, + { + "epoch": 1.2283352590880812, + "grad_norm": 1.8706375360488892, + "learning_rate": 1.372899722159912e-05, + "loss": 1.6898, + "step": 34550 + }, + { + "epoch": 1.2286907830415075, + "grad_norm": 1.8237711191177368, + "learning_rate": 1.3725360510177127e-05, + "loss": 1.7009, + "step": 34560 + }, + { + "epoch": 1.2290463069949338, + "grad_norm": 1.7138383388519287, + "learning_rate": 1.372172322656312e-05, + "loss": 1.6898, + "step": 34570 + }, + { + "epoch": 1.22940183094836, + "grad_norm": 1.8330506086349487, + "learning_rate": 1.3718085371315756e-05, + "loss": 1.6476, + "step": 34580 + }, + { + "epoch": 1.2297573549017864, + "grad_norm": 2.02473783493042, + "learning_rate": 1.3714446944993798e-05, + "loss": 1.6952, + "step": 34590 + }, + { + "epoch": 1.230112878855213, + "grad_norm": 1.7767274379730225, + "learning_rate": 1.3710807948156076e-05, + "loss": 1.6857, + "step": 34600 + }, + { + "epoch": 1.2304684028086392, + "grad_norm": 1.7729326486587524, + "learning_rate": 1.370716838136152e-05, + "loss": 1.7109, + "step": 34610 + }, + { + "epoch": 1.2308239267620655, + "grad_norm": 1.718795657157898, + "learning_rate": 1.3703528245169145e-05, + "loss": 1.6839, + "step": 34620 + }, + { + "epoch": 1.231179450715492, + "grad_norm": 1.7591612339019775, + "learning_rate": 1.3699887540138052e-05, + "loss": 1.686, + "step": 34630 + }, + { + "epoch": 1.2315349746689184, + "grad_norm": 1.8209606409072876, + "learning_rate": 1.369624626682743e-05, + "loss": 1.6415, + "step": 34640 + }, + { + "epoch": 1.2318904986223447, + "grad_norm": 1.776091456413269, + "learning_rate": 1.3692604425796564e-05, + "loss": 1.7014, + "step": 34650 + }, + { + "epoch": 1.232246022575771, + "grad_norm": 1.828403115272522, + "learning_rate": 1.3688962017604804e-05, + "loss": 1.6566, + "step": 34660 + }, + { + "epoch": 1.2326015465291973, + "grad_norm": 1.7923558950424194, + "learning_rate": 1.368531904281161e-05, + "loss": 1.6052, + "step": 34670 + }, + { + "epoch": 1.2329570704826238, + "grad_norm": 1.8252242803573608, + "learning_rate": 1.3681675501976517e-05, + "loss": 1.7341, + "step": 34680 + }, + { + "epoch": 1.2333125944360501, + "grad_norm": 1.7574840784072876, + "learning_rate": 1.3678031395659152e-05, + "loss": 1.6914, + "step": 34690 + }, + { + "epoch": 1.2336681183894764, + "grad_norm": 1.817844271659851, + "learning_rate": 1.3674386724419227e-05, + "loss": 1.7124, + "step": 34700 + }, + { + "epoch": 1.234023642342903, + "grad_norm": 1.7804555892944336, + "learning_rate": 1.367074148881654e-05, + "loss": 1.6931, + "step": 34710 + }, + { + "epoch": 1.2343791662963293, + "grad_norm": 1.8270343542099, + "learning_rate": 1.3667095689410976e-05, + "loss": 1.7101, + "step": 34720 + }, + { + "epoch": 1.2347346902497556, + "grad_norm": 1.6519355773925781, + "learning_rate": 1.3663449326762505e-05, + "loss": 1.7021, + "step": 34730 + }, + { + "epoch": 1.2350902142031819, + "grad_norm": 1.8363620042800903, + "learning_rate": 1.3659802401431189e-05, + "loss": 1.7059, + "step": 34740 + }, + { + "epoch": 1.2354457381566082, + "grad_norm": 1.9127928018569946, + "learning_rate": 1.3656154913977169e-05, + "loss": 1.6899, + "step": 34750 + }, + { + "epoch": 1.2358012621100347, + "grad_norm": 1.982515573501587, + "learning_rate": 1.3652506864960679e-05, + "loss": 1.7332, + "step": 34760 + }, + { + "epoch": 1.236156786063461, + "grad_norm": 1.9231630563735962, + "learning_rate": 1.3648858254942039e-05, + "loss": 1.6745, + "step": 34770 + }, + { + "epoch": 1.2365123100168873, + "grad_norm": 1.6801555156707764, + "learning_rate": 1.364520908448165e-05, + "loss": 1.6811, + "step": 34780 + }, + { + "epoch": 1.2368678339703139, + "grad_norm": 1.9552178382873535, + "learning_rate": 1.3641559354139999e-05, + "loss": 1.7243, + "step": 34790 + }, + { + "epoch": 1.2372233579237402, + "grad_norm": 1.7672715187072754, + "learning_rate": 1.3637909064477664e-05, + "loss": 1.7373, + "step": 34800 + }, + { + "epoch": 1.2375788818771665, + "grad_norm": 1.823472261428833, + "learning_rate": 1.3634258216055305e-05, + "loss": 1.701, + "step": 34810 + }, + { + "epoch": 1.2379344058305928, + "grad_norm": 1.8090918064117432, + "learning_rate": 1.3630606809433672e-05, + "loss": 1.6791, + "step": 34820 + }, + { + "epoch": 1.238289929784019, + "grad_norm": 1.8534045219421387, + "learning_rate": 1.3626954845173599e-05, + "loss": 1.6525, + "step": 34830 + }, + { + "epoch": 1.2386454537374456, + "grad_norm": 1.9117337465286255, + "learning_rate": 1.3623302323836001e-05, + "loss": 1.6886, + "step": 34840 + }, + { + "epoch": 1.239000977690872, + "grad_norm": 1.8775508403778076, + "learning_rate": 1.3619649245981885e-05, + "loss": 1.7097, + "step": 34850 + }, + { + "epoch": 1.2393565016442982, + "grad_norm": 1.8384355306625366, + "learning_rate": 1.3615995612172342e-05, + "loss": 1.6513, + "step": 34860 + }, + { + "epoch": 1.2397120255977248, + "grad_norm": 1.7076852321624756, + "learning_rate": 1.3612341422968542e-05, + "loss": 1.6979, + "step": 34870 + }, + { + "epoch": 1.240067549551151, + "grad_norm": 1.8162572383880615, + "learning_rate": 1.3608686678931751e-05, + "loss": 1.6407, + "step": 34880 + }, + { + "epoch": 1.2404230735045774, + "grad_norm": 1.76498544216156, + "learning_rate": 1.3605031380623312e-05, + "loss": 1.6917, + "step": 34890 + }, + { + "epoch": 1.2407785974580037, + "grad_norm": 1.8899500370025635, + "learning_rate": 1.360137552860466e-05, + "loss": 1.6707, + "step": 34900 + }, + { + "epoch": 1.24113412141143, + "grad_norm": 1.8753656148910522, + "learning_rate": 1.3597719123437302e-05, + "loss": 1.7146, + "step": 34910 + }, + { + "epoch": 1.2414896453648565, + "grad_norm": 1.6240572929382324, + "learning_rate": 1.3594062165682846e-05, + "loss": 1.7083, + "step": 34920 + }, + { + "epoch": 1.2418451693182828, + "grad_norm": 1.7258415222167969, + "learning_rate": 1.3590404655902979e-05, + "loss": 1.6886, + "step": 34930 + }, + { + "epoch": 1.2422006932717091, + "grad_norm": 1.8822959661483765, + "learning_rate": 1.3586746594659468e-05, + "loss": 1.7159, + "step": 34940 + }, + { + "epoch": 1.2425562172251357, + "grad_norm": 1.784230351448059, + "learning_rate": 1.3583087982514168e-05, + "loss": 1.6507, + "step": 34950 + }, + { + "epoch": 1.242911741178562, + "grad_norm": 1.8265002965927124, + "learning_rate": 1.3579428820029021e-05, + "loss": 1.7289, + "step": 34960 + }, + { + "epoch": 1.2432672651319883, + "grad_norm": 1.8411922454833984, + "learning_rate": 1.357576910776605e-05, + "loss": 1.6828, + "step": 34970 + }, + { + "epoch": 1.2436227890854146, + "grad_norm": 1.806093692779541, + "learning_rate": 1.3572108846287364e-05, + "loss": 1.6783, + "step": 34980 + }, + { + "epoch": 1.2439783130388409, + "grad_norm": 1.904137372970581, + "learning_rate": 1.3568448036155158e-05, + "loss": 1.6938, + "step": 34990 + }, + { + "epoch": 1.2443338369922674, + "grad_norm": 1.7954858541488647, + "learning_rate": 1.3564786677931706e-05, + "loss": 1.7159, + "step": 35000 + }, + { + "epoch": 1.2446893609456937, + "grad_norm": 1.7145042419433594, + "learning_rate": 1.3561124772179372e-05, + "loss": 1.6792, + "step": 35010 + }, + { + "epoch": 1.24504488489912, + "grad_norm": 1.7948307991027832, + "learning_rate": 1.3557462319460602e-05, + "loss": 1.6726, + "step": 35020 + }, + { + "epoch": 1.2454004088525465, + "grad_norm": 1.7869338989257812, + "learning_rate": 1.3553799320337926e-05, + "loss": 1.7123, + "step": 35030 + }, + { + "epoch": 1.2457559328059729, + "grad_norm": 1.7811120748519897, + "learning_rate": 1.3550135775373957e-05, + "loss": 1.7266, + "step": 35040 + }, + { + "epoch": 1.2461114567593992, + "grad_norm": 1.7845333814620972, + "learning_rate": 1.354647168513139e-05, + "loss": 1.6855, + "step": 35050 + }, + { + "epoch": 1.2464669807128255, + "grad_norm": 1.8383392095565796, + "learning_rate": 1.3542807050173008e-05, + "loss": 1.6979, + "step": 35060 + }, + { + "epoch": 1.2468225046662518, + "grad_norm": 1.8375091552734375, + "learning_rate": 1.3539141871061679e-05, + "loss": 1.7184, + "step": 35070 + }, + { + "epoch": 1.2471780286196783, + "grad_norm": 1.8529398441314697, + "learning_rate": 1.3535476148360349e-05, + "loss": 1.6638, + "step": 35080 + }, + { + "epoch": 1.2475335525731046, + "grad_norm": 1.7339930534362793, + "learning_rate": 1.3531809882632052e-05, + "loss": 1.6769, + "step": 35090 + }, + { + "epoch": 1.247889076526531, + "grad_norm": 1.7067604064941406, + "learning_rate": 1.35281430744399e-05, + "loss": 1.6856, + "step": 35100 + }, + { + "epoch": 1.2482446004799574, + "grad_norm": 1.8274784088134766, + "learning_rate": 1.3524475724347093e-05, + "loss": 1.6603, + "step": 35110 + }, + { + "epoch": 1.2486001244333838, + "grad_norm": 1.8580459356307983, + "learning_rate": 1.3520807832916913e-05, + "loss": 1.7015, + "step": 35120 + }, + { + "epoch": 1.24895564838681, + "grad_norm": 1.9023088216781616, + "learning_rate": 1.3517139400712727e-05, + "loss": 1.7105, + "step": 35130 + }, + { + "epoch": 1.2493111723402364, + "grad_norm": 1.7594802379608154, + "learning_rate": 1.3513470428297981e-05, + "loss": 1.6994, + "step": 35140 + }, + { + "epoch": 1.2496666962936627, + "grad_norm": 1.7522236108779907, + "learning_rate": 1.3509800916236207e-05, + "loss": 1.6827, + "step": 35150 + }, + { + "epoch": 1.2500222202470892, + "grad_norm": 1.883756160736084, + "learning_rate": 1.3506130865091017e-05, + "loss": 1.6812, + "step": 35160 + }, + { + "epoch": 1.2503777442005155, + "grad_norm": 1.8085405826568604, + "learning_rate": 1.350246027542611e-05, + "loss": 1.6747, + "step": 35170 + }, + { + "epoch": 1.2507332681539418, + "grad_norm": 1.7783538103103638, + "learning_rate": 1.3498789147805269e-05, + "loss": 1.7187, + "step": 35180 + }, + { + "epoch": 1.2510887921073683, + "grad_norm": 1.8453665971755981, + "learning_rate": 1.3495117482792348e-05, + "loss": 1.7129, + "step": 35190 + }, + { + "epoch": 1.2514443160607946, + "grad_norm": 1.7402855157852173, + "learning_rate": 1.3491445280951299e-05, + "loss": 1.7051, + "step": 35200 + }, + { + "epoch": 1.251799840014221, + "grad_norm": 1.868834137916565, + "learning_rate": 1.3487772542846145e-05, + "loss": 1.7227, + "step": 35210 + }, + { + "epoch": 1.2521553639676473, + "grad_norm": 1.842235803604126, + "learning_rate": 1.3484099269040997e-05, + "loss": 1.7129, + "step": 35220 + }, + { + "epoch": 1.2525108879210736, + "grad_norm": 1.8452571630477905, + "learning_rate": 1.348042546010005e-05, + "loss": 1.7115, + "step": 35230 + }, + { + "epoch": 1.2528664118745, + "grad_norm": 1.778145670890808, + "learning_rate": 1.3476751116587567e-05, + "loss": 1.7321, + "step": 35240 + }, + { + "epoch": 1.2532219358279264, + "grad_norm": 1.7019305229187012, + "learning_rate": 1.347307623906792e-05, + "loss": 1.6857, + "step": 35250 + }, + { + "epoch": 1.2535774597813527, + "grad_norm": 1.7667639255523682, + "learning_rate": 1.3469400828105537e-05, + "loss": 1.6976, + "step": 35260 + }, + { + "epoch": 1.2539329837347792, + "grad_norm": 1.8197112083435059, + "learning_rate": 1.3465724884264939e-05, + "loss": 1.6532, + "step": 35270 + }, + { + "epoch": 1.2542885076882055, + "grad_norm": 1.7577474117279053, + "learning_rate": 1.3462048408110729e-05, + "loss": 1.7048, + "step": 35280 + }, + { + "epoch": 1.2546440316416319, + "grad_norm": 1.8621435165405273, + "learning_rate": 1.3458371400207591e-05, + "loss": 1.7134, + "step": 35290 + }, + { + "epoch": 1.2549995555950582, + "grad_norm": 1.8118704557418823, + "learning_rate": 1.3454693861120287e-05, + "loss": 1.7063, + "step": 35300 + }, + { + "epoch": 1.2553550795484845, + "grad_norm": 1.9403353929519653, + "learning_rate": 1.345101579141367e-05, + "loss": 1.695, + "step": 35310 + }, + { + "epoch": 1.255710603501911, + "grad_norm": 2.0728812217712402, + "learning_rate": 1.3447337191652665e-05, + "loss": 1.656, + "step": 35320 + }, + { + "epoch": 1.2560661274553373, + "grad_norm": 1.7686488628387451, + "learning_rate": 1.3443658062402284e-05, + "loss": 1.709, + "step": 35330 + }, + { + "epoch": 1.2564216514087636, + "grad_norm": 1.8105318546295166, + "learning_rate": 1.3439978404227616e-05, + "loss": 1.6674, + "step": 35340 + }, + { + "epoch": 1.2567771753621901, + "grad_norm": 1.846477746963501, + "learning_rate": 1.3436298217693832e-05, + "loss": 1.6916, + "step": 35350 + }, + { + "epoch": 1.2571326993156164, + "grad_norm": 1.7145262956619263, + "learning_rate": 1.343261750336619e-05, + "loss": 1.6748, + "step": 35360 + }, + { + "epoch": 1.2574882232690427, + "grad_norm": 1.8705803155899048, + "learning_rate": 1.342893626181002e-05, + "loss": 1.6939, + "step": 35370 + }, + { + "epoch": 1.257843747222469, + "grad_norm": 1.735656976699829, + "learning_rate": 1.3425254493590741e-05, + "loss": 1.6982, + "step": 35380 + }, + { + "epoch": 1.2581992711758954, + "grad_norm": 1.774243712425232, + "learning_rate": 1.3421572199273849e-05, + "loss": 1.6894, + "step": 35390 + }, + { + "epoch": 1.258554795129322, + "grad_norm": 1.7297285795211792, + "learning_rate": 1.3417889379424918e-05, + "loss": 1.6978, + "step": 35400 + }, + { + "epoch": 1.2589103190827482, + "grad_norm": 1.833452582359314, + "learning_rate": 1.341420603460961e-05, + "loss": 1.7104, + "step": 35410 + }, + { + "epoch": 1.2592658430361745, + "grad_norm": 1.8359464406967163, + "learning_rate": 1.3410522165393664e-05, + "loss": 1.7002, + "step": 35420 + }, + { + "epoch": 1.259621366989601, + "grad_norm": 1.799144983291626, + "learning_rate": 1.3406837772342896e-05, + "loss": 1.6812, + "step": 35430 + }, + { + "epoch": 1.2599768909430273, + "grad_norm": 1.920923113822937, + "learning_rate": 1.3403152856023205e-05, + "loss": 1.6805, + "step": 35440 + }, + { + "epoch": 1.2603324148964536, + "grad_norm": 1.8498185873031616, + "learning_rate": 1.3399467417000579e-05, + "loss": 1.6733, + "step": 35450 + }, + { + "epoch": 1.26068793884988, + "grad_norm": 1.8264330625534058, + "learning_rate": 1.3395781455841068e-05, + "loss": 1.6729, + "step": 35460 + }, + { + "epoch": 1.2610434628033063, + "grad_norm": 1.7874951362609863, + "learning_rate": 1.3392094973110817e-05, + "loss": 1.7034, + "step": 35470 + }, + { + "epoch": 1.2613989867567328, + "grad_norm": 1.9353516101837158, + "learning_rate": 1.3388407969376048e-05, + "loss": 1.7026, + "step": 35480 + }, + { + "epoch": 1.261754510710159, + "grad_norm": 2.0174925327301025, + "learning_rate": 1.3384720445203059e-05, + "loss": 1.7073, + "step": 35490 + }, + { + "epoch": 1.2621100346635854, + "grad_norm": 1.7028112411499023, + "learning_rate": 1.3381032401158236e-05, + "loss": 1.6538, + "step": 35500 + }, + { + "epoch": 1.262465558617012, + "grad_norm": 1.7406957149505615, + "learning_rate": 1.337734383780803e-05, + "loss": 1.6865, + "step": 35510 + }, + { + "epoch": 1.2628210825704382, + "grad_norm": 1.819901466369629, + "learning_rate": 1.3373654755718992e-05, + "loss": 1.7195, + "step": 35520 + }, + { + "epoch": 1.2631766065238645, + "grad_norm": 1.7240650653839111, + "learning_rate": 1.3369965155457734e-05, + "loss": 1.7193, + "step": 35530 + }, + { + "epoch": 1.2635321304772908, + "grad_norm": 1.927585244178772, + "learning_rate": 1.3366275037590957e-05, + "loss": 1.7325, + "step": 35540 + }, + { + "epoch": 1.2638876544307172, + "grad_norm": 1.6727463006973267, + "learning_rate": 1.336258440268544e-05, + "loss": 1.712, + "step": 35550 + }, + { + "epoch": 1.2642431783841437, + "grad_norm": 1.7826107740402222, + "learning_rate": 1.3358893251308044e-05, + "loss": 1.6523, + "step": 35560 + }, + { + "epoch": 1.26459870233757, + "grad_norm": 1.875220537185669, + "learning_rate": 1.3355201584025705e-05, + "loss": 1.7161, + "step": 35570 + }, + { + "epoch": 1.2649542262909963, + "grad_norm": 1.8240917921066284, + "learning_rate": 1.3351509401405443e-05, + "loss": 1.7365, + "step": 35580 + }, + { + "epoch": 1.2653097502444228, + "grad_norm": 1.892587661743164, + "learning_rate": 1.3347816704014346e-05, + "loss": 1.6922, + "step": 35590 + }, + { + "epoch": 1.2656652741978491, + "grad_norm": 1.767132043838501, + "learning_rate": 1.3344123492419598e-05, + "loss": 1.6761, + "step": 35600 + }, + { + "epoch": 1.2660207981512754, + "grad_norm": 1.8336524963378906, + "learning_rate": 1.3340429767188448e-05, + "loss": 1.6573, + "step": 35610 + }, + { + "epoch": 1.2663763221047017, + "grad_norm": 1.8649629354476929, + "learning_rate": 1.3336735528888227e-05, + "loss": 1.6805, + "step": 35620 + }, + { + "epoch": 1.266731846058128, + "grad_norm": 1.763104796409607, + "learning_rate": 1.3333040778086353e-05, + "loss": 1.701, + "step": 35630 + }, + { + "epoch": 1.2670873700115546, + "grad_norm": 1.7915949821472168, + "learning_rate": 1.3329345515350316e-05, + "loss": 1.6907, + "step": 35640 + }, + { + "epoch": 1.2674428939649809, + "grad_norm": 1.8075286149978638, + "learning_rate": 1.332564974124768e-05, + "loss": 1.6914, + "step": 35650 + }, + { + "epoch": 1.2677984179184072, + "grad_norm": 1.7828816175460815, + "learning_rate": 1.3321953456346099e-05, + "loss": 1.711, + "step": 35660 + }, + { + "epoch": 1.2681539418718337, + "grad_norm": 1.7524465322494507, + "learning_rate": 1.3318256661213294e-05, + "loss": 1.6562, + "step": 35670 + }, + { + "epoch": 1.26850946582526, + "grad_norm": 1.8603500127792358, + "learning_rate": 1.3314559356417069e-05, + "loss": 1.6989, + "step": 35680 + }, + { + "epoch": 1.2688649897786863, + "grad_norm": 1.7611099481582642, + "learning_rate": 1.3310861542525312e-05, + "loss": 1.6909, + "step": 35690 + }, + { + "epoch": 1.2692205137321126, + "grad_norm": 1.8640097379684448, + "learning_rate": 1.3307163220105983e-05, + "loss": 1.6944, + "step": 35700 + }, + { + "epoch": 1.269576037685539, + "grad_norm": 1.925833821296692, + "learning_rate": 1.3303464389727117e-05, + "loss": 1.705, + "step": 35710 + }, + { + "epoch": 1.2699315616389655, + "grad_norm": 1.918041706085205, + "learning_rate": 1.3299765051956835e-05, + "loss": 1.7184, + "step": 35720 + }, + { + "epoch": 1.2702870855923918, + "grad_norm": 1.8870186805725098, + "learning_rate": 1.3296065207363327e-05, + "loss": 1.6881, + "step": 35730 + }, + { + "epoch": 1.270642609545818, + "grad_norm": 1.8254750967025757, + "learning_rate": 1.3292364856514874e-05, + "loss": 1.6941, + "step": 35740 + }, + { + "epoch": 1.2709981334992446, + "grad_norm": 1.7230134010314941, + "learning_rate": 1.328866399997982e-05, + "loss": 1.711, + "step": 35750 + }, + { + "epoch": 1.271353657452671, + "grad_norm": 1.8757699728012085, + "learning_rate": 1.3284962638326597e-05, + "loss": 1.6883, + "step": 35760 + }, + { + "epoch": 1.2717091814060972, + "grad_norm": 1.719381332397461, + "learning_rate": 1.328126077212371e-05, + "loss": 1.7017, + "step": 35770 + }, + { + "epoch": 1.2720647053595235, + "grad_norm": 1.8136777877807617, + "learning_rate": 1.327755840193974e-05, + "loss": 1.7015, + "step": 35780 + }, + { + "epoch": 1.2724202293129498, + "grad_norm": 1.7327030897140503, + "learning_rate": 1.3273855528343349e-05, + "loss": 1.7203, + "step": 35790 + }, + { + "epoch": 1.2727757532663764, + "grad_norm": 1.8280681371688843, + "learning_rate": 1.327015215190328e-05, + "loss": 1.6835, + "step": 35800 + }, + { + "epoch": 1.2731312772198027, + "grad_norm": 1.7831275463104248, + "learning_rate": 1.3266448273188341e-05, + "loss": 1.6701, + "step": 35810 + }, + { + "epoch": 1.273486801173229, + "grad_norm": 1.9062355756759644, + "learning_rate": 1.3262743892767431e-05, + "loss": 1.6997, + "step": 35820 + }, + { + "epoch": 1.2738423251266555, + "grad_norm": 1.8788557052612305, + "learning_rate": 1.3259039011209515e-05, + "loss": 1.6656, + "step": 35830 + }, + { + "epoch": 1.2741978490800818, + "grad_norm": 1.877271294593811, + "learning_rate": 1.3255333629083642e-05, + "loss": 1.6937, + "step": 35840 + }, + { + "epoch": 1.2745533730335081, + "grad_norm": 1.6842414140701294, + "learning_rate": 1.3251627746958934e-05, + "loss": 1.6561, + "step": 35850 + }, + { + "epoch": 1.2749088969869344, + "grad_norm": 1.8604660034179688, + "learning_rate": 1.324792136540459e-05, + "loss": 1.6548, + "step": 35860 + }, + { + "epoch": 1.2752644209403607, + "grad_norm": 1.8948099613189697, + "learning_rate": 1.3244214484989892e-05, + "loss": 1.6864, + "step": 35870 + }, + { + "epoch": 1.2756199448937873, + "grad_norm": 1.9044815301895142, + "learning_rate": 1.324050710628419e-05, + "loss": 1.7124, + "step": 35880 + }, + { + "epoch": 1.2759754688472136, + "grad_norm": 1.8357096910476685, + "learning_rate": 1.3236799229856914e-05, + "loss": 1.6783, + "step": 35890 + }, + { + "epoch": 1.2763309928006399, + "grad_norm": 1.8171970844268799, + "learning_rate": 1.3233090856277573e-05, + "loss": 1.6754, + "step": 35900 + }, + { + "epoch": 1.2766865167540664, + "grad_norm": 1.913097858428955, + "learning_rate": 1.3229381986115746e-05, + "loss": 1.6948, + "step": 35910 + }, + { + "epoch": 1.2770420407074927, + "grad_norm": 1.8764780759811401, + "learning_rate": 1.3225672619941094e-05, + "loss": 1.6527, + "step": 35920 + }, + { + "epoch": 1.277397564660919, + "grad_norm": 1.7733298540115356, + "learning_rate": 1.3221962758323352e-05, + "loss": 1.703, + "step": 35930 + }, + { + "epoch": 1.2777530886143453, + "grad_norm": 1.6520118713378906, + "learning_rate": 1.3218252401832334e-05, + "loss": 1.6956, + "step": 35940 + }, + { + "epoch": 1.2781086125677716, + "grad_norm": 1.903979778289795, + "learning_rate": 1.3214541551037927e-05, + "loss": 1.6827, + "step": 35950 + }, + { + "epoch": 1.2784641365211982, + "grad_norm": 1.8394795656204224, + "learning_rate": 1.321083020651009e-05, + "loss": 1.6883, + "step": 35960 + }, + { + "epoch": 1.2788196604746245, + "grad_norm": 1.7833898067474365, + "learning_rate": 1.3207118368818866e-05, + "loss": 1.6519, + "step": 35970 + }, + { + "epoch": 1.2791751844280508, + "grad_norm": 1.7083699703216553, + "learning_rate": 1.3203406038534369e-05, + "loss": 1.6612, + "step": 35980 + }, + { + "epoch": 1.2795307083814773, + "grad_norm": 1.6796025037765503, + "learning_rate": 1.3199693216226792e-05, + "loss": 1.7073, + "step": 35990 + }, + { + "epoch": 1.2798862323349036, + "grad_norm": 1.7842929363250732, + "learning_rate": 1.3195979902466398e-05, + "loss": 1.6563, + "step": 36000 + }, + { + "epoch": 1.28024175628833, + "grad_norm": 1.8019449710845947, + "learning_rate": 1.3192266097823531e-05, + "loss": 1.6759, + "step": 36010 + }, + { + "epoch": 1.2805972802417562, + "grad_norm": 1.7412304878234863, + "learning_rate": 1.3188551802868606e-05, + "loss": 1.6781, + "step": 36020 + }, + { + "epoch": 1.2809528041951825, + "grad_norm": 1.7398102283477783, + "learning_rate": 1.3184837018172117e-05, + "loss": 1.6621, + "step": 36030 + }, + { + "epoch": 1.281308328148609, + "grad_norm": 1.9147517681121826, + "learning_rate": 1.3181121744304628e-05, + "loss": 1.7276, + "step": 36040 + }, + { + "epoch": 1.2816638521020354, + "grad_norm": 1.8121700286865234, + "learning_rate": 1.3177405981836788e-05, + "loss": 1.6727, + "step": 36050 + }, + { + "epoch": 1.2820193760554617, + "grad_norm": 1.7942123413085938, + "learning_rate": 1.3173689731339315e-05, + "loss": 1.689, + "step": 36060 + }, + { + "epoch": 1.2823749000088882, + "grad_norm": 2.0419764518737793, + "learning_rate": 1.3169972993382991e-05, + "loss": 1.7237, + "step": 36070 + }, + { + "epoch": 1.2827304239623145, + "grad_norm": 1.829637050628662, + "learning_rate": 1.3166255768538699e-05, + "loss": 1.7182, + "step": 36080 + }, + { + "epoch": 1.2830859479157408, + "grad_norm": 1.7687910795211792, + "learning_rate": 1.3162538057377367e-05, + "loss": 1.7163, + "step": 36090 + }, + { + "epoch": 1.2834414718691671, + "grad_norm": 1.6497434377670288, + "learning_rate": 1.3158819860470021e-05, + "loss": 1.7149, + "step": 36100 + }, + { + "epoch": 1.2837969958225934, + "grad_norm": 1.7790758609771729, + "learning_rate": 1.315510117838775e-05, + "loss": 1.6979, + "step": 36110 + }, + { + "epoch": 1.28415251977602, + "grad_norm": 1.9528772830963135, + "learning_rate": 1.315138201170172e-05, + "loss": 1.7035, + "step": 36120 + }, + { + "epoch": 1.2845080437294463, + "grad_norm": 1.720088243484497, + "learning_rate": 1.3147662360983176e-05, + "loss": 1.6743, + "step": 36130 + }, + { + "epoch": 1.2848635676828726, + "grad_norm": 1.839159607887268, + "learning_rate": 1.3143942226803427e-05, + "loss": 1.6764, + "step": 36140 + }, + { + "epoch": 1.285219091636299, + "grad_norm": 1.770328164100647, + "learning_rate": 1.3140221609733862e-05, + "loss": 1.695, + "step": 36150 + }, + { + "epoch": 1.2855746155897254, + "grad_norm": 1.7510783672332764, + "learning_rate": 1.3136500510345948e-05, + "loss": 1.6912, + "step": 36160 + }, + { + "epoch": 1.2859301395431517, + "grad_norm": 1.9082367420196533, + "learning_rate": 1.3132778929211225e-05, + "loss": 1.6526, + "step": 36170 + }, + { + "epoch": 1.286285663496578, + "grad_norm": 2.1014037132263184, + "learning_rate": 1.3129056866901297e-05, + "loss": 1.6782, + "step": 36180 + }, + { + "epoch": 1.2866411874500043, + "grad_norm": 1.6842938661575317, + "learning_rate": 1.312533432398786e-05, + "loss": 1.6733, + "step": 36190 + }, + { + "epoch": 1.2869967114034309, + "grad_norm": 1.7734049558639526, + "learning_rate": 1.312161130104266e-05, + "loss": 1.6739, + "step": 36200 + }, + { + "epoch": 1.2873522353568572, + "grad_norm": 1.7987641096115112, + "learning_rate": 1.3117887798637538e-05, + "loss": 1.7179, + "step": 36210 + }, + { + "epoch": 1.2877077593102835, + "grad_norm": 1.8217778205871582, + "learning_rate": 1.3114163817344403e-05, + "loss": 1.6731, + "step": 36220 + }, + { + "epoch": 1.28806328326371, + "grad_norm": 1.8839751482009888, + "learning_rate": 1.311043935773523e-05, + "loss": 1.7021, + "step": 36230 + }, + { + "epoch": 1.2884188072171363, + "grad_norm": 1.7346824407577515, + "learning_rate": 1.3106714420382072e-05, + "loss": 1.697, + "step": 36240 + }, + { + "epoch": 1.2887743311705626, + "grad_norm": 1.930270791053772, + "learning_rate": 1.3102989005857061e-05, + "loss": 1.7169, + "step": 36250 + }, + { + "epoch": 1.289129855123989, + "grad_norm": 1.8689700365066528, + "learning_rate": 1.3099263114732392e-05, + "loss": 1.7053, + "step": 36260 + }, + { + "epoch": 1.2894853790774152, + "grad_norm": 1.8215107917785645, + "learning_rate": 1.3095536747580344e-05, + "loss": 1.6984, + "step": 36270 + }, + { + "epoch": 1.2898409030308418, + "grad_norm": 1.6780635118484497, + "learning_rate": 1.3091809904973259e-05, + "loss": 1.6695, + "step": 36280 + }, + { + "epoch": 1.290196426984268, + "grad_norm": 1.7548359632492065, + "learning_rate": 1.3088082587483556e-05, + "loss": 1.6656, + "step": 36290 + }, + { + "epoch": 1.2905519509376944, + "grad_norm": 1.8181124925613403, + "learning_rate": 1.3084354795683735e-05, + "loss": 1.6819, + "step": 36300 + }, + { + "epoch": 1.290907474891121, + "grad_norm": 1.7393866777420044, + "learning_rate": 1.3080626530146354e-05, + "loss": 1.7029, + "step": 36310 + }, + { + "epoch": 1.2912629988445472, + "grad_norm": 1.8912163972854614, + "learning_rate": 1.3076897791444057e-05, + "loss": 1.6755, + "step": 36320 + }, + { + "epoch": 1.2916185227979735, + "grad_norm": 1.8694416284561157, + "learning_rate": 1.3073168580149547e-05, + "loss": 1.6558, + "step": 36330 + }, + { + "epoch": 1.2919740467513998, + "grad_norm": 1.9025851488113403, + "learning_rate": 1.3069438896835611e-05, + "loss": 1.7164, + "step": 36340 + }, + { + "epoch": 1.2923295707048261, + "grad_norm": 1.8557045459747314, + "learning_rate": 1.3065708742075109e-05, + "loss": 1.7191, + "step": 36350 + }, + { + "epoch": 1.2926850946582527, + "grad_norm": 1.769911527633667, + "learning_rate": 1.3061978116440965e-05, + "loss": 1.6958, + "step": 36360 + }, + { + "epoch": 1.293040618611679, + "grad_norm": 1.8770461082458496, + "learning_rate": 1.3058247020506181e-05, + "loss": 1.6676, + "step": 36370 + }, + { + "epoch": 1.2933961425651053, + "grad_norm": 1.7757688760757446, + "learning_rate": 1.3054515454843832e-05, + "loss": 1.7228, + "step": 36380 + }, + { + "epoch": 1.2937516665185318, + "grad_norm": 1.818790316581726, + "learning_rate": 1.3050783420027063e-05, + "loss": 1.6467, + "step": 36390 + }, + { + "epoch": 1.294107190471958, + "grad_norm": 2.066945791244507, + "learning_rate": 1.3047050916629085e-05, + "loss": 1.6781, + "step": 36400 + }, + { + "epoch": 1.2944627144253844, + "grad_norm": 1.7122598886489868, + "learning_rate": 1.3043317945223191e-05, + "loss": 1.748, + "step": 36410 + }, + { + "epoch": 1.2948182383788107, + "grad_norm": 1.7318012714385986, + "learning_rate": 1.3039584506382745e-05, + "loss": 1.674, + "step": 36420 + }, + { + "epoch": 1.295173762332237, + "grad_norm": 1.8598337173461914, + "learning_rate": 1.3035850600681175e-05, + "loss": 1.6954, + "step": 36430 + }, + { + "epoch": 1.2955292862856636, + "grad_norm": 1.7719144821166992, + "learning_rate": 1.3032116228691991e-05, + "loss": 1.7006, + "step": 36440 + }, + { + "epoch": 1.2958848102390899, + "grad_norm": 1.800174593925476, + "learning_rate": 1.3028381390988762e-05, + "loss": 1.6485, + "step": 36450 + }, + { + "epoch": 1.2962403341925162, + "grad_norm": 1.820265531539917, + "learning_rate": 1.302464608814514e-05, + "loss": 1.6604, + "step": 36460 + }, + { + "epoch": 1.2965958581459427, + "grad_norm": 1.8295187950134277, + "learning_rate": 1.3020910320734845e-05, + "loss": 1.7148, + "step": 36470 + }, + { + "epoch": 1.296951382099369, + "grad_norm": 1.8874891996383667, + "learning_rate": 1.3017174089331666e-05, + "loss": 1.7115, + "step": 36480 + }, + { + "epoch": 1.2973069060527953, + "grad_norm": 1.7128124237060547, + "learning_rate": 1.3013437394509462e-05, + "loss": 1.6749, + "step": 36490 + }, + { + "epoch": 1.2976624300062216, + "grad_norm": 1.8880102634429932, + "learning_rate": 1.300970023684217e-05, + "loss": 1.6478, + "step": 36500 + }, + { + "epoch": 1.298017953959648, + "grad_norm": 1.7475587129592896, + "learning_rate": 1.3005962616903797e-05, + "loss": 1.6977, + "step": 36510 + }, + { + "epoch": 1.2983734779130744, + "grad_norm": 1.8399840593338013, + "learning_rate": 1.3002224535268408e-05, + "loss": 1.687, + "step": 36520 + }, + { + "epoch": 1.2987290018665008, + "grad_norm": 1.9142301082611084, + "learning_rate": 1.2998485992510156e-05, + "loss": 1.6904, + "step": 36530 + }, + { + "epoch": 1.299084525819927, + "grad_norm": 1.7761683464050293, + "learning_rate": 1.299474698920326e-05, + "loss": 1.678, + "step": 36540 + }, + { + "epoch": 1.2994400497733536, + "grad_norm": 1.825366735458374, + "learning_rate": 1.2991007525921999e-05, + "loss": 1.6866, + "step": 36550 + }, + { + "epoch": 1.29979557372678, + "grad_norm": 1.8218415975570679, + "learning_rate": 1.2987267603240736e-05, + "loss": 1.6699, + "step": 36560 + }, + { + "epoch": 1.3001510976802062, + "grad_norm": 1.7932249307632446, + "learning_rate": 1.2983527221733902e-05, + "loss": 1.6618, + "step": 36570 + }, + { + "epoch": 1.3005066216336325, + "grad_norm": 1.7458404302597046, + "learning_rate": 1.2979786381975991e-05, + "loss": 1.7192, + "step": 36580 + }, + { + "epoch": 1.3008621455870588, + "grad_norm": 1.9243357181549072, + "learning_rate": 1.2976045084541578e-05, + "loss": 1.6671, + "step": 36590 + }, + { + "epoch": 1.3012176695404853, + "grad_norm": 1.8128772974014282, + "learning_rate": 1.2972303330005296e-05, + "loss": 1.7004, + "step": 36600 + }, + { + "epoch": 1.3015731934939117, + "grad_norm": 1.8563944101333618, + "learning_rate": 1.296856111894186e-05, + "loss": 1.6885, + "step": 36610 + }, + { + "epoch": 1.301928717447338, + "grad_norm": 1.721156120300293, + "learning_rate": 1.2964818451926053e-05, + "loss": 1.6827, + "step": 36620 + }, + { + "epoch": 1.3022842414007645, + "grad_norm": 1.9434478282928467, + "learning_rate": 1.296107532953272e-05, + "loss": 1.6935, + "step": 36630 + }, + { + "epoch": 1.3026397653541908, + "grad_norm": 1.8720649480819702, + "learning_rate": 1.2957331752336782e-05, + "loss": 1.6935, + "step": 36640 + }, + { + "epoch": 1.302995289307617, + "grad_norm": 1.8462556600570679, + "learning_rate": 1.2953587720913225e-05, + "loss": 1.6932, + "step": 36650 + }, + { + "epoch": 1.3033508132610434, + "grad_norm": 1.834159016609192, + "learning_rate": 1.2949843235837119e-05, + "loss": 1.6637, + "step": 36660 + }, + { + "epoch": 1.3037063372144697, + "grad_norm": 1.7397176027297974, + "learning_rate": 1.2946098297683582e-05, + "loss": 1.6449, + "step": 36670 + }, + { + "epoch": 1.3040618611678962, + "grad_norm": 1.9851504564285278, + "learning_rate": 1.2942352907027822e-05, + "loss": 1.6977, + "step": 36680 + }, + { + "epoch": 1.3044173851213225, + "grad_norm": 1.8635345697402954, + "learning_rate": 1.2938607064445105e-05, + "loss": 1.6898, + "step": 36690 + }, + { + "epoch": 1.3047729090747489, + "grad_norm": 1.7845070362091064, + "learning_rate": 1.293486077051077e-05, + "loss": 1.6941, + "step": 36700 + }, + { + "epoch": 1.3051284330281754, + "grad_norm": 1.809970498085022, + "learning_rate": 1.293111402580022e-05, + "loss": 1.6713, + "step": 36710 + }, + { + "epoch": 1.3054839569816017, + "grad_norm": 1.8623954057693481, + "learning_rate": 1.2927366830888933e-05, + "loss": 1.676, + "step": 36720 + }, + { + "epoch": 1.305839480935028, + "grad_norm": 1.7936292886734009, + "learning_rate": 1.2923619186352454e-05, + "loss": 1.7016, + "step": 36730 + }, + { + "epoch": 1.3061950048884543, + "grad_norm": 1.50461745262146, + "learning_rate": 1.2919871092766403e-05, + "loss": 1.6145, + "step": 36740 + }, + { + "epoch": 1.3065505288418806, + "grad_norm": 1.7584197521209717, + "learning_rate": 1.2916122550706458e-05, + "loss": 1.6588, + "step": 36750 + }, + { + "epoch": 1.3069060527953071, + "grad_norm": 1.8236706256866455, + "learning_rate": 1.2912373560748374e-05, + "loss": 1.7182, + "step": 36760 + }, + { + "epoch": 1.3072615767487334, + "grad_norm": 1.6947407722473145, + "learning_rate": 1.290862412346797e-05, + "loss": 1.6811, + "step": 36770 + }, + { + "epoch": 1.3076171007021598, + "grad_norm": 1.9147242307662964, + "learning_rate": 1.2904874239441143e-05, + "loss": 1.6334, + "step": 36780 + }, + { + "epoch": 1.3079726246555863, + "grad_norm": 1.8855173587799072, + "learning_rate": 1.2901123909243842e-05, + "loss": 1.7351, + "step": 36790 + }, + { + "epoch": 1.3083281486090126, + "grad_norm": 1.8250107765197754, + "learning_rate": 1.2897373133452098e-05, + "loss": 1.713, + "step": 36800 + }, + { + "epoch": 1.308683672562439, + "grad_norm": 1.9745123386383057, + "learning_rate": 1.2893621912642007e-05, + "loss": 1.7065, + "step": 36810 + }, + { + "epoch": 1.3090391965158652, + "grad_norm": 1.8133615255355835, + "learning_rate": 1.2889870247389738e-05, + "loss": 1.6815, + "step": 36820 + }, + { + "epoch": 1.3093947204692915, + "grad_norm": 1.8287822008132935, + "learning_rate": 1.2886118138271514e-05, + "loss": 1.7009, + "step": 36830 + }, + { + "epoch": 1.309750244422718, + "grad_norm": 1.7905579805374146, + "learning_rate": 1.2882365585863643e-05, + "loss": 1.6893, + "step": 36840 + }, + { + "epoch": 1.3101057683761443, + "grad_norm": 1.9743231534957886, + "learning_rate": 1.287861259074249e-05, + "loss": 1.6891, + "step": 36850 + }, + { + "epoch": 1.3104612923295706, + "grad_norm": 1.8222795724868774, + "learning_rate": 1.2874859153484492e-05, + "loss": 1.6787, + "step": 36860 + }, + { + "epoch": 1.3108168162829972, + "grad_norm": 1.750848412513733, + "learning_rate": 1.2871105274666154e-05, + "loss": 1.6888, + "step": 36870 + }, + { + "epoch": 1.3111723402364235, + "grad_norm": 1.8487437963485718, + "learning_rate": 1.2867350954864048e-05, + "loss": 1.6363, + "step": 36880 + }, + { + "epoch": 1.3115278641898498, + "grad_norm": 1.7813891172409058, + "learning_rate": 1.2863596194654813e-05, + "loss": 1.6858, + "step": 36890 + }, + { + "epoch": 1.311883388143276, + "grad_norm": 1.7855768203735352, + "learning_rate": 1.2859840994615156e-05, + "loss": 1.642, + "step": 36900 + }, + { + "epoch": 1.3122389120967024, + "grad_norm": 1.7619067430496216, + "learning_rate": 1.2856085355321852e-05, + "loss": 1.7181, + "step": 36910 + }, + { + "epoch": 1.312594436050129, + "grad_norm": 1.8910986185073853, + "learning_rate": 1.2852329277351746e-05, + "loss": 1.7122, + "step": 36920 + }, + { + "epoch": 1.3129499600035552, + "grad_norm": 1.8386540412902832, + "learning_rate": 1.2848572761281752e-05, + "loss": 1.747, + "step": 36930 + }, + { + "epoch": 1.3133054839569815, + "grad_norm": 1.8845070600509644, + "learning_rate": 1.284481580768884e-05, + "loss": 1.7065, + "step": 36940 + }, + { + "epoch": 1.313661007910408, + "grad_norm": 1.8629308938980103, + "learning_rate": 1.2841058417150059e-05, + "loss": 1.7201, + "step": 36950 + }, + { + "epoch": 1.3140165318638344, + "grad_norm": 1.7198325395584106, + "learning_rate": 1.2837300590242517e-05, + "loss": 1.6766, + "step": 36960 + }, + { + "epoch": 1.3143720558172607, + "grad_norm": 1.796847939491272, + "learning_rate": 1.2833542327543392e-05, + "loss": 1.6875, + "step": 36970 + }, + { + "epoch": 1.314727579770687, + "grad_norm": 1.7964692115783691, + "learning_rate": 1.2829783629629933e-05, + "loss": 1.6483, + "step": 36980 + }, + { + "epoch": 1.3150831037241133, + "grad_norm": 1.7154792547225952, + "learning_rate": 1.2826024497079452e-05, + "loss": 1.7301, + "step": 36990 + }, + { + "epoch": 1.3154386276775398, + "grad_norm": 1.9361765384674072, + "learning_rate": 1.2822264930469329e-05, + "loss": 1.7092, + "step": 37000 + }, + { + "epoch": 1.3157941516309661, + "grad_norm": 1.7642241716384888, + "learning_rate": 1.2818504930377007e-05, + "loss": 1.6975, + "step": 37010 + }, + { + "epoch": 1.3161496755843924, + "grad_norm": 1.816183090209961, + "learning_rate": 1.2814744497380001e-05, + "loss": 1.6853, + "step": 37020 + }, + { + "epoch": 1.316505199537819, + "grad_norm": 1.8195050954818726, + "learning_rate": 1.2810983632055887e-05, + "loss": 1.6636, + "step": 37030 + }, + { + "epoch": 1.3168607234912453, + "grad_norm": 1.9385960102081299, + "learning_rate": 1.2807222334982312e-05, + "loss": 1.6698, + "step": 37040 + }, + { + "epoch": 1.3172162474446716, + "grad_norm": 1.8485578298568726, + "learning_rate": 1.2803460606736989e-05, + "loss": 1.7425, + "step": 37050 + }, + { + "epoch": 1.317571771398098, + "grad_norm": 1.8752837181091309, + "learning_rate": 1.2799698447897695e-05, + "loss": 1.6967, + "step": 37060 + }, + { + "epoch": 1.3179272953515242, + "grad_norm": 1.817765712738037, + "learning_rate": 1.2795935859042272e-05, + "loss": 1.6453, + "step": 37070 + }, + { + "epoch": 1.3182828193049507, + "grad_norm": 1.7797666788101196, + "learning_rate": 1.2792172840748633e-05, + "loss": 1.6916, + "step": 37080 + }, + { + "epoch": 1.318638343258377, + "grad_norm": 1.8178123235702515, + "learning_rate": 1.2788409393594753e-05, + "loss": 1.646, + "step": 37090 + }, + { + "epoch": 1.3189938672118033, + "grad_norm": 1.9050863981246948, + "learning_rate": 1.2784645518158674e-05, + "loss": 1.6862, + "step": 37100 + }, + { + "epoch": 1.3193493911652299, + "grad_norm": 1.8194165229797363, + "learning_rate": 1.2780881215018502e-05, + "loss": 1.7165, + "step": 37110 + }, + { + "epoch": 1.3197049151186562, + "grad_norm": 1.7735542058944702, + "learning_rate": 1.277711648475241e-05, + "loss": 1.734, + "step": 37120 + }, + { + "epoch": 1.3200604390720825, + "grad_norm": 1.9834961891174316, + "learning_rate": 1.2773351327938643e-05, + "loss": 1.6942, + "step": 37130 + }, + { + "epoch": 1.3204159630255088, + "grad_norm": 1.795721411705017, + "learning_rate": 1.2769585745155497e-05, + "loss": 1.7135, + "step": 37140 + }, + { + "epoch": 1.320771486978935, + "grad_norm": 1.7426166534423828, + "learning_rate": 1.2765819736981346e-05, + "loss": 1.6587, + "step": 37150 + }, + { + "epoch": 1.3211270109323616, + "grad_norm": 1.8144384622573853, + "learning_rate": 1.2762053303994627e-05, + "loss": 1.6538, + "step": 37160 + }, + { + "epoch": 1.321482534885788, + "grad_norm": 1.7716857194900513, + "learning_rate": 1.2758286446773838e-05, + "loss": 1.6843, + "step": 37170 + }, + { + "epoch": 1.3218380588392142, + "grad_norm": 1.7559036016464233, + "learning_rate": 1.2754519165897547e-05, + "loss": 1.6962, + "step": 37180 + }, + { + "epoch": 1.3221935827926408, + "grad_norm": 1.9076648950576782, + "learning_rate": 1.2750751461944384e-05, + "loss": 1.6504, + "step": 37190 + }, + { + "epoch": 1.322549106746067, + "grad_norm": 1.8252453804016113, + "learning_rate": 1.2746983335493042e-05, + "loss": 1.6783, + "step": 37200 + }, + { + "epoch": 1.3229046306994934, + "grad_norm": 1.7769012451171875, + "learning_rate": 1.2743214787122282e-05, + "loss": 1.6997, + "step": 37210 + }, + { + "epoch": 1.3232601546529197, + "grad_norm": 1.8328056335449219, + "learning_rate": 1.2739445817410931e-05, + "loss": 1.6457, + "step": 37220 + }, + { + "epoch": 1.323615678606346, + "grad_norm": 1.9582966566085815, + "learning_rate": 1.273567642693788e-05, + "loss": 1.679, + "step": 37230 + }, + { + "epoch": 1.3239712025597725, + "grad_norm": 2.056260585784912, + "learning_rate": 1.2731906616282081e-05, + "loss": 1.6784, + "step": 37240 + }, + { + "epoch": 1.3243267265131988, + "grad_norm": 1.9473516941070557, + "learning_rate": 1.2728136386022558e-05, + "loss": 1.6985, + "step": 37250 + }, + { + "epoch": 1.3246822504666251, + "grad_norm": 1.698970913887024, + "learning_rate": 1.272436573673839e-05, + "loss": 1.6747, + "step": 37260 + }, + { + "epoch": 1.3250377744200517, + "grad_norm": 1.7800887823104858, + "learning_rate": 1.2720594669008728e-05, + "loss": 1.703, + "step": 37270 + }, + { + "epoch": 1.325393298373478, + "grad_norm": 1.6938693523406982, + "learning_rate": 1.271682318341278e-05, + "loss": 1.7243, + "step": 37280 + }, + { + "epoch": 1.3257488223269043, + "grad_norm": 1.6857800483703613, + "learning_rate": 1.2713051280529829e-05, + "loss": 1.7032, + "step": 37290 + }, + { + "epoch": 1.3261043462803306, + "grad_norm": 1.7712005376815796, + "learning_rate": 1.2709278960939209e-05, + "loss": 1.6491, + "step": 37300 + }, + { + "epoch": 1.3264598702337569, + "grad_norm": 1.8196500539779663, + "learning_rate": 1.2705506225220332e-05, + "loss": 1.7185, + "step": 37310 + }, + { + "epoch": 1.3268153941871834, + "grad_norm": 1.7919986248016357, + "learning_rate": 1.270173307395266e-05, + "loss": 1.6856, + "step": 37320 + }, + { + "epoch": 1.3271709181406097, + "grad_norm": 1.8808327913284302, + "learning_rate": 1.2697959507715727e-05, + "loss": 1.7187, + "step": 37330 + }, + { + "epoch": 1.327526442094036, + "grad_norm": 1.83591890335083, + "learning_rate": 1.2694185527089132e-05, + "loss": 1.6613, + "step": 37340 + }, + { + "epoch": 1.3278819660474626, + "grad_norm": 1.803751826286316, + "learning_rate": 1.2690411132652532e-05, + "loss": 1.7158, + "step": 37350 + }, + { + "epoch": 1.3282374900008889, + "grad_norm": 1.7949421405792236, + "learning_rate": 1.2686636324985649e-05, + "loss": 1.6868, + "step": 37360 + }, + { + "epoch": 1.3285930139543152, + "grad_norm": 1.8414742946624756, + "learning_rate": 1.2682861104668276e-05, + "loss": 1.6864, + "step": 37370 + }, + { + "epoch": 1.3289485379077415, + "grad_norm": 1.882709264755249, + "learning_rate": 1.2679085472280255e-05, + "loss": 1.6579, + "step": 37380 + }, + { + "epoch": 1.3293040618611678, + "grad_norm": 1.8424874544143677, + "learning_rate": 1.2675309428401502e-05, + "loss": 1.6771, + "step": 37390 + }, + { + "epoch": 1.3296595858145943, + "grad_norm": 1.8590044975280762, + "learning_rate": 1.2671532973611999e-05, + "loss": 1.7331, + "step": 37400 + }, + { + "epoch": 1.3300151097680206, + "grad_norm": 1.7579020261764526, + "learning_rate": 1.266775610849178e-05, + "loss": 1.7365, + "step": 37410 + }, + { + "epoch": 1.330370633721447, + "grad_norm": 1.7553985118865967, + "learning_rate": 1.2663978833620954e-05, + "loss": 1.7171, + "step": 37420 + }, + { + "epoch": 1.3307261576748735, + "grad_norm": 1.8007732629776, + "learning_rate": 1.2660201149579678e-05, + "loss": 1.6695, + "step": 37430 + }, + { + "epoch": 1.3310816816282998, + "grad_norm": 1.9665582180023193, + "learning_rate": 1.2656423056948188e-05, + "loss": 1.6685, + "step": 37440 + }, + { + "epoch": 1.331437205581726, + "grad_norm": 1.8420867919921875, + "learning_rate": 1.265264455630677e-05, + "loss": 1.7187, + "step": 37450 + }, + { + "epoch": 1.3317927295351524, + "grad_norm": 1.8061645030975342, + "learning_rate": 1.264886564823578e-05, + "loss": 1.6622, + "step": 37460 + }, + { + "epoch": 1.3321482534885787, + "grad_norm": 1.8627655506134033, + "learning_rate": 1.2645086333315636e-05, + "loss": 1.7042, + "step": 37470 + }, + { + "epoch": 1.3325037774420052, + "grad_norm": 1.8341363668441772, + "learning_rate": 1.2641306612126813e-05, + "loss": 1.6541, + "step": 37480 + }, + { + "epoch": 1.3328593013954315, + "grad_norm": 1.8347748517990112, + "learning_rate": 1.263752648524986e-05, + "loss": 1.6792, + "step": 37490 + }, + { + "epoch": 1.3332148253488578, + "grad_norm": 1.8106322288513184, + "learning_rate": 1.2633745953265377e-05, + "loss": 1.6903, + "step": 37500 + }, + { + "epoch": 1.3335703493022844, + "grad_norm": 1.8910024166107178, + "learning_rate": 1.2629965016754027e-05, + "loss": 1.6358, + "step": 37510 + }, + { + "epoch": 1.3339258732557107, + "grad_norm": 1.847715973854065, + "learning_rate": 1.262618367629654e-05, + "loss": 1.6791, + "step": 37520 + }, + { + "epoch": 1.334281397209137, + "grad_norm": 1.8155648708343506, + "learning_rate": 1.2622401932473705e-05, + "loss": 1.6605, + "step": 37530 + }, + { + "epoch": 1.3346369211625633, + "grad_norm": 1.8898736238479614, + "learning_rate": 1.2618619785866377e-05, + "loss": 1.7061, + "step": 37540 + }, + { + "epoch": 1.3349924451159896, + "grad_norm": 1.9694591760635376, + "learning_rate": 1.2614837237055468e-05, + "loss": 1.6914, + "step": 37550 + }, + { + "epoch": 1.335347969069416, + "grad_norm": 2.3319625854492188, + "learning_rate": 1.261105428662196e-05, + "loss": 1.6767, + "step": 37560 + }, + { + "epoch": 1.3357034930228424, + "grad_norm": 1.8441003561019897, + "learning_rate": 1.260727093514688e-05, + "loss": 1.6525, + "step": 37570 + }, + { + "epoch": 1.3360590169762687, + "grad_norm": 1.8745046854019165, + "learning_rate": 1.260348718321133e-05, + "loss": 1.6947, + "step": 37580 + }, + { + "epoch": 1.3364145409296952, + "grad_norm": 1.7331849336624146, + "learning_rate": 1.259970303139648e-05, + "loss": 1.7215, + "step": 37590 + }, + { + "epoch": 1.3367700648831216, + "grad_norm": 1.8349084854125977, + "learning_rate": 1.2595918480283538e-05, + "loss": 1.6903, + "step": 37600 + }, + { + "epoch": 1.3371255888365479, + "grad_norm": 1.7928951978683472, + "learning_rate": 1.2592133530453797e-05, + "loss": 1.676, + "step": 37610 + }, + { + "epoch": 1.3374811127899742, + "grad_norm": 2.0136008262634277, + "learning_rate": 1.2588348182488599e-05, + "loss": 1.6742, + "step": 37620 + }, + { + "epoch": 1.3378366367434005, + "grad_norm": 1.8972746133804321, + "learning_rate": 1.2584562436969348e-05, + "loss": 1.7219, + "step": 37630 + }, + { + "epoch": 1.338192160696827, + "grad_norm": 1.7846343517303467, + "learning_rate": 1.258077629447751e-05, + "loss": 1.6871, + "step": 37640 + }, + { + "epoch": 1.3385476846502533, + "grad_norm": 1.8348819017410278, + "learning_rate": 1.2576989755594617e-05, + "loss": 1.697, + "step": 37650 + }, + { + "epoch": 1.3389032086036796, + "grad_norm": 1.8878655433654785, + "learning_rate": 1.2573202820902254e-05, + "loss": 1.7241, + "step": 37660 + }, + { + "epoch": 1.3392587325571061, + "grad_norm": 1.9071545600891113, + "learning_rate": 1.2569415490982075e-05, + "loss": 1.7057, + "step": 37670 + }, + { + "epoch": 1.3396142565105325, + "grad_norm": 1.938977837562561, + "learning_rate": 1.2565627766415784e-05, + "loss": 1.7211, + "step": 37680 + }, + { + "epoch": 1.3399697804639588, + "grad_norm": 1.8646774291992188, + "learning_rate": 1.2561839647785159e-05, + "loss": 1.7011, + "step": 37690 + }, + { + "epoch": 1.340325304417385, + "grad_norm": 2.070342779159546, + "learning_rate": 1.2558051135672022e-05, + "loss": 1.6739, + "step": 37700 + }, + { + "epoch": 1.3406808283708114, + "grad_norm": 1.8355119228363037, + "learning_rate": 1.2554262230658271e-05, + "loss": 1.6937, + "step": 37710 + }, + { + "epoch": 1.341036352324238, + "grad_norm": 1.8518496751785278, + "learning_rate": 1.2550472933325856e-05, + "loss": 1.6695, + "step": 37720 + }, + { + "epoch": 1.3413918762776642, + "grad_norm": 1.8776099681854248, + "learning_rate": 1.2546683244256792e-05, + "loss": 1.7149, + "step": 37730 + }, + { + "epoch": 1.3417474002310905, + "grad_norm": 1.8292155265808105, + "learning_rate": 1.254289316403315e-05, + "loss": 1.6847, + "step": 37740 + }, + { + "epoch": 1.342102924184517, + "grad_norm": 1.7810564041137695, + "learning_rate": 1.2539102693237062e-05, + "loss": 1.7141, + "step": 37750 + }, + { + "epoch": 1.3424584481379433, + "grad_norm": 1.7767292261123657, + "learning_rate": 1.2535311832450718e-05, + "loss": 1.6408, + "step": 37760 + }, + { + "epoch": 1.3428139720913697, + "grad_norm": 1.768619179725647, + "learning_rate": 1.2531520582256374e-05, + "loss": 1.6612, + "step": 37770 + }, + { + "epoch": 1.343169496044796, + "grad_norm": 1.9853242635726929, + "learning_rate": 1.252772894323634e-05, + "loss": 1.7205, + "step": 37780 + }, + { + "epoch": 1.3435250199982223, + "grad_norm": 1.761792778968811, + "learning_rate": 1.2523936915972992e-05, + "loss": 1.7178, + "step": 37790 + }, + { + "epoch": 1.3438805439516488, + "grad_norm": 1.7345435619354248, + "learning_rate": 1.252014450104876e-05, + "loss": 1.7095, + "step": 37800 + }, + { + "epoch": 1.344236067905075, + "grad_norm": 1.7151758670806885, + "learning_rate": 1.251635169904613e-05, + "loss": 1.6825, + "step": 37810 + }, + { + "epoch": 1.3445915918585014, + "grad_norm": 1.7500386238098145, + "learning_rate": 1.2512558510547658e-05, + "loss": 1.6734, + "step": 37820 + }, + { + "epoch": 1.344947115811928, + "grad_norm": 1.9393870830535889, + "learning_rate": 1.2508764936135956e-05, + "loss": 1.7165, + "step": 37830 + }, + { + "epoch": 1.3453026397653542, + "grad_norm": 1.8957933187484741, + "learning_rate": 1.2504970976393687e-05, + "loss": 1.6754, + "step": 37840 + }, + { + "epoch": 1.3456581637187806, + "grad_norm": 1.8289235830307007, + "learning_rate": 1.2501176631903583e-05, + "loss": 1.6886, + "step": 37850 + }, + { + "epoch": 1.3460136876722069, + "grad_norm": 1.956375241279602, + "learning_rate": 1.2497381903248426e-05, + "loss": 1.68, + "step": 37860 + }, + { + "epoch": 1.3463692116256332, + "grad_norm": 1.7941055297851562, + "learning_rate": 1.2493586791011074e-05, + "loss": 1.6469, + "step": 37870 + }, + { + "epoch": 1.3467247355790597, + "grad_norm": 1.8033486604690552, + "learning_rate": 1.2489791295774422e-05, + "loss": 1.6262, + "step": 37880 + }, + { + "epoch": 1.347080259532486, + "grad_norm": 1.9014389514923096, + "learning_rate": 1.2485995418121441e-05, + "loss": 1.683, + "step": 37890 + }, + { + "epoch": 1.3474357834859123, + "grad_norm": 1.7786731719970703, + "learning_rate": 1.2482199158635149e-05, + "loss": 1.6627, + "step": 37900 + }, + { + "epoch": 1.3477913074393388, + "grad_norm": 1.813820481300354, + "learning_rate": 1.2478402517898632e-05, + "loss": 1.741, + "step": 37910 + }, + { + "epoch": 1.3481468313927651, + "grad_norm": 1.797961711883545, + "learning_rate": 1.2474605496495024e-05, + "loss": 1.7126, + "step": 37920 + }, + { + "epoch": 1.3485023553461915, + "grad_norm": 1.7511776685714722, + "learning_rate": 1.2470808095007535e-05, + "loss": 1.6567, + "step": 37930 + }, + { + "epoch": 1.3488578792996178, + "grad_norm": 1.8130065202713013, + "learning_rate": 1.2467010314019408e-05, + "loss": 1.6329, + "step": 37940 + }, + { + "epoch": 1.349213403253044, + "grad_norm": 1.7048790454864502, + "learning_rate": 1.2463212154113966e-05, + "loss": 1.6743, + "step": 37950 + }, + { + "epoch": 1.3495689272064706, + "grad_norm": 1.7872480154037476, + "learning_rate": 1.245941361587458e-05, + "loss": 1.6686, + "step": 37960 + }, + { + "epoch": 1.349924451159897, + "grad_norm": 1.8660238981246948, + "learning_rate": 1.2455614699884686e-05, + "loss": 1.6878, + "step": 37970 + }, + { + "epoch": 1.3502799751133232, + "grad_norm": 1.946341633796692, + "learning_rate": 1.245181540672777e-05, + "loss": 1.6678, + "step": 37980 + }, + { + "epoch": 1.3506354990667497, + "grad_norm": 1.8169410228729248, + "learning_rate": 1.2448015736987382e-05, + "loss": 1.6439, + "step": 37990 + }, + { + "epoch": 1.350991023020176, + "grad_norm": 1.7743910551071167, + "learning_rate": 1.2444215691247128e-05, + "loss": 1.7119, + "step": 38000 + }, + { + "epoch": 1.3513465469736023, + "grad_norm": 1.8813501596450806, + "learning_rate": 1.2440415270090665e-05, + "loss": 1.7038, + "step": 38010 + }, + { + "epoch": 1.3517020709270287, + "grad_norm": 1.8178991079330444, + "learning_rate": 1.2436614474101719e-05, + "loss": 1.6467, + "step": 38020 + }, + { + "epoch": 1.352057594880455, + "grad_norm": 1.7223140001296997, + "learning_rate": 1.2432813303864067e-05, + "loss": 1.6908, + "step": 38030 + }, + { + "epoch": 1.3524131188338815, + "grad_norm": 1.8696693181991577, + "learning_rate": 1.2429011759961544e-05, + "loss": 1.7457, + "step": 38040 + }, + { + "epoch": 1.3527686427873078, + "grad_norm": 1.8467459678649902, + "learning_rate": 1.242520984297805e-05, + "loss": 1.6485, + "step": 38050 + }, + { + "epoch": 1.353124166740734, + "grad_norm": 1.8918532133102417, + "learning_rate": 1.2421407553497527e-05, + "loss": 1.6325, + "step": 38060 + }, + { + "epoch": 1.3534796906941606, + "grad_norm": 1.857740044593811, + "learning_rate": 1.2417604892103988e-05, + "loss": 1.6461, + "step": 38070 + }, + { + "epoch": 1.353835214647587, + "grad_norm": 1.7795847654342651, + "learning_rate": 1.241380185938149e-05, + "loss": 1.6728, + "step": 38080 + }, + { + "epoch": 1.3541907386010132, + "grad_norm": 1.821782112121582, + "learning_rate": 1.2409998455914167e-05, + "loss": 1.6939, + "step": 38090 + }, + { + "epoch": 1.3545462625544396, + "grad_norm": 1.709168553352356, + "learning_rate": 1.2406194682286188e-05, + "loss": 1.6689, + "step": 38100 + }, + { + "epoch": 1.3549017865078659, + "grad_norm": 1.741292953491211, + "learning_rate": 1.2402390539081796e-05, + "loss": 1.698, + "step": 38110 + }, + { + "epoch": 1.3552573104612924, + "grad_norm": 1.8442113399505615, + "learning_rate": 1.239858602688528e-05, + "loss": 1.7049, + "step": 38120 + }, + { + "epoch": 1.3556128344147187, + "grad_norm": 1.8573895692825317, + "learning_rate": 1.2394781146280987e-05, + "loss": 1.6743, + "step": 38130 + }, + { + "epoch": 1.355968358368145, + "grad_norm": 1.7257417440414429, + "learning_rate": 1.2390975897853329e-05, + "loss": 1.7107, + "step": 38140 + }, + { + "epoch": 1.3563238823215715, + "grad_norm": 1.7973076105117798, + "learning_rate": 1.2387170282186762e-05, + "loss": 1.6723, + "step": 38150 + }, + { + "epoch": 1.3566794062749978, + "grad_norm": 1.8350764513015747, + "learning_rate": 1.238336429986581e-05, + "loss": 1.6761, + "step": 38160 + }, + { + "epoch": 1.3570349302284241, + "grad_norm": 1.8104625940322876, + "learning_rate": 1.2379557951475044e-05, + "loss": 1.7014, + "step": 38170 + }, + { + "epoch": 1.3573904541818504, + "grad_norm": 1.7754307985305786, + "learning_rate": 1.2375751237599096e-05, + "loss": 1.7155, + "step": 38180 + }, + { + "epoch": 1.3577459781352768, + "grad_norm": 1.7645673751831055, + "learning_rate": 1.2371944158822653e-05, + "loss": 1.6503, + "step": 38190 + }, + { + "epoch": 1.3581015020887033, + "grad_norm": 1.8259831666946411, + "learning_rate": 1.2368136715730458e-05, + "loss": 1.7034, + "step": 38200 + }, + { + "epoch": 1.3584570260421296, + "grad_norm": 1.8655147552490234, + "learning_rate": 1.2364328908907314e-05, + "loss": 1.6771, + "step": 38210 + }, + { + "epoch": 1.358812549995556, + "grad_norm": 1.8037327527999878, + "learning_rate": 1.2360520738938075e-05, + "loss": 1.6634, + "step": 38220 + }, + { + "epoch": 1.3591680739489824, + "grad_norm": 1.8114920854568481, + "learning_rate": 1.2356712206407653e-05, + "loss": 1.6914, + "step": 38230 + }, + { + "epoch": 1.3595235979024087, + "grad_norm": 1.8625586032867432, + "learning_rate": 1.2352903311901012e-05, + "loss": 1.6398, + "step": 38240 + }, + { + "epoch": 1.359879121855835, + "grad_norm": 1.8780370950698853, + "learning_rate": 1.2349094056003173e-05, + "loss": 1.6635, + "step": 38250 + }, + { + "epoch": 1.3602346458092613, + "grad_norm": 1.8319125175476074, + "learning_rate": 1.2345284439299215e-05, + "loss": 1.6816, + "step": 38260 + }, + { + "epoch": 1.3605901697626877, + "grad_norm": 1.8563363552093506, + "learning_rate": 1.2341474462374272e-05, + "loss": 1.6866, + "step": 38270 + }, + { + "epoch": 1.3609456937161142, + "grad_norm": 1.749904751777649, + "learning_rate": 1.2337664125813533e-05, + "loss": 1.6335, + "step": 38280 + }, + { + "epoch": 1.3613012176695405, + "grad_norm": 1.7130019664764404, + "learning_rate": 1.2333853430202242e-05, + "loss": 1.7024, + "step": 38290 + }, + { + "epoch": 1.3616567416229668, + "grad_norm": 1.8118706941604614, + "learning_rate": 1.2330042376125699e-05, + "loss": 1.7038, + "step": 38300 + }, + { + "epoch": 1.3620122655763933, + "grad_norm": 1.9396083354949951, + "learning_rate": 1.2326230964169258e-05, + "loss": 1.6388, + "step": 38310 + }, + { + "epoch": 1.3623677895298196, + "grad_norm": 1.8331371545791626, + "learning_rate": 1.2322419194918325e-05, + "loss": 1.6703, + "step": 38320 + }, + { + "epoch": 1.362723313483246, + "grad_norm": 1.7760456800460815, + "learning_rate": 1.2318607068958363e-05, + "loss": 1.7169, + "step": 38330 + }, + { + "epoch": 1.3630788374366722, + "grad_norm": 1.9254004955291748, + "learning_rate": 1.2314794586874893e-05, + "loss": 1.6992, + "step": 38340 + }, + { + "epoch": 1.3634343613900985, + "grad_norm": 1.8271830081939697, + "learning_rate": 1.2310981749253489e-05, + "loss": 1.6943, + "step": 38350 + }, + { + "epoch": 1.363789885343525, + "grad_norm": 1.9219883680343628, + "learning_rate": 1.2307168556679782e-05, + "loss": 1.6783, + "step": 38360 + }, + { + "epoch": 1.3641454092969514, + "grad_norm": 1.7999000549316406, + "learning_rate": 1.2303355009739447e-05, + "loss": 1.7166, + "step": 38370 + }, + { + "epoch": 1.3645009332503777, + "grad_norm": 1.9395989179611206, + "learning_rate": 1.2299541109018224e-05, + "loss": 1.6232, + "step": 38380 + }, + { + "epoch": 1.3648564572038042, + "grad_norm": 1.7778639793395996, + "learning_rate": 1.2295726855101911e-05, + "loss": 1.6502, + "step": 38390 + }, + { + "epoch": 1.3652119811572305, + "grad_norm": 1.763012170791626, + "learning_rate": 1.2291912248576341e-05, + "loss": 1.6677, + "step": 38400 + }, + { + "epoch": 1.3655675051106568, + "grad_norm": 1.800807237625122, + "learning_rate": 1.228809729002742e-05, + "loss": 1.6758, + "step": 38410 + }, + { + "epoch": 1.3659230290640831, + "grad_norm": 1.9781922101974487, + "learning_rate": 1.2284281980041105e-05, + "loss": 1.6432, + "step": 38420 + }, + { + "epoch": 1.3662785530175094, + "grad_norm": 1.842842698097229, + "learning_rate": 1.22804663192034e-05, + "loss": 1.6869, + "step": 38430 + }, + { + "epoch": 1.366634076970936, + "grad_norm": 1.7902802228927612, + "learning_rate": 1.2276650308100364e-05, + "loss": 1.6863, + "step": 38440 + }, + { + "epoch": 1.3669896009243623, + "grad_norm": 1.973473072052002, + "learning_rate": 1.2272833947318117e-05, + "loss": 1.6542, + "step": 38450 + }, + { + "epoch": 1.3673451248777886, + "grad_norm": 1.8320331573486328, + "learning_rate": 1.2269017237442826e-05, + "loss": 1.696, + "step": 38460 + }, + { + "epoch": 1.3677006488312151, + "grad_norm": 1.817253589630127, + "learning_rate": 1.2265200179060716e-05, + "loss": 1.6416, + "step": 38470 + }, + { + "epoch": 1.3680561727846414, + "grad_norm": 1.9107543230056763, + "learning_rate": 1.2261382772758061e-05, + "loss": 1.6853, + "step": 38480 + }, + { + "epoch": 1.3684116967380677, + "grad_norm": 1.8446093797683716, + "learning_rate": 1.2257565019121191e-05, + "loss": 1.6747, + "step": 38490 + }, + { + "epoch": 1.368767220691494, + "grad_norm": 1.741042971611023, + "learning_rate": 1.2253746918736489e-05, + "loss": 1.6507, + "step": 38500 + }, + { + "epoch": 1.3691227446449203, + "grad_norm": 1.7928935289382935, + "learning_rate": 1.2249928472190391e-05, + "loss": 1.6964, + "step": 38510 + }, + { + "epoch": 1.3694782685983469, + "grad_norm": 1.8920574188232422, + "learning_rate": 1.2246109680069385e-05, + "loss": 1.6542, + "step": 38520 + }, + { + "epoch": 1.3698337925517732, + "grad_norm": 1.88433039188385, + "learning_rate": 1.2242290542960017e-05, + "loss": 1.6774, + "step": 38530 + }, + { + "epoch": 1.3701893165051995, + "grad_norm": 1.817080020904541, + "learning_rate": 1.2238471061448881e-05, + "loss": 1.6833, + "step": 38540 + }, + { + "epoch": 1.370544840458626, + "grad_norm": 1.862455129623413, + "learning_rate": 1.2234651236122627e-05, + "loss": 1.6817, + "step": 38550 + }, + { + "epoch": 1.3709003644120523, + "grad_norm": 1.7352641820907593, + "learning_rate": 1.2230831067567955e-05, + "loss": 1.7034, + "step": 38560 + }, + { + "epoch": 1.3712558883654786, + "grad_norm": 1.7810717821121216, + "learning_rate": 1.2227010556371615e-05, + "loss": 1.6744, + "step": 38570 + }, + { + "epoch": 1.371611412318905, + "grad_norm": 1.6984426975250244, + "learning_rate": 1.2223189703120416e-05, + "loss": 1.6574, + "step": 38580 + }, + { + "epoch": 1.3719669362723312, + "grad_norm": 1.8884998559951782, + "learning_rate": 1.221936850840122e-05, + "loss": 1.6797, + "step": 38590 + }, + { + "epoch": 1.3723224602257578, + "grad_norm": 1.8598767518997192, + "learning_rate": 1.2215546972800937e-05, + "loss": 1.6687, + "step": 38600 + }, + { + "epoch": 1.372677984179184, + "grad_norm": 1.7620718479156494, + "learning_rate": 1.2211725096906533e-05, + "loss": 1.6711, + "step": 38610 + }, + { + "epoch": 1.3730335081326104, + "grad_norm": 1.8488720655441284, + "learning_rate": 1.2207902881305018e-05, + "loss": 1.6975, + "step": 38620 + }, + { + "epoch": 1.373389032086037, + "grad_norm": 1.7783499956130981, + "learning_rate": 1.2204080326583467e-05, + "loss": 1.6539, + "step": 38630 + }, + { + "epoch": 1.3737445560394632, + "grad_norm": 1.9365428686141968, + "learning_rate": 1.2200257433328994e-05, + "loss": 1.674, + "step": 38640 + }, + { + "epoch": 1.3741000799928895, + "grad_norm": 1.7568024396896362, + "learning_rate": 1.2196434202128777e-05, + "loss": 1.6626, + "step": 38650 + }, + { + "epoch": 1.3744556039463158, + "grad_norm": 1.9768433570861816, + "learning_rate": 1.219261063357004e-05, + "loss": 1.6946, + "step": 38660 + }, + { + "epoch": 1.3748111278997421, + "grad_norm": 1.7951604127883911, + "learning_rate": 1.2188786728240057e-05, + "loss": 1.6704, + "step": 38670 + }, + { + "epoch": 1.3751666518531687, + "grad_norm": 1.8011988401412964, + "learning_rate": 1.2184962486726154e-05, + "loss": 1.6936, + "step": 38680 + }, + { + "epoch": 1.375522175806595, + "grad_norm": 1.8213082551956177, + "learning_rate": 1.2181137909615713e-05, + "loss": 1.6814, + "step": 38690 + }, + { + "epoch": 1.3758776997600213, + "grad_norm": 1.7896612882614136, + "learning_rate": 1.2177312997496164e-05, + "loss": 1.6568, + "step": 38700 + }, + { + "epoch": 1.3762332237134478, + "grad_norm": 1.6915911436080933, + "learning_rate": 1.2173487750954993e-05, + "loss": 1.6616, + "step": 38710 + }, + { + "epoch": 1.3765887476668741, + "grad_norm": 1.857466459274292, + "learning_rate": 1.2169662170579733e-05, + "loss": 1.6526, + "step": 38720 + }, + { + "epoch": 1.3769442716203004, + "grad_norm": 1.732410192489624, + "learning_rate": 1.2165836256957963e-05, + "loss": 1.6895, + "step": 38730 + }, + { + "epoch": 1.3772997955737267, + "grad_norm": 1.8987367153167725, + "learning_rate": 1.2162010010677327e-05, + "loss": 1.6618, + "step": 38740 + }, + { + "epoch": 1.377655319527153, + "grad_norm": 1.9594670534133911, + "learning_rate": 1.2158183432325508e-05, + "loss": 1.6631, + "step": 38750 + }, + { + "epoch": 1.3780108434805796, + "grad_norm": 1.8008707761764526, + "learning_rate": 1.2154356522490245e-05, + "loss": 1.6483, + "step": 38760 + }, + { + "epoch": 1.3783663674340059, + "grad_norm": 1.852552890777588, + "learning_rate": 1.215052928175933e-05, + "loss": 1.6651, + "step": 38770 + }, + { + "epoch": 1.3787218913874322, + "grad_norm": 1.6587566137313843, + "learning_rate": 1.2146701710720599e-05, + "loss": 1.6777, + "step": 38780 + }, + { + "epoch": 1.3790774153408587, + "grad_norm": 1.7821872234344482, + "learning_rate": 1.2142873809961945e-05, + "loss": 1.7093, + "step": 38790 + }, + { + "epoch": 1.379432939294285, + "grad_norm": 1.9138959646224976, + "learning_rate": 1.2139045580071313e-05, + "loss": 1.6829, + "step": 38800 + }, + { + "epoch": 1.3797884632477113, + "grad_norm": 2.0115551948547363, + "learning_rate": 1.2135217021636691e-05, + "loss": 1.7011, + "step": 38810 + }, + { + "epoch": 1.3801439872011376, + "grad_norm": 1.8681602478027344, + "learning_rate": 1.2131388135246121e-05, + "loss": 1.6386, + "step": 38820 + }, + { + "epoch": 1.380499511154564, + "grad_norm": 1.9055702686309814, + "learning_rate": 1.2127558921487696e-05, + "loss": 1.6843, + "step": 38830 + }, + { + "epoch": 1.3808550351079905, + "grad_norm": 1.8282355070114136, + "learning_rate": 1.2123729380949563e-05, + "loss": 1.6799, + "step": 38840 + }, + { + "epoch": 1.3812105590614168, + "grad_norm": 1.9232532978057861, + "learning_rate": 1.2119899514219912e-05, + "loss": 1.6534, + "step": 38850 + }, + { + "epoch": 1.381566083014843, + "grad_norm": 1.777154564857483, + "learning_rate": 1.2116069321886987e-05, + "loss": 1.6606, + "step": 38860 + }, + { + "epoch": 1.3819216069682696, + "grad_norm": 1.9864416122436523, + "learning_rate": 1.2112238804539084e-05, + "loss": 1.6887, + "step": 38870 + }, + { + "epoch": 1.382277130921696, + "grad_norm": 1.8537176847457886, + "learning_rate": 1.2108407962764543e-05, + "loss": 1.6775, + "step": 38880 + }, + { + "epoch": 1.3826326548751222, + "grad_norm": 1.8541337251663208, + "learning_rate": 1.2104576797151758e-05, + "loss": 1.6959, + "step": 38890 + }, + { + "epoch": 1.3829881788285485, + "grad_norm": 1.7702661752700806, + "learning_rate": 1.2100745308289175e-05, + "loss": 1.6894, + "step": 38900 + }, + { + "epoch": 1.3833437027819748, + "grad_norm": 1.857193112373352, + "learning_rate": 1.209691349676528e-05, + "loss": 1.6761, + "step": 38910 + }, + { + "epoch": 1.3836992267354014, + "grad_norm": 1.9033740758895874, + "learning_rate": 1.2093081363168625e-05, + "loss": 1.6652, + "step": 38920 + }, + { + "epoch": 1.3840547506888277, + "grad_norm": 1.838472843170166, + "learning_rate": 1.2089248908087794e-05, + "loss": 1.6514, + "step": 38930 + }, + { + "epoch": 1.384410274642254, + "grad_norm": 1.7907769680023193, + "learning_rate": 1.2085416132111429e-05, + "loss": 1.6835, + "step": 38940 + }, + { + "epoch": 1.3847657985956805, + "grad_norm": 1.8862502574920654, + "learning_rate": 1.2081583035828226e-05, + "loss": 1.6456, + "step": 38950 + }, + { + "epoch": 1.3851213225491068, + "grad_norm": 1.8391231298446655, + "learning_rate": 1.2077749619826915e-05, + "loss": 1.6661, + "step": 38960 + }, + { + "epoch": 1.385476846502533, + "grad_norm": 1.8332085609436035, + "learning_rate": 1.2073915884696292e-05, + "loss": 1.6714, + "step": 38970 + }, + { + "epoch": 1.3858323704559594, + "grad_norm": 1.946373701095581, + "learning_rate": 1.2070081831025195e-05, + "loss": 1.6939, + "step": 38980 + }, + { + "epoch": 1.3861878944093857, + "grad_norm": 1.7250920534133911, + "learning_rate": 1.2066247459402507e-05, + "loss": 1.6818, + "step": 38990 + }, + { + "epoch": 1.3865434183628123, + "grad_norm": 1.8862600326538086, + "learning_rate": 1.2062412770417161e-05, + "loss": 1.7218, + "step": 39000 + }, + { + "epoch": 1.3868989423162386, + "grad_norm": 1.7957911491394043, + "learning_rate": 1.2058577764658148e-05, + "loss": 1.6923, + "step": 39010 + }, + { + "epoch": 1.3872544662696649, + "grad_norm": 2.0033061504364014, + "learning_rate": 1.2054742442714497e-05, + "loss": 1.699, + "step": 39020 + }, + { + "epoch": 1.3876099902230914, + "grad_norm": 1.6367038488388062, + "learning_rate": 1.2050906805175293e-05, + "loss": 1.7128, + "step": 39030 + }, + { + "epoch": 1.3879655141765177, + "grad_norm": 1.8533066511154175, + "learning_rate": 1.2047070852629661e-05, + "loss": 1.7222, + "step": 39040 + }, + { + "epoch": 1.388321038129944, + "grad_norm": 1.8927814960479736, + "learning_rate": 1.2043234585666782e-05, + "loss": 1.6477, + "step": 39050 + }, + { + "epoch": 1.3886765620833703, + "grad_norm": 1.680945634841919, + "learning_rate": 1.2039398004875882e-05, + "loss": 1.7053, + "step": 39060 + }, + { + "epoch": 1.3890320860367966, + "grad_norm": 1.9230496883392334, + "learning_rate": 1.2035561110846232e-05, + "loss": 1.6782, + "step": 39070 + }, + { + "epoch": 1.3893876099902231, + "grad_norm": 1.8632361888885498, + "learning_rate": 1.2031723904167161e-05, + "loss": 1.6711, + "step": 39080 + }, + { + "epoch": 1.3897431339436495, + "grad_norm": 1.8879597187042236, + "learning_rate": 1.2027886385428035e-05, + "loss": 1.6603, + "step": 39090 + }, + { + "epoch": 1.3900986578970758, + "grad_norm": 1.8897348642349243, + "learning_rate": 1.2024048555218283e-05, + "loss": 1.6896, + "step": 39100 + }, + { + "epoch": 1.3904541818505023, + "grad_norm": 1.9044712781906128, + "learning_rate": 1.2020210414127359e-05, + "loss": 1.7011, + "step": 39110 + }, + { + "epoch": 1.3908097058039286, + "grad_norm": 1.814377784729004, + "learning_rate": 1.201637196274478e-05, + "loss": 1.6547, + "step": 39120 + }, + { + "epoch": 1.391165229757355, + "grad_norm": 1.9063407182693481, + "learning_rate": 1.201253320166011e-05, + "loss": 1.6876, + "step": 39130 + }, + { + "epoch": 1.3915207537107812, + "grad_norm": 2.003798007965088, + "learning_rate": 1.2008694131462962e-05, + "loss": 1.686, + "step": 39140 + }, + { + "epoch": 1.3918762776642075, + "grad_norm": 1.863899827003479, + "learning_rate": 1.2004854752742988e-05, + "loss": 1.6531, + "step": 39150 + }, + { + "epoch": 1.392231801617634, + "grad_norm": 1.9100146293640137, + "learning_rate": 1.2001015066089893e-05, + "loss": 1.6482, + "step": 39160 + }, + { + "epoch": 1.3925873255710604, + "grad_norm": 1.8719366788864136, + "learning_rate": 1.1997175072093435e-05, + "loss": 1.6845, + "step": 39170 + }, + { + "epoch": 1.3929428495244867, + "grad_norm": 1.7857741117477417, + "learning_rate": 1.1993334771343405e-05, + "loss": 1.6882, + "step": 39180 + }, + { + "epoch": 1.3932983734779132, + "grad_norm": 1.866883635520935, + "learning_rate": 1.1989494164429654e-05, + "loss": 1.6766, + "step": 39190 + }, + { + "epoch": 1.3936538974313395, + "grad_norm": 1.9655028581619263, + "learning_rate": 1.1985653251942074e-05, + "loss": 1.6653, + "step": 39200 + }, + { + "epoch": 1.3940094213847658, + "grad_norm": 1.7872508764266968, + "learning_rate": 1.1981812034470601e-05, + "loss": 1.6846, + "step": 39210 + }, + { + "epoch": 1.394364945338192, + "grad_norm": 1.8483127355575562, + "learning_rate": 1.1977970512605228e-05, + "loss": 1.6923, + "step": 39220 + }, + { + "epoch": 1.3947204692916184, + "grad_norm": 1.9430288076400757, + "learning_rate": 1.1974128686935988e-05, + "loss": 1.6806, + "step": 39230 + }, + { + "epoch": 1.395075993245045, + "grad_norm": 1.7674256563186646, + "learning_rate": 1.1970286558052957e-05, + "loss": 1.6693, + "step": 39240 + }, + { + "epoch": 1.3954315171984712, + "grad_norm": 1.8529326915740967, + "learning_rate": 1.1966444126546263e-05, + "loss": 1.6688, + "step": 39250 + }, + { + "epoch": 1.3957870411518976, + "grad_norm": 1.9361050128936768, + "learning_rate": 1.1962601393006083e-05, + "loss": 1.6815, + "step": 39260 + }, + { + "epoch": 1.396142565105324, + "grad_norm": 1.8823657035827637, + "learning_rate": 1.1958758358022637e-05, + "loss": 1.6776, + "step": 39270 + }, + { + "epoch": 1.3964980890587504, + "grad_norm": 1.6542255878448486, + "learning_rate": 1.1954915022186187e-05, + "loss": 1.6878, + "step": 39280 + }, + { + "epoch": 1.3968536130121767, + "grad_norm": 1.7041078805923462, + "learning_rate": 1.1951071386087047e-05, + "loss": 1.6859, + "step": 39290 + }, + { + "epoch": 1.397209136965603, + "grad_norm": 1.8247658014297485, + "learning_rate": 1.1947227450315575e-05, + "loss": 1.6819, + "step": 39300 + }, + { + "epoch": 1.3975646609190293, + "grad_norm": 1.8667571544647217, + "learning_rate": 1.1943383215462175e-05, + "loss": 1.6375, + "step": 39310 + }, + { + "epoch": 1.3979201848724558, + "grad_norm": 1.8258676528930664, + "learning_rate": 1.1939538682117298e-05, + "loss": 1.6931, + "step": 39320 + }, + { + "epoch": 1.3982757088258821, + "grad_norm": 1.7988314628601074, + "learning_rate": 1.1935693850871442e-05, + "loss": 1.6444, + "step": 39330 + }, + { + "epoch": 1.3986312327793085, + "grad_norm": 1.7440369129180908, + "learning_rate": 1.1931848722315145e-05, + "loss": 1.7064, + "step": 39340 + }, + { + "epoch": 1.398986756732735, + "grad_norm": 1.827201247215271, + "learning_rate": 1.1928003297039001e-05, + "loss": 1.7128, + "step": 39350 + }, + { + "epoch": 1.3993422806861613, + "grad_norm": 1.7500665187835693, + "learning_rate": 1.1924157575633639e-05, + "loss": 1.6726, + "step": 39360 + }, + { + "epoch": 1.3996978046395876, + "grad_norm": 1.8692196607589722, + "learning_rate": 1.1920311558689734e-05, + "loss": 1.6694, + "step": 39370 + }, + { + "epoch": 1.400053328593014, + "grad_norm": 1.973334789276123, + "learning_rate": 1.1916465246798017e-05, + "loss": 1.6643, + "step": 39380 + }, + { + "epoch": 1.4004088525464402, + "grad_norm": 1.8776823282241821, + "learning_rate": 1.1912618640549252e-05, + "loss": 1.6996, + "step": 39390 + }, + { + "epoch": 1.4007643764998667, + "grad_norm": 1.7455120086669922, + "learning_rate": 1.1908771740534257e-05, + "loss": 1.6534, + "step": 39400 + }, + { + "epoch": 1.401119900453293, + "grad_norm": 1.9548767805099487, + "learning_rate": 1.1904924547343892e-05, + "loss": 1.6756, + "step": 39410 + }, + { + "epoch": 1.4014754244067194, + "grad_norm": 1.876504898071289, + "learning_rate": 1.190107706156906e-05, + "loss": 1.6896, + "step": 39420 + }, + { + "epoch": 1.4018309483601459, + "grad_norm": 1.7416571378707886, + "learning_rate": 1.1897229283800713e-05, + "loss": 1.6689, + "step": 39430 + }, + { + "epoch": 1.4021864723135722, + "grad_norm": 1.8590924739837646, + "learning_rate": 1.189338121462984e-05, + "loss": 1.7051, + "step": 39440 + }, + { + "epoch": 1.4025419962669985, + "grad_norm": 1.8786284923553467, + "learning_rate": 1.1889532854647485e-05, + "loss": 1.6731, + "step": 39450 + }, + { + "epoch": 1.4028975202204248, + "grad_norm": 1.8990617990493774, + "learning_rate": 1.1885684204444732e-05, + "loss": 1.7025, + "step": 39460 + }, + { + "epoch": 1.403253044173851, + "grad_norm": 1.7606281042099, + "learning_rate": 1.1881835264612706e-05, + "loss": 1.6887, + "step": 39470 + }, + { + "epoch": 1.4036085681272776, + "grad_norm": 1.8033561706542969, + "learning_rate": 1.1877986035742589e-05, + "loss": 1.6736, + "step": 39480 + }, + { + "epoch": 1.403964092080704, + "grad_norm": 1.7199445962905884, + "learning_rate": 1.1874136518425586e-05, + "loss": 1.6996, + "step": 39490 + }, + { + "epoch": 1.4043196160341302, + "grad_norm": 1.8428937196731567, + "learning_rate": 1.1870286713252966e-05, + "loss": 1.6787, + "step": 39500 + }, + { + "epoch": 1.4046751399875568, + "grad_norm": 1.9001675844192505, + "learning_rate": 1.1866436620816035e-05, + "loss": 1.6595, + "step": 39510 + }, + { + "epoch": 1.405030663940983, + "grad_norm": 1.7418138980865479, + "learning_rate": 1.186258624170614e-05, + "loss": 1.6787, + "step": 39520 + }, + { + "epoch": 1.4053861878944094, + "grad_norm": 1.915069580078125, + "learning_rate": 1.1858735576514677e-05, + "loss": 1.6626, + "step": 39530 + }, + { + "epoch": 1.4057417118478357, + "grad_norm": 1.9052213430404663, + "learning_rate": 1.1854884625833085e-05, + "loss": 1.6565, + "step": 39540 + }, + { + "epoch": 1.406097235801262, + "grad_norm": 1.9232211112976074, + "learning_rate": 1.1851033390252843e-05, + "loss": 1.7102, + "step": 39550 + }, + { + "epoch": 1.4064527597546885, + "grad_norm": 1.8242404460906982, + "learning_rate": 1.184718187036548e-05, + "loss": 1.6517, + "step": 39560 + }, + { + "epoch": 1.4068082837081148, + "grad_norm": 1.757689356803894, + "learning_rate": 1.1843330066762562e-05, + "loss": 1.6502, + "step": 39570 + }, + { + "epoch": 1.4071638076615411, + "grad_norm": 1.8252004384994507, + "learning_rate": 1.1839477980035705e-05, + "loss": 1.7388, + "step": 39580 + }, + { + "epoch": 1.4075193316149677, + "grad_norm": 1.9856147766113281, + "learning_rate": 1.1835625610776565e-05, + "loss": 1.7049, + "step": 39590 + }, + { + "epoch": 1.407874855568394, + "grad_norm": 1.8483895063400269, + "learning_rate": 1.1831772959576839e-05, + "loss": 1.6785, + "step": 39600 + }, + { + "epoch": 1.4082303795218203, + "grad_norm": 1.8824540376663208, + "learning_rate": 1.1827920027028273e-05, + "loss": 1.6775, + "step": 39610 + }, + { + "epoch": 1.4085859034752466, + "grad_norm": 1.814045786857605, + "learning_rate": 1.182406681372265e-05, + "loss": 1.6556, + "step": 39620 + }, + { + "epoch": 1.408941427428673, + "grad_norm": 1.9120724201202393, + "learning_rate": 1.1820213320251802e-05, + "loss": 1.6917, + "step": 39630 + }, + { + "epoch": 1.4092969513820994, + "grad_norm": 1.7687956094741821, + "learning_rate": 1.18163595472076e-05, + "loss": 1.6225, + "step": 39640 + }, + { + "epoch": 1.4096524753355257, + "grad_norm": 1.8206088542938232, + "learning_rate": 1.181250549518196e-05, + "loss": 1.69, + "step": 39650 + }, + { + "epoch": 1.410007999288952, + "grad_norm": 1.9125192165374756, + "learning_rate": 1.1808651164766843e-05, + "loss": 1.6492, + "step": 39660 + }, + { + "epoch": 1.4103635232423786, + "grad_norm": 1.8194414377212524, + "learning_rate": 1.1804796556554248e-05, + "loss": 1.6986, + "step": 39670 + }, + { + "epoch": 1.4107190471958049, + "grad_norm": 2.033812999725342, + "learning_rate": 1.1800941671136215e-05, + "loss": 1.6855, + "step": 39680 + }, + { + "epoch": 1.4110745711492312, + "grad_norm": 1.7757148742675781, + "learning_rate": 1.1797086509104834e-05, + "loss": 1.6576, + "step": 39690 + }, + { + "epoch": 1.4114300951026575, + "grad_norm": 1.9182121753692627, + "learning_rate": 1.1793231071052233e-05, + "loss": 1.6182, + "step": 39700 + }, + { + "epoch": 1.4117856190560838, + "grad_norm": 1.8637202978134155, + "learning_rate": 1.1789375357570582e-05, + "loss": 1.6454, + "step": 39710 + }, + { + "epoch": 1.4121411430095103, + "grad_norm": 1.7202696800231934, + "learning_rate": 1.17855193692521e-05, + "loss": 1.7092, + "step": 39720 + }, + { + "epoch": 1.4124966669629366, + "grad_norm": 1.7852152585983276, + "learning_rate": 1.1781663106689034e-05, + "loss": 1.6899, + "step": 39730 + }, + { + "epoch": 1.412852190916363, + "grad_norm": 1.73618745803833, + "learning_rate": 1.1777806570473687e-05, + "loss": 1.6783, + "step": 39740 + }, + { + "epoch": 1.4132077148697895, + "grad_norm": 1.7458254098892212, + "learning_rate": 1.17739497611984e-05, + "loss": 1.6808, + "step": 39750 + }, + { + "epoch": 1.4135632388232158, + "grad_norm": 1.760912299156189, + "learning_rate": 1.177009267945555e-05, + "loss": 1.648, + "step": 39760 + }, + { + "epoch": 1.413918762776642, + "grad_norm": 1.7297006845474243, + "learning_rate": 1.1766235325837563e-05, + "loss": 1.6661, + "step": 39770 + }, + { + "epoch": 1.4142742867300684, + "grad_norm": 1.8834292888641357, + "learning_rate": 1.1762377700936903e-05, + "loss": 1.6917, + "step": 39780 + }, + { + "epoch": 1.4146298106834947, + "grad_norm": 1.8323746919631958, + "learning_rate": 1.1758519805346083e-05, + "loss": 1.6754, + "step": 39790 + }, + { + "epoch": 1.4149853346369212, + "grad_norm": 1.8268206119537354, + "learning_rate": 1.1754661639657643e-05, + "loss": 1.6941, + "step": 39800 + }, + { + "epoch": 1.4153408585903475, + "grad_norm": 1.8546395301818848, + "learning_rate": 1.1750803204464176e-05, + "loss": 1.7018, + "step": 39810 + }, + { + "epoch": 1.4156963825437738, + "grad_norm": 1.7926831245422363, + "learning_rate": 1.1746944500358316e-05, + "loss": 1.6539, + "step": 39820 + }, + { + "epoch": 1.4160519064972004, + "grad_norm": 1.8957622051239014, + "learning_rate": 1.1743085527932736e-05, + "loss": 1.7029, + "step": 39830 + }, + { + "epoch": 1.4164074304506267, + "grad_norm": 1.705998420715332, + "learning_rate": 1.1739226287780146e-05, + "loss": 1.6544, + "step": 39840 + }, + { + "epoch": 1.416762954404053, + "grad_norm": 1.7661248445510864, + "learning_rate": 1.1735366780493305e-05, + "loss": 1.6653, + "step": 39850 + }, + { + "epoch": 1.4171184783574793, + "grad_norm": 1.7905749082565308, + "learning_rate": 1.1731507006665006e-05, + "loss": 1.6604, + "step": 39860 + }, + { + "epoch": 1.4174740023109056, + "grad_norm": 1.7992613315582275, + "learning_rate": 1.1727646966888086e-05, + "loss": 1.6612, + "step": 39870 + }, + { + "epoch": 1.4178295262643321, + "grad_norm": 1.849700689315796, + "learning_rate": 1.1723786661755428e-05, + "loss": 1.6686, + "step": 39880 + }, + { + "epoch": 1.4181850502177584, + "grad_norm": 1.9370216131210327, + "learning_rate": 1.1719926091859943e-05, + "loss": 1.6644, + "step": 39890 + }, + { + "epoch": 1.4185405741711847, + "grad_norm": 1.8564960956573486, + "learning_rate": 1.1716065257794595e-05, + "loss": 1.6729, + "step": 39900 + }, + { + "epoch": 1.4188960981246113, + "grad_norm": 1.7115602493286133, + "learning_rate": 1.1712204160152387e-05, + "loss": 1.6881, + "step": 39910 + }, + { + "epoch": 1.4192516220780376, + "grad_norm": 1.9424142837524414, + "learning_rate": 1.1708342799526355e-05, + "loss": 1.6612, + "step": 39920 + }, + { + "epoch": 1.4196071460314639, + "grad_norm": 1.8880399465560913, + "learning_rate": 1.1704481176509577e-05, + "loss": 1.69, + "step": 39930 + }, + { + "epoch": 1.4199626699848902, + "grad_norm": 1.8421772718429565, + "learning_rate": 1.1700619291695179e-05, + "loss": 1.6468, + "step": 39940 + }, + { + "epoch": 1.4203181939383165, + "grad_norm": 1.8074126243591309, + "learning_rate": 1.1696757145676318e-05, + "loss": 1.6916, + "step": 39950 + }, + { + "epoch": 1.420673717891743, + "grad_norm": 1.7604447603225708, + "learning_rate": 1.16928947390462e-05, + "loss": 1.6848, + "step": 39960 + }, + { + "epoch": 1.4210292418451693, + "grad_norm": 1.9991328716278076, + "learning_rate": 1.1689032072398068e-05, + "loss": 1.7153, + "step": 39970 + }, + { + "epoch": 1.4213847657985956, + "grad_norm": 1.9345002174377441, + "learning_rate": 1.1685169146325197e-05, + "loss": 1.6902, + "step": 39980 + }, + { + "epoch": 1.4217402897520222, + "grad_norm": 1.8295515775680542, + "learning_rate": 1.1681305961420915e-05, + "loss": 1.6904, + "step": 39990 + }, + { + "epoch": 1.4220958137054485, + "grad_norm": 1.9185062646865845, + "learning_rate": 1.1677442518278575e-05, + "loss": 1.6831, + "step": 40000 + }, + { + "epoch": 1.4224513376588748, + "grad_norm": 1.8064296245574951, + "learning_rate": 1.1673578817491582e-05, + "loss": 1.677, + "step": 40010 + }, + { + "epoch": 1.422806861612301, + "grad_norm": 1.8515894412994385, + "learning_rate": 1.1669714859653377e-05, + "loss": 1.6941, + "step": 40020 + }, + { + "epoch": 1.4231623855657274, + "grad_norm": 1.7757716178894043, + "learning_rate": 1.166585064535744e-05, + "loss": 1.6463, + "step": 40030 + }, + { + "epoch": 1.423517909519154, + "grad_norm": 1.834594488143921, + "learning_rate": 1.1661986175197284e-05, + "loss": 1.7113, + "step": 40040 + }, + { + "epoch": 1.4238734334725802, + "grad_norm": 1.984318733215332, + "learning_rate": 1.1658121449766475e-05, + "loss": 1.6948, + "step": 40050 + }, + { + "epoch": 1.4242289574260065, + "grad_norm": 1.7884846925735474, + "learning_rate": 1.165425646965861e-05, + "loss": 1.6818, + "step": 40060 + }, + { + "epoch": 1.424584481379433, + "grad_norm": 1.8413163423538208, + "learning_rate": 1.1650391235467322e-05, + "loss": 1.6688, + "step": 40070 + }, + { + "epoch": 1.4249400053328594, + "grad_norm": 2.025177478790283, + "learning_rate": 1.1646525747786288e-05, + "loss": 1.6483, + "step": 40080 + }, + { + "epoch": 1.4252955292862857, + "grad_norm": 1.9283874034881592, + "learning_rate": 1.1642660007209221e-05, + "loss": 1.6825, + "step": 40090 + }, + { + "epoch": 1.425651053239712, + "grad_norm": 1.8084274530410767, + "learning_rate": 1.1638794014329881e-05, + "loss": 1.6637, + "step": 40100 + }, + { + "epoch": 1.4260065771931383, + "grad_norm": 2.027099609375, + "learning_rate": 1.1634927769742053e-05, + "loss": 1.6782, + "step": 40110 + }, + { + "epoch": 1.4263621011465648, + "grad_norm": 1.707337498664856, + "learning_rate": 1.163106127403957e-05, + "loss": 1.6334, + "step": 40120 + }, + { + "epoch": 1.4267176250999911, + "grad_norm": 1.9192252159118652, + "learning_rate": 1.1627194527816304e-05, + "loss": 1.6684, + "step": 40130 + }, + { + "epoch": 1.4270731490534174, + "grad_norm": 1.83455491065979, + "learning_rate": 1.1623327531666157e-05, + "loss": 1.6631, + "step": 40140 + }, + { + "epoch": 1.427428673006844, + "grad_norm": 1.937157392501831, + "learning_rate": 1.1619460286183087e-05, + "loss": 1.6805, + "step": 40150 + }, + { + "epoch": 1.4277841969602703, + "grad_norm": 1.799842357635498, + "learning_rate": 1.1615592791961068e-05, + "loss": 1.6486, + "step": 40160 + }, + { + "epoch": 1.4281397209136966, + "grad_norm": 1.7772995233535767, + "learning_rate": 1.1611725049594122e-05, + "loss": 1.6559, + "step": 40170 + }, + { + "epoch": 1.4284952448671229, + "grad_norm": 1.8416128158569336, + "learning_rate": 1.1607857059676317e-05, + "loss": 1.6281, + "step": 40180 + }, + { + "epoch": 1.4288507688205492, + "grad_norm": 1.7117491960525513, + "learning_rate": 1.1603988822801749e-05, + "loss": 1.699, + "step": 40190 + }, + { + "epoch": 1.4292062927739757, + "grad_norm": 1.7336164712905884, + "learning_rate": 1.1600120339564554e-05, + "loss": 1.6589, + "step": 40200 + }, + { + "epoch": 1.429561816727402, + "grad_norm": 1.8139528036117554, + "learning_rate": 1.1596251610558906e-05, + "loss": 1.6789, + "step": 40210 + }, + { + "epoch": 1.4299173406808283, + "grad_norm": 1.9444037675857544, + "learning_rate": 1.1592382636379025e-05, + "loss": 1.6659, + "step": 40220 + }, + { + "epoch": 1.4302728646342548, + "grad_norm": 1.7962716817855835, + "learning_rate": 1.1588513417619152e-05, + "loss": 1.6685, + "step": 40230 + }, + { + "epoch": 1.4306283885876812, + "grad_norm": 1.8451653718948364, + "learning_rate": 1.1584643954873577e-05, + "loss": 1.6623, + "step": 40240 + }, + { + "epoch": 1.4309839125411075, + "grad_norm": 1.8435050249099731, + "learning_rate": 1.1580774248736629e-05, + "loss": 1.7194, + "step": 40250 + }, + { + "epoch": 1.4313394364945338, + "grad_norm": 1.7766956090927124, + "learning_rate": 1.1576904299802665e-05, + "loss": 1.6877, + "step": 40260 + }, + { + "epoch": 1.43169496044796, + "grad_norm": 1.8470335006713867, + "learning_rate": 1.1573034108666088e-05, + "loss": 1.7135, + "step": 40270 + }, + { + "epoch": 1.4320504844013866, + "grad_norm": 2.0463173389434814, + "learning_rate": 1.1569163675921338e-05, + "loss": 1.6935, + "step": 40280 + }, + { + "epoch": 1.432406008354813, + "grad_norm": 1.9131762981414795, + "learning_rate": 1.1565293002162883e-05, + "loss": 1.6545, + "step": 40290 + }, + { + "epoch": 1.4327615323082392, + "grad_norm": 1.7757117748260498, + "learning_rate": 1.1561422087985237e-05, + "loss": 1.6937, + "step": 40300 + }, + { + "epoch": 1.4331170562616657, + "grad_norm": 1.7911549806594849, + "learning_rate": 1.155755093398295e-05, + "loss": 1.6911, + "step": 40310 + }, + { + "epoch": 1.433472580215092, + "grad_norm": 1.7745696306228638, + "learning_rate": 1.1553679540750606e-05, + "loss": 1.6788, + "step": 40320 + }, + { + "epoch": 1.4338281041685184, + "grad_norm": 1.824103593826294, + "learning_rate": 1.1549807908882827e-05, + "loss": 1.6678, + "step": 40330 + }, + { + "epoch": 1.4341836281219447, + "grad_norm": 1.8068585395812988, + "learning_rate": 1.1545936038974269e-05, + "loss": 1.6559, + "step": 40340 + }, + { + "epoch": 1.434539152075371, + "grad_norm": 1.8663021326065063, + "learning_rate": 1.1542063931619629e-05, + "loss": 1.6163, + "step": 40350 + }, + { + "epoch": 1.4348946760287975, + "grad_norm": 1.7645716667175293, + "learning_rate": 1.1538191587413637e-05, + "loss": 1.6904, + "step": 40360 + }, + { + "epoch": 1.4352501999822238, + "grad_norm": 1.8233351707458496, + "learning_rate": 1.153431900695106e-05, + "loss": 1.6601, + "step": 40370 + }, + { + "epoch": 1.4356057239356501, + "grad_norm": 1.7978568077087402, + "learning_rate": 1.1530446190826706e-05, + "loss": 1.6534, + "step": 40380 + }, + { + "epoch": 1.4359612478890766, + "grad_norm": 1.9237686395645142, + "learning_rate": 1.1526573139635413e-05, + "loss": 1.6658, + "step": 40390 + }, + { + "epoch": 1.436316771842503, + "grad_norm": 1.7938878536224365, + "learning_rate": 1.152269985397206e-05, + "loss": 1.7059, + "step": 40400 + }, + { + "epoch": 1.4366722957959293, + "grad_norm": 1.8476046323776245, + "learning_rate": 1.1518826334431554e-05, + "loss": 1.6844, + "step": 40410 + }, + { + "epoch": 1.4370278197493556, + "grad_norm": 1.8722749948501587, + "learning_rate": 1.1514952581608847e-05, + "loss": 1.6734, + "step": 40420 + }, + { + "epoch": 1.4373833437027819, + "grad_norm": 1.9160614013671875, + "learning_rate": 1.1511078596098922e-05, + "loss": 1.6755, + "step": 40430 + }, + { + "epoch": 1.4377388676562084, + "grad_norm": 1.8917723894119263, + "learning_rate": 1.1507204378496798e-05, + "loss": 1.6371, + "step": 40440 + }, + { + "epoch": 1.4380943916096347, + "grad_norm": 1.833464503288269, + "learning_rate": 1.1503329929397531e-05, + "loss": 1.673, + "step": 40450 + }, + { + "epoch": 1.438449915563061, + "grad_norm": 2.0275955200195312, + "learning_rate": 1.1499455249396216e-05, + "loss": 1.6789, + "step": 40460 + }, + { + "epoch": 1.4388054395164875, + "grad_norm": 1.8364850282669067, + "learning_rate": 1.1495580339087974e-05, + "loss": 1.6801, + "step": 40470 + }, + { + "epoch": 1.4391609634699138, + "grad_norm": 1.827332615852356, + "learning_rate": 1.1491705199067973e-05, + "loss": 1.6975, + "step": 40480 + }, + { + "epoch": 1.4395164874233402, + "grad_norm": 1.7486658096313477, + "learning_rate": 1.1487829829931403e-05, + "loss": 1.6993, + "step": 40490 + }, + { + "epoch": 1.4398720113767665, + "grad_norm": 1.9424848556518555, + "learning_rate": 1.14839542322735e-05, + "loss": 1.6983, + "step": 40500 + }, + { + "epoch": 1.4402275353301928, + "grad_norm": 1.9159152507781982, + "learning_rate": 1.1480078406689529e-05, + "loss": 1.6657, + "step": 40510 + }, + { + "epoch": 1.4405830592836193, + "grad_norm": 1.753502368927002, + "learning_rate": 1.1476202353774799e-05, + "loss": 1.6404, + "step": 40520 + }, + { + "epoch": 1.4409385832370456, + "grad_norm": 1.8607120513916016, + "learning_rate": 1.1472326074124642e-05, + "loss": 1.703, + "step": 40530 + }, + { + "epoch": 1.441294107190472, + "grad_norm": 1.7924970388412476, + "learning_rate": 1.1468449568334433e-05, + "loss": 1.7019, + "step": 40540 + }, + { + "epoch": 1.4416496311438984, + "grad_norm": 1.8681445121765137, + "learning_rate": 1.1464572836999575e-05, + "loss": 1.6766, + "step": 40550 + }, + { + "epoch": 1.4420051550973247, + "grad_norm": 1.7013012170791626, + "learning_rate": 1.1460695880715516e-05, + "loss": 1.664, + "step": 40560 + }, + { + "epoch": 1.442360679050751, + "grad_norm": 1.912614345550537, + "learning_rate": 1.1456818700077723e-05, + "loss": 1.6694, + "step": 40570 + }, + { + "epoch": 1.4427162030041774, + "grad_norm": 1.9411284923553467, + "learning_rate": 1.1452941295681715e-05, + "loss": 1.6325, + "step": 40580 + }, + { + "epoch": 1.4430717269576037, + "grad_norm": 1.845502257347107, + "learning_rate": 1.1449063668123035e-05, + "loss": 1.652, + "step": 40590 + }, + { + "epoch": 1.4434272509110302, + "grad_norm": 1.7138339281082153, + "learning_rate": 1.144518581799726e-05, + "loss": 1.649, + "step": 40600 + }, + { + "epoch": 1.4437827748644565, + "grad_norm": 1.8865113258361816, + "learning_rate": 1.1441307745900003e-05, + "loss": 1.6822, + "step": 40610 + }, + { + "epoch": 1.4441382988178828, + "grad_norm": 1.7740147113800049, + "learning_rate": 1.1437429452426915e-05, + "loss": 1.6733, + "step": 40620 + }, + { + "epoch": 1.4444938227713093, + "grad_norm": 1.9099332094192505, + "learning_rate": 1.1433550938173677e-05, + "loss": 1.7122, + "step": 40630 + }, + { + "epoch": 1.4448493467247356, + "grad_norm": 1.9059197902679443, + "learning_rate": 1.1429672203736001e-05, + "loss": 1.6678, + "step": 40640 + }, + { + "epoch": 1.445204870678162, + "grad_norm": 1.73043954372406, + "learning_rate": 1.142579324970964e-05, + "loss": 1.6927, + "step": 40650 + }, + { + "epoch": 1.4455603946315883, + "grad_norm": 1.875238299369812, + "learning_rate": 1.1421914076690376e-05, + "loss": 1.7018, + "step": 40660 + }, + { + "epoch": 1.4459159185850146, + "grad_norm": 1.8329334259033203, + "learning_rate": 1.1418034685274026e-05, + "loss": 1.6683, + "step": 40670 + }, + { + "epoch": 1.446271442538441, + "grad_norm": 1.7928236722946167, + "learning_rate": 1.1414155076056437e-05, + "loss": 1.6834, + "step": 40680 + }, + { + "epoch": 1.4466269664918674, + "grad_norm": 1.9530136585235596, + "learning_rate": 1.1410275249633496e-05, + "loss": 1.678, + "step": 40690 + }, + { + "epoch": 1.4469824904452937, + "grad_norm": 1.8798272609710693, + "learning_rate": 1.140639520660112e-05, + "loss": 1.6921, + "step": 40700 + }, + { + "epoch": 1.4473380143987202, + "grad_norm": 1.7171239852905273, + "learning_rate": 1.140251494755526e-05, + "loss": 1.6791, + "step": 40710 + }, + { + "epoch": 1.4476935383521465, + "grad_norm": 1.737978219985962, + "learning_rate": 1.1398634473091897e-05, + "loss": 1.6708, + "step": 40720 + }, + { + "epoch": 1.4480490623055728, + "grad_norm": 1.8468761444091797, + "learning_rate": 1.1394753783807047e-05, + "loss": 1.6682, + "step": 40730 + }, + { + "epoch": 1.4484045862589991, + "grad_norm": 1.9467835426330566, + "learning_rate": 1.139087288029676e-05, + "loss": 1.6929, + "step": 40740 + }, + { + "epoch": 1.4487601102124255, + "grad_norm": 1.847671627998352, + "learning_rate": 1.138699176315712e-05, + "loss": 1.663, + "step": 40750 + }, + { + "epoch": 1.449115634165852, + "grad_norm": 2.084027051925659, + "learning_rate": 1.138311043298424e-05, + "loss": 1.6463, + "step": 40760 + }, + { + "epoch": 1.4494711581192783, + "grad_norm": 1.8675135374069214, + "learning_rate": 1.1379228890374274e-05, + "loss": 1.66, + "step": 40770 + }, + { + "epoch": 1.4498266820727046, + "grad_norm": 1.7958040237426758, + "learning_rate": 1.1375347135923395e-05, + "loss": 1.6281, + "step": 40780 + }, + { + "epoch": 1.4501822060261311, + "grad_norm": 1.8652892112731934, + "learning_rate": 1.1371465170227822e-05, + "loss": 1.6369, + "step": 40790 + }, + { + "epoch": 1.4505377299795574, + "grad_norm": 1.7667006254196167, + "learning_rate": 1.1367582993883798e-05, + "loss": 1.6318, + "step": 40800 + }, + { + "epoch": 1.4508932539329837, + "grad_norm": 1.8608719110488892, + "learning_rate": 1.13637006074876e-05, + "loss": 1.6648, + "step": 40810 + }, + { + "epoch": 1.45124877788641, + "grad_norm": 1.79188871383667, + "learning_rate": 1.1359818011635538e-05, + "loss": 1.6788, + "step": 40820 + }, + { + "epoch": 1.4516043018398364, + "grad_norm": 1.860592007637024, + "learning_rate": 1.1355935206923955e-05, + "loss": 1.6826, + "step": 40830 + }, + { + "epoch": 1.4519598257932629, + "grad_norm": 1.888957142829895, + "learning_rate": 1.135205219394923e-05, + "loss": 1.7112, + "step": 40840 + }, + { + "epoch": 1.4523153497466892, + "grad_norm": 1.904823899269104, + "learning_rate": 1.1348168973307762e-05, + "loss": 1.6255, + "step": 40850 + }, + { + "epoch": 1.4526708737001155, + "grad_norm": 1.7432588338851929, + "learning_rate": 1.1344285545595991e-05, + "loss": 1.6706, + "step": 40860 + }, + { + "epoch": 1.453026397653542, + "grad_norm": 1.725298285484314, + "learning_rate": 1.1340401911410392e-05, + "loss": 1.6436, + "step": 40870 + }, + { + "epoch": 1.4533819216069683, + "grad_norm": 1.8614853620529175, + "learning_rate": 1.1336518071347467e-05, + "loss": 1.6568, + "step": 40880 + }, + { + "epoch": 1.4537374455603946, + "grad_norm": 1.7306885719299316, + "learning_rate": 1.1332634026003742e-05, + "loss": 1.68, + "step": 40890 + }, + { + "epoch": 1.454092969513821, + "grad_norm": 1.769089698791504, + "learning_rate": 1.1328749775975786e-05, + "loss": 1.7314, + "step": 40900 + }, + { + "epoch": 1.4544484934672472, + "grad_norm": 1.7994245290756226, + "learning_rate": 1.1324865321860197e-05, + "loss": 1.7049, + "step": 40910 + }, + { + "epoch": 1.4548040174206738, + "grad_norm": 1.8408583402633667, + "learning_rate": 1.13209806642536e-05, + "loss": 1.7092, + "step": 40920 + }, + { + "epoch": 1.4551595413741, + "grad_norm": 2.042452096939087, + "learning_rate": 1.1317095803752657e-05, + "loss": 1.6875, + "step": 40930 + }, + { + "epoch": 1.4555150653275264, + "grad_norm": 1.9339094161987305, + "learning_rate": 1.1313210740954057e-05, + "loss": 1.6558, + "step": 40940 + }, + { + "epoch": 1.455870589280953, + "grad_norm": 1.9632941484451294, + "learning_rate": 1.1309325476454519e-05, + "loss": 1.7093, + "step": 40950 + }, + { + "epoch": 1.4562261132343792, + "grad_norm": 1.8465474843978882, + "learning_rate": 1.1305440010850802e-05, + "loss": 1.6853, + "step": 40960 + }, + { + "epoch": 1.4565816371878055, + "grad_norm": 1.747512936592102, + "learning_rate": 1.1301554344739683e-05, + "loss": 1.684, + "step": 40970 + }, + { + "epoch": 1.4569371611412318, + "grad_norm": 1.9150688648223877, + "learning_rate": 1.1297668478717974e-05, + "loss": 1.7034, + "step": 40980 + }, + { + "epoch": 1.4572926850946581, + "grad_norm": 1.750643014907837, + "learning_rate": 1.1293782413382523e-05, + "loss": 1.6631, + "step": 40990 + }, + { + "epoch": 1.4576482090480847, + "grad_norm": 1.7395461797714233, + "learning_rate": 1.1289896149330209e-05, + "loss": 1.6791, + "step": 41000 + }, + { + "epoch": 1.458003733001511, + "grad_norm": 1.8940759897232056, + "learning_rate": 1.1286009687157931e-05, + "loss": 1.6268, + "step": 41010 + }, + { + "epoch": 1.4583592569549373, + "grad_norm": 1.8179832696914673, + "learning_rate": 1.1282123027462632e-05, + "loss": 1.6772, + "step": 41020 + }, + { + "epoch": 1.4587147809083638, + "grad_norm": 1.7574387788772583, + "learning_rate": 1.1278236170841272e-05, + "loss": 1.6809, + "step": 41030 + }, + { + "epoch": 1.4590703048617901, + "grad_norm": 1.7769755125045776, + "learning_rate": 1.1274349117890852e-05, + "loss": 1.6538, + "step": 41040 + }, + { + "epoch": 1.4594258288152164, + "grad_norm": 1.8514573574066162, + "learning_rate": 1.1270461869208398e-05, + "loss": 1.6794, + "step": 41050 + }, + { + "epoch": 1.4597813527686427, + "grad_norm": 1.8502614498138428, + "learning_rate": 1.1266574425390966e-05, + "loss": 1.6804, + "step": 41060 + }, + { + "epoch": 1.460136876722069, + "grad_norm": 1.898262858390808, + "learning_rate": 1.1262686787035643e-05, + "loss": 1.648, + "step": 41070 + }, + { + "epoch": 1.4604924006754956, + "grad_norm": 1.8465478420257568, + "learning_rate": 1.1258798954739547e-05, + "loss": 1.6714, + "step": 41080 + }, + { + "epoch": 1.4608479246289219, + "grad_norm": 1.9473800659179688, + "learning_rate": 1.1254910929099827e-05, + "loss": 1.6868, + "step": 41090 + }, + { + "epoch": 1.4612034485823482, + "grad_norm": 1.9544756412506104, + "learning_rate": 1.1251022710713655e-05, + "loss": 1.7166, + "step": 41100 + }, + { + "epoch": 1.4615589725357747, + "grad_norm": 1.8831160068511963, + "learning_rate": 1.1247134300178235e-05, + "loss": 1.651, + "step": 41110 + }, + { + "epoch": 1.461914496489201, + "grad_norm": 2.1862456798553467, + "learning_rate": 1.1243245698090812e-05, + "loss": 1.6506, + "step": 41120 + }, + { + "epoch": 1.4622700204426273, + "grad_norm": 1.830612063407898, + "learning_rate": 1.1239356905048642e-05, + "loss": 1.6733, + "step": 41130 + }, + { + "epoch": 1.4626255443960536, + "grad_norm": 1.9072941541671753, + "learning_rate": 1.123546792164902e-05, + "loss": 1.6304, + "step": 41140 + }, + { + "epoch": 1.46298106834948, + "grad_norm": 1.740522861480713, + "learning_rate": 1.1231578748489277e-05, + "loss": 1.6535, + "step": 41150 + }, + { + "epoch": 1.4633365923029065, + "grad_norm": 1.8051260709762573, + "learning_rate": 1.1227689386166758e-05, + "loss": 1.6591, + "step": 41160 + }, + { + "epoch": 1.4636921162563328, + "grad_norm": 1.7794371843338013, + "learning_rate": 1.1223799835278844e-05, + "loss": 1.6826, + "step": 41170 + }, + { + "epoch": 1.464047640209759, + "grad_norm": 1.8125745058059692, + "learning_rate": 1.121991009642295e-05, + "loss": 1.6881, + "step": 41180 + }, + { + "epoch": 1.4644031641631856, + "grad_norm": 1.9413714408874512, + "learning_rate": 1.1216020170196516e-05, + "loss": 1.6356, + "step": 41190 + }, + { + "epoch": 1.464758688116612, + "grad_norm": 1.967499852180481, + "learning_rate": 1.1212130057197009e-05, + "loss": 1.6819, + "step": 41200 + }, + { + "epoch": 1.4651142120700382, + "grad_norm": 1.929537296295166, + "learning_rate": 1.1208239758021923e-05, + "loss": 1.6923, + "step": 41210 + }, + { + "epoch": 1.4654697360234645, + "grad_norm": 1.7280325889587402, + "learning_rate": 1.1204349273268786e-05, + "loss": 1.6852, + "step": 41220 + }, + { + "epoch": 1.4658252599768908, + "grad_norm": 2.1349501609802246, + "learning_rate": 1.1200458603535153e-05, + "loss": 1.6731, + "step": 41230 + }, + { + "epoch": 1.4661807839303174, + "grad_norm": 1.897236943244934, + "learning_rate": 1.1196567749418606e-05, + "loss": 1.6999, + "step": 41240 + }, + { + "epoch": 1.4665363078837437, + "grad_norm": 1.8861037492752075, + "learning_rate": 1.1192676711516752e-05, + "loss": 1.6768, + "step": 41250 + }, + { + "epoch": 1.46689183183717, + "grad_norm": 2.259942054748535, + "learning_rate": 1.1188785490427234e-05, + "loss": 1.6913, + "step": 41260 + }, + { + "epoch": 1.4672473557905965, + "grad_norm": 1.8978254795074463, + "learning_rate": 1.1184894086747721e-05, + "loss": 1.6547, + "step": 41270 + }, + { + "epoch": 1.4676028797440228, + "grad_norm": 1.8073842525482178, + "learning_rate": 1.1181002501075908e-05, + "loss": 1.6646, + "step": 41280 + }, + { + "epoch": 1.4679584036974491, + "grad_norm": 1.9123353958129883, + "learning_rate": 1.1177110734009512e-05, + "loss": 1.6652, + "step": 41290 + }, + { + "epoch": 1.4683139276508754, + "grad_norm": 1.7248234748840332, + "learning_rate": 1.1173218786146287e-05, + "loss": 1.7009, + "step": 41300 + }, + { + "epoch": 1.4686694516043017, + "grad_norm": 1.7227554321289062, + "learning_rate": 1.1169326658084013e-05, + "loss": 1.6481, + "step": 41310 + }, + { + "epoch": 1.4690249755577283, + "grad_norm": 1.7111023664474487, + "learning_rate": 1.1165434350420496e-05, + "loss": 1.6573, + "step": 41320 + }, + { + "epoch": 1.4693804995111546, + "grad_norm": 1.849845290184021, + "learning_rate": 1.1161541863753571e-05, + "loss": 1.6412, + "step": 41330 + }, + { + "epoch": 1.4697360234645809, + "grad_norm": 1.8790321350097656, + "learning_rate": 1.11576491986811e-05, + "loss": 1.6816, + "step": 41340 + }, + { + "epoch": 1.4700915474180074, + "grad_norm": 2.038875102996826, + "learning_rate": 1.1153756355800966e-05, + "loss": 1.6827, + "step": 41350 + }, + { + "epoch": 1.4704470713714337, + "grad_norm": 1.9586118459701538, + "learning_rate": 1.1149863335711095e-05, + "loss": 1.6575, + "step": 41360 + }, + { + "epoch": 1.47080259532486, + "grad_norm": 1.8184459209442139, + "learning_rate": 1.1145970139009424e-05, + "loss": 1.6877, + "step": 41370 + }, + { + "epoch": 1.4711581192782863, + "grad_norm": 1.867667317390442, + "learning_rate": 1.1142076766293923e-05, + "loss": 1.7235, + "step": 41380 + }, + { + "epoch": 1.4715136432317126, + "grad_norm": 1.7963002920150757, + "learning_rate": 1.1138183218162593e-05, + "loss": 1.6558, + "step": 41390 + }, + { + "epoch": 1.4718691671851392, + "grad_norm": 1.8462467193603516, + "learning_rate": 1.1134289495213457e-05, + "loss": 1.6822, + "step": 41400 + }, + { + "epoch": 1.4722246911385655, + "grad_norm": 1.8079371452331543, + "learning_rate": 1.1130395598044565e-05, + "loss": 1.6507, + "step": 41410 + }, + { + "epoch": 1.4725802150919918, + "grad_norm": 1.8218004703521729, + "learning_rate": 1.1126501527253998e-05, + "loss": 1.7074, + "step": 41420 + }, + { + "epoch": 1.4729357390454183, + "grad_norm": 1.8829344511032104, + "learning_rate": 1.1122607283439862e-05, + "loss": 1.655, + "step": 41430 + }, + { + "epoch": 1.4732912629988446, + "grad_norm": 2.0060136318206787, + "learning_rate": 1.1118712867200284e-05, + "loss": 1.6516, + "step": 41440 + }, + { + "epoch": 1.473646786952271, + "grad_norm": 1.8415420055389404, + "learning_rate": 1.1114818279133424e-05, + "loss": 1.6614, + "step": 41450 + }, + { + "epoch": 1.4740023109056972, + "grad_norm": 1.7176750898361206, + "learning_rate": 1.1110923519837466e-05, + "loss": 1.6794, + "step": 41460 + }, + { + "epoch": 1.4743578348591235, + "grad_norm": 1.945716142654419, + "learning_rate": 1.1107028589910623e-05, + "loss": 1.6776, + "step": 41470 + }, + { + "epoch": 1.47471335881255, + "grad_norm": 1.7233506441116333, + "learning_rate": 1.1103133489951125e-05, + "loss": 1.6975, + "step": 41480 + }, + { + "epoch": 1.4750688827659764, + "grad_norm": 1.8264062404632568, + "learning_rate": 1.1099238220557243e-05, + "loss": 1.6433, + "step": 41490 + }, + { + "epoch": 1.4754244067194027, + "grad_norm": 1.77842378616333, + "learning_rate": 1.109534278232726e-05, + "loss": 1.6663, + "step": 41500 + }, + { + "epoch": 1.4757799306728292, + "grad_norm": 2.1303818225860596, + "learning_rate": 1.1091447175859497e-05, + "loss": 1.6863, + "step": 41510 + }, + { + "epoch": 1.4761354546262555, + "grad_norm": 1.9709882736206055, + "learning_rate": 1.108755140175229e-05, + "loss": 1.6655, + "step": 41520 + }, + { + "epoch": 1.4764909785796818, + "grad_norm": 1.8122498989105225, + "learning_rate": 1.1083655460604008e-05, + "loss": 1.6817, + "step": 41530 + }, + { + "epoch": 1.4768465025331081, + "grad_norm": 1.9306962490081787, + "learning_rate": 1.107975935301304e-05, + "loss": 1.6562, + "step": 41540 + }, + { + "epoch": 1.4772020264865344, + "grad_norm": 1.8164222240447998, + "learning_rate": 1.1075863079577804e-05, + "loss": 1.7255, + "step": 41550 + }, + { + "epoch": 1.477557550439961, + "grad_norm": 1.6643892526626587, + "learning_rate": 1.1071966640896748e-05, + "loss": 1.677, + "step": 41560 + }, + { + "epoch": 1.4779130743933873, + "grad_norm": 1.7539207935333252, + "learning_rate": 1.1068070037568335e-05, + "loss": 1.7027, + "step": 41570 + }, + { + "epoch": 1.4782685983468136, + "grad_norm": 1.9250538349151611, + "learning_rate": 1.1064173270191063e-05, + "loss": 1.6921, + "step": 41580 + }, + { + "epoch": 1.47862412230024, + "grad_norm": 1.8705344200134277, + "learning_rate": 1.1060276339363448e-05, + "loss": 1.7, + "step": 41590 + }, + { + "epoch": 1.4789796462536664, + "grad_norm": 1.8841394186019897, + "learning_rate": 1.1056379245684036e-05, + "loss": 1.6806, + "step": 41600 + }, + { + "epoch": 1.4793351702070927, + "grad_norm": 1.8760476112365723, + "learning_rate": 1.1052481989751393e-05, + "loss": 1.6466, + "step": 41610 + }, + { + "epoch": 1.479690694160519, + "grad_norm": 1.9157289266586304, + "learning_rate": 1.1048584572164118e-05, + "loss": 1.6591, + "step": 41620 + }, + { + "epoch": 1.4800462181139453, + "grad_norm": 1.6999433040618896, + "learning_rate": 1.1044686993520825e-05, + "loss": 1.6746, + "step": 41630 + }, + { + "epoch": 1.4804017420673719, + "grad_norm": 1.8318113088607788, + "learning_rate": 1.1040789254420164e-05, + "loss": 1.6636, + "step": 41640 + }, + { + "epoch": 1.4807572660207982, + "grad_norm": 1.930933952331543, + "learning_rate": 1.1036891355460795e-05, + "loss": 1.7151, + "step": 41650 + }, + { + "epoch": 1.4811127899742245, + "grad_norm": 1.818397045135498, + "learning_rate": 1.1032993297241417e-05, + "loss": 1.6597, + "step": 41660 + }, + { + "epoch": 1.481468313927651, + "grad_norm": 1.746368408203125, + "learning_rate": 1.1029095080360745e-05, + "loss": 1.6479, + "step": 41670 + }, + { + "epoch": 1.4818238378810773, + "grad_norm": 1.9062423706054688, + "learning_rate": 1.1025196705417522e-05, + "loss": 1.6398, + "step": 41680 + }, + { + "epoch": 1.4821793618345036, + "grad_norm": 1.9456902742385864, + "learning_rate": 1.1021298173010513e-05, + "loss": 1.6641, + "step": 41690 + }, + { + "epoch": 1.48253488578793, + "grad_norm": 1.8529020547866821, + "learning_rate": 1.1017399483738507e-05, + "loss": 1.6792, + "step": 41700 + }, + { + "epoch": 1.4828904097413562, + "grad_norm": 1.7703274488449097, + "learning_rate": 1.1013500638200322e-05, + "loss": 1.6948, + "step": 41710 + }, + { + "epoch": 1.4832459336947827, + "grad_norm": 1.8067706823349, + "learning_rate": 1.1009601636994792e-05, + "loss": 1.7028, + "step": 41720 + }, + { + "epoch": 1.483601457648209, + "grad_norm": 1.773571491241455, + "learning_rate": 1.1005702480720778e-05, + "loss": 1.6318, + "step": 41730 + }, + { + "epoch": 1.4839569816016354, + "grad_norm": 1.800899863243103, + "learning_rate": 1.1001803169977171e-05, + "loss": 1.6743, + "step": 41740 + }, + { + "epoch": 1.484312505555062, + "grad_norm": 1.7902923822402954, + "learning_rate": 1.099790370536288e-05, + "loss": 1.6847, + "step": 41750 + }, + { + "epoch": 1.4846680295084882, + "grad_norm": 1.8870327472686768, + "learning_rate": 1.0994004087476837e-05, + "loss": 1.7068, + "step": 41760 + }, + { + "epoch": 1.4850235534619145, + "grad_norm": 1.7531715631484985, + "learning_rate": 1.0990104316918e-05, + "loss": 1.6547, + "step": 41770 + }, + { + "epoch": 1.4853790774153408, + "grad_norm": 2.0109541416168213, + "learning_rate": 1.0986204394285345e-05, + "loss": 1.6797, + "step": 41780 + }, + { + "epoch": 1.4857346013687671, + "grad_norm": 1.689207673072815, + "learning_rate": 1.0982304320177877e-05, + "loss": 1.6096, + "step": 41790 + }, + { + "epoch": 1.4860901253221936, + "grad_norm": 1.8851569890975952, + "learning_rate": 1.0978404095194625e-05, + "loss": 1.7236, + "step": 41800 + }, + { + "epoch": 1.48644564927562, + "grad_norm": 1.8390060663223267, + "learning_rate": 1.0974503719934642e-05, + "loss": 1.6689, + "step": 41810 + }, + { + "epoch": 1.4868011732290463, + "grad_norm": 1.8239262104034424, + "learning_rate": 1.0970603194996994e-05, + "loss": 1.6515, + "step": 41820 + }, + { + "epoch": 1.4871566971824728, + "grad_norm": 1.805426001548767, + "learning_rate": 1.0966702520980786e-05, + "loss": 1.6885, + "step": 41830 + }, + { + "epoch": 1.487512221135899, + "grad_norm": 1.8900763988494873, + "learning_rate": 1.0962801698485127e-05, + "loss": 1.6806, + "step": 41840 + }, + { + "epoch": 1.4878677450893254, + "grad_norm": 1.8122882843017578, + "learning_rate": 1.0958900728109167e-05, + "loss": 1.7162, + "step": 41850 + }, + { + "epoch": 1.4882232690427517, + "grad_norm": 1.731871247291565, + "learning_rate": 1.0954999610452066e-05, + "loss": 1.6979, + "step": 41860 + }, + { + "epoch": 1.488578792996178, + "grad_norm": 1.8255044221878052, + "learning_rate": 1.0951098346113011e-05, + "loss": 1.6548, + "step": 41870 + }, + { + "epoch": 1.4889343169496045, + "grad_norm": 1.8068124055862427, + "learning_rate": 1.0947196935691215e-05, + "loss": 1.679, + "step": 41880 + }, + { + "epoch": 1.4892898409030308, + "grad_norm": 1.8259496688842773, + "learning_rate": 1.0943295379785911e-05, + "loss": 1.6882, + "step": 41890 + }, + { + "epoch": 1.4896453648564572, + "grad_norm": 1.8950399160385132, + "learning_rate": 1.093939367899635e-05, + "loss": 1.6814, + "step": 41900 + }, + { + "epoch": 1.4900008888098837, + "grad_norm": 1.8045412302017212, + "learning_rate": 1.093549183392181e-05, + "loss": 1.6489, + "step": 41910 + }, + { + "epoch": 1.49035641276331, + "grad_norm": 2.060487985610962, + "learning_rate": 1.093158984516159e-05, + "loss": 1.6177, + "step": 41920 + }, + { + "epoch": 1.4907119367167363, + "grad_norm": 1.9152122735977173, + "learning_rate": 1.092768771331501e-05, + "loss": 1.7015, + "step": 41930 + }, + { + "epoch": 1.4910674606701626, + "grad_norm": 1.7958887815475464, + "learning_rate": 1.092378543898141e-05, + "loss": 1.6822, + "step": 41940 + }, + { + "epoch": 1.491422984623589, + "grad_norm": 1.8429104089736938, + "learning_rate": 1.0919883022760167e-05, + "loss": 1.6516, + "step": 41950 + }, + { + "epoch": 1.4917785085770154, + "grad_norm": 1.7905949354171753, + "learning_rate": 1.0915980465250653e-05, + "loss": 1.6476, + "step": 41960 + }, + { + "epoch": 1.4921340325304417, + "grad_norm": 1.840455412864685, + "learning_rate": 1.0912077767052285e-05, + "loss": 1.6886, + "step": 41970 + }, + { + "epoch": 1.492489556483868, + "grad_norm": 1.8871763944625854, + "learning_rate": 1.090817492876449e-05, + "loss": 1.7141, + "step": 41980 + }, + { + "epoch": 1.4928450804372946, + "grad_norm": 1.9186160564422607, + "learning_rate": 1.090427195098672e-05, + "loss": 1.6503, + "step": 41990 + }, + { + "epoch": 1.4932006043907209, + "grad_norm": 1.8501421213150024, + "learning_rate": 1.0900368834318451e-05, + "loss": 1.6743, + "step": 42000 + }, + { + "epoch": 1.4935561283441472, + "grad_norm": 1.8806571960449219, + "learning_rate": 1.0896465579359173e-05, + "loss": 1.6962, + "step": 42010 + }, + { + "epoch": 1.4939116522975735, + "grad_norm": 2.0296733379364014, + "learning_rate": 1.0892562186708404e-05, + "loss": 1.6732, + "step": 42020 + }, + { + "epoch": 1.4942671762509998, + "grad_norm": 2.0110843181610107, + "learning_rate": 1.0888658656965676e-05, + "loss": 1.6754, + "step": 42030 + }, + { + "epoch": 1.4946227002044263, + "grad_norm": 1.8877172470092773, + "learning_rate": 1.0884754990730552e-05, + "loss": 1.687, + "step": 42040 + }, + { + "epoch": 1.4949782241578526, + "grad_norm": 1.9644426107406616, + "learning_rate": 1.0880851188602608e-05, + "loss": 1.6976, + "step": 42050 + }, + { + "epoch": 1.495333748111279, + "grad_norm": 1.8313747644424438, + "learning_rate": 1.0876947251181445e-05, + "loss": 1.6712, + "step": 42060 + }, + { + "epoch": 1.4956892720647055, + "grad_norm": 1.7843058109283447, + "learning_rate": 1.0873043179066685e-05, + "loss": 1.6794, + "step": 42070 + }, + { + "epoch": 1.4960447960181318, + "grad_norm": 1.8117133378982544, + "learning_rate": 1.0869138972857967e-05, + "loss": 1.6775, + "step": 42080 + }, + { + "epoch": 1.496400319971558, + "grad_norm": 1.9456593990325928, + "learning_rate": 1.0865234633154948e-05, + "loss": 1.7082, + "step": 42090 + }, + { + "epoch": 1.4967558439249844, + "grad_norm": 1.7631021738052368, + "learning_rate": 1.0861330160557317e-05, + "loss": 1.6886, + "step": 42100 + }, + { + "epoch": 1.4971113678784107, + "grad_norm": 1.8768460750579834, + "learning_rate": 1.0857425555664773e-05, + "loss": 1.6646, + "step": 42110 + }, + { + "epoch": 1.4974668918318372, + "grad_norm": 1.8182553052902222, + "learning_rate": 1.085352081907704e-05, + "loss": 1.6766, + "step": 42120 + }, + { + "epoch": 1.4978224157852635, + "grad_norm": 2.0107078552246094, + "learning_rate": 1.0849615951393859e-05, + "loss": 1.656, + "step": 42130 + }, + { + "epoch": 1.4981779397386898, + "grad_norm": 1.837105631828308, + "learning_rate": 1.0845710953214998e-05, + "loss": 1.6473, + "step": 42140 + }, + { + "epoch": 1.4985334636921164, + "grad_norm": 1.784293532371521, + "learning_rate": 1.0841805825140238e-05, + "loss": 1.6912, + "step": 42150 + }, + { + "epoch": 1.4988889876455427, + "grad_norm": 1.8410252332687378, + "learning_rate": 1.083790056776938e-05, + "loss": 1.688, + "step": 42160 + }, + { + "epoch": 1.499244511598969, + "grad_norm": 1.7848275899887085, + "learning_rate": 1.0833995181702248e-05, + "loss": 1.6566, + "step": 42170 + }, + { + "epoch": 1.4996000355523953, + "grad_norm": 1.8361421823501587, + "learning_rate": 1.0830089667538683e-05, + "loss": 1.6873, + "step": 42180 + }, + { + "epoch": 1.4999555595058216, + "grad_norm": 1.978108525276184, + "learning_rate": 1.0826184025878552e-05, + "loss": 1.6962, + "step": 42190 + }, + { + "epoch": 1.500311083459248, + "grad_norm": 1.945601224899292, + "learning_rate": 1.082227825732174e-05, + "loss": 1.6857, + "step": 42200 + }, + { + "epoch": 1.5006666074126744, + "grad_norm": 1.898393988609314, + "learning_rate": 1.0818372362468134e-05, + "loss": 1.6354, + "step": 42210 + }, + { + "epoch": 1.5010221313661007, + "grad_norm": 1.9694855213165283, + "learning_rate": 1.0814466341917668e-05, + "loss": 1.698, + "step": 42220 + }, + { + "epoch": 1.5013776553195273, + "grad_norm": 1.743110179901123, + "learning_rate": 1.0810560196270282e-05, + "loss": 1.6665, + "step": 42230 + }, + { + "epoch": 1.5017331792729536, + "grad_norm": 1.8782473802566528, + "learning_rate": 1.080665392612593e-05, + "loss": 1.7023, + "step": 42240 + }, + { + "epoch": 1.5020887032263799, + "grad_norm": 1.9239259958267212, + "learning_rate": 1.0802747532084592e-05, + "loss": 1.681, + "step": 42250 + }, + { + "epoch": 1.5024442271798062, + "grad_norm": 1.898139238357544, + "learning_rate": 1.0798841014746264e-05, + "loss": 1.6735, + "step": 42260 + }, + { + "epoch": 1.5027997511332325, + "grad_norm": 1.8255397081375122, + "learning_rate": 1.079493437471097e-05, + "loss": 1.632, + "step": 42270 + }, + { + "epoch": 1.5031552750866588, + "grad_norm": 1.860229730606079, + "learning_rate": 1.0791027612578736e-05, + "loss": 1.6334, + "step": 42280 + }, + { + "epoch": 1.5035107990400853, + "grad_norm": 1.6982882022857666, + "learning_rate": 1.0787120728949622e-05, + "loss": 1.6891, + "step": 42290 + }, + { + "epoch": 1.5038663229935116, + "grad_norm": 1.6922639608383179, + "learning_rate": 1.0783213724423701e-05, + "loss": 1.7187, + "step": 42300 + }, + { + "epoch": 1.5042218469469382, + "grad_norm": 2.0529677867889404, + "learning_rate": 1.077930659960106e-05, + "loss": 1.6358, + "step": 42310 + }, + { + "epoch": 1.5045773709003645, + "grad_norm": 1.9002379179000854, + "learning_rate": 1.0775399355081815e-05, + "loss": 1.6742, + "step": 42320 + }, + { + "epoch": 1.5049328948537908, + "grad_norm": 1.9297168254852295, + "learning_rate": 1.077149199146609e-05, + "loss": 1.7008, + "step": 42330 + }, + { + "epoch": 1.505288418807217, + "grad_norm": 1.7973363399505615, + "learning_rate": 1.076758450935403e-05, + "loss": 1.7121, + "step": 42340 + }, + { + "epoch": 1.5056439427606434, + "grad_norm": 1.9050586223602295, + "learning_rate": 1.0763676909345805e-05, + "loss": 1.6551, + "step": 42350 + }, + { + "epoch": 1.5059994667140697, + "grad_norm": 1.843286395072937, + "learning_rate": 1.0759769192041592e-05, + "loss": 1.6768, + "step": 42360 + }, + { + "epoch": 1.5063549906674962, + "grad_norm": 1.7610872983932495, + "learning_rate": 1.0755861358041596e-05, + "loss": 1.631, + "step": 42370 + }, + { + "epoch": 1.5067105146209225, + "grad_norm": 1.7675762176513672, + "learning_rate": 1.0751953407946034e-05, + "loss": 1.6711, + "step": 42380 + }, + { + "epoch": 1.507066038574349, + "grad_norm": 1.920958161354065, + "learning_rate": 1.0748045342355145e-05, + "loss": 1.6583, + "step": 42390 + }, + { + "epoch": 1.5074215625277754, + "grad_norm": 1.882028579711914, + "learning_rate": 1.0744137161869181e-05, + "loss": 1.6389, + "step": 42400 + }, + { + "epoch": 1.5077770864812017, + "grad_norm": 1.73197340965271, + "learning_rate": 1.0740228867088417e-05, + "loss": 1.6905, + "step": 42410 + }, + { + "epoch": 1.508132610434628, + "grad_norm": 1.9308568239212036, + "learning_rate": 1.0736320458613137e-05, + "loss": 1.6706, + "step": 42420 + }, + { + "epoch": 1.5084881343880543, + "grad_norm": 1.8369220495224, + "learning_rate": 1.073241193704365e-05, + "loss": 1.7041, + "step": 42430 + }, + { + "epoch": 1.5088436583414806, + "grad_norm": 1.9122000932693481, + "learning_rate": 1.0728503302980284e-05, + "loss": 1.6227, + "step": 42440 + }, + { + "epoch": 1.5091991822949071, + "grad_norm": 1.8644506931304932, + "learning_rate": 1.072459455702338e-05, + "loss": 1.6756, + "step": 42450 + }, + { + "epoch": 1.5095547062483334, + "grad_norm": 1.8589321374893188, + "learning_rate": 1.0720685699773292e-05, + "loss": 1.6582, + "step": 42460 + }, + { + "epoch": 1.50991023020176, + "grad_norm": 2.146574020385742, + "learning_rate": 1.07167767318304e-05, + "loss": 1.715, + "step": 42470 + }, + { + "epoch": 1.5102657541551863, + "grad_norm": 1.9308947324752808, + "learning_rate": 1.0712867653795101e-05, + "loss": 1.6966, + "step": 42480 + }, + { + "epoch": 1.5106212781086126, + "grad_norm": 1.797224760055542, + "learning_rate": 1.0708958466267794e-05, + "loss": 1.6977, + "step": 42490 + }, + { + "epoch": 1.5109768020620389, + "grad_norm": 1.8617327213287354, + "learning_rate": 1.0705049169848914e-05, + "loss": 1.6539, + "step": 42500 + }, + { + "epoch": 1.5113323260154652, + "grad_norm": 1.9332149028778076, + "learning_rate": 1.0701139765138903e-05, + "loss": 1.659, + "step": 42510 + }, + { + "epoch": 1.5116878499688915, + "grad_norm": 2.0152933597564697, + "learning_rate": 1.069723025273822e-05, + "loss": 1.6548, + "step": 42520 + }, + { + "epoch": 1.512043373922318, + "grad_norm": 1.9726402759552002, + "learning_rate": 1.0693320633247342e-05, + "loss": 1.6655, + "step": 42530 + }, + { + "epoch": 1.5123988978757443, + "grad_norm": 1.789324402809143, + "learning_rate": 1.068941090726676e-05, + "loss": 1.66, + "step": 42540 + }, + { + "epoch": 1.5127544218291709, + "grad_norm": 1.8948030471801758, + "learning_rate": 1.0685501075396985e-05, + "loss": 1.6341, + "step": 42550 + }, + { + "epoch": 1.5131099457825972, + "grad_norm": 1.9974631071090698, + "learning_rate": 1.0681591138238545e-05, + "loss": 1.6326, + "step": 42560 + }, + { + "epoch": 1.5134654697360235, + "grad_norm": 2.1601462364196777, + "learning_rate": 1.067768109639198e-05, + "loss": 1.6761, + "step": 42570 + }, + { + "epoch": 1.5138209936894498, + "grad_norm": 1.733400821685791, + "learning_rate": 1.067377095045785e-05, + "loss": 1.6505, + "step": 42580 + }, + { + "epoch": 1.514176517642876, + "grad_norm": 1.9919558763504028, + "learning_rate": 1.066986070103672e-05, + "loss": 1.6616, + "step": 42590 + }, + { + "epoch": 1.5145320415963024, + "grad_norm": 1.8978710174560547, + "learning_rate": 1.0665950348729191e-05, + "loss": 1.6554, + "step": 42600 + }, + { + "epoch": 1.514887565549729, + "grad_norm": 1.9027674198150635, + "learning_rate": 1.066203989413586e-05, + "loss": 1.6757, + "step": 42610 + }, + { + "epoch": 1.5152430895031552, + "grad_norm": 1.774072527885437, + "learning_rate": 1.0658129337857356e-05, + "loss": 1.6608, + "step": 42620 + }, + { + "epoch": 1.5155986134565818, + "grad_norm": 1.911702275276184, + "learning_rate": 1.0654218680494313e-05, + "loss": 1.6923, + "step": 42630 + }, + { + "epoch": 1.515954137410008, + "grad_norm": 1.811378002166748, + "learning_rate": 1.0650307922647383e-05, + "loss": 1.6491, + "step": 42640 + }, + { + "epoch": 1.5163096613634344, + "grad_norm": 1.7372981309890747, + "learning_rate": 1.064639706491723e-05, + "loss": 1.6513, + "step": 42650 + }, + { + "epoch": 1.5166651853168607, + "grad_norm": 1.860128402709961, + "learning_rate": 1.0642486107904542e-05, + "loss": 1.6816, + "step": 42660 + }, + { + "epoch": 1.517020709270287, + "grad_norm": 1.7836940288543701, + "learning_rate": 1.0638575052210017e-05, + "loss": 1.6984, + "step": 42670 + }, + { + "epoch": 1.5173762332237133, + "grad_norm": 1.8072319030761719, + "learning_rate": 1.0634663898434365e-05, + "loss": 1.7054, + "step": 42680 + }, + { + "epoch": 1.5177317571771398, + "grad_norm": 1.8161203861236572, + "learning_rate": 1.0630752647178322e-05, + "loss": 1.6648, + "step": 42690 + }, + { + "epoch": 1.5180872811305661, + "grad_norm": 1.8995000123977661, + "learning_rate": 1.0626841299042626e-05, + "loss": 1.6746, + "step": 42700 + }, + { + "epoch": 1.5184428050839927, + "grad_norm": 1.8162468671798706, + "learning_rate": 1.0622929854628035e-05, + "loss": 1.6689, + "step": 42710 + }, + { + "epoch": 1.518798329037419, + "grad_norm": 1.919501543045044, + "learning_rate": 1.0619018314535328e-05, + "loss": 1.6541, + "step": 42720 + }, + { + "epoch": 1.5191538529908453, + "grad_norm": 1.8064863681793213, + "learning_rate": 1.0615106679365283e-05, + "loss": 1.7194, + "step": 42730 + }, + { + "epoch": 1.5195093769442716, + "grad_norm": 1.8879224061965942, + "learning_rate": 1.0611194949718712e-05, + "loss": 1.6766, + "step": 42740 + }, + { + "epoch": 1.5198649008976979, + "grad_norm": 1.9638595581054688, + "learning_rate": 1.0607283126196431e-05, + "loss": 1.6924, + "step": 42750 + }, + { + "epoch": 1.5202204248511242, + "grad_norm": 1.7927757501602173, + "learning_rate": 1.0603371209399267e-05, + "loss": 1.6996, + "step": 42760 + }, + { + "epoch": 1.5205759488045507, + "grad_norm": 1.8108702898025513, + "learning_rate": 1.0599459199928068e-05, + "loss": 1.6731, + "step": 42770 + }, + { + "epoch": 1.520931472757977, + "grad_norm": 1.9261244535446167, + "learning_rate": 1.0595547098383696e-05, + "loss": 1.6443, + "step": 42780 + }, + { + "epoch": 1.5212869967114035, + "grad_norm": 1.9858245849609375, + "learning_rate": 1.0591634905367024e-05, + "loss": 1.6617, + "step": 42790 + }, + { + "epoch": 1.5216425206648299, + "grad_norm": 1.9586430788040161, + "learning_rate": 1.058772262147894e-05, + "loss": 1.6843, + "step": 42800 + }, + { + "epoch": 1.5219980446182562, + "grad_norm": 1.9105232954025269, + "learning_rate": 1.0583810247320345e-05, + "loss": 1.6589, + "step": 42810 + }, + { + "epoch": 1.5223535685716825, + "grad_norm": 1.801959753036499, + "learning_rate": 1.057989778349216e-05, + "loss": 1.6795, + "step": 42820 + }, + { + "epoch": 1.5227090925251088, + "grad_norm": 1.908121109008789, + "learning_rate": 1.0575985230595307e-05, + "loss": 1.6764, + "step": 42830 + }, + { + "epoch": 1.523064616478535, + "grad_norm": 1.947690486907959, + "learning_rate": 1.0572072589230735e-05, + "loss": 1.6737, + "step": 42840 + }, + { + "epoch": 1.5234201404319616, + "grad_norm": 1.8410011529922485, + "learning_rate": 1.05681598599994e-05, + "loss": 1.6743, + "step": 42850 + }, + { + "epoch": 1.523775664385388, + "grad_norm": 1.9364418983459473, + "learning_rate": 1.0564247043502274e-05, + "loss": 1.6989, + "step": 42860 + }, + { + "epoch": 1.5241311883388144, + "grad_norm": 1.897105097770691, + "learning_rate": 1.056033414034034e-05, + "loss": 1.6967, + "step": 42870 + }, + { + "epoch": 1.5244867122922408, + "grad_norm": 1.74845552444458, + "learning_rate": 1.0556421151114598e-05, + "loss": 1.6728, + "step": 42880 + }, + { + "epoch": 1.524842236245667, + "grad_norm": 1.8528046607971191, + "learning_rate": 1.0552508076426053e-05, + "loss": 1.707, + "step": 42890 + }, + { + "epoch": 1.5251977601990934, + "grad_norm": 1.815856695175171, + "learning_rate": 1.0548594916875731e-05, + "loss": 1.6525, + "step": 42900 + }, + { + "epoch": 1.5255532841525197, + "grad_norm": 1.8629300594329834, + "learning_rate": 1.054468167306467e-05, + "loss": 1.6732, + "step": 42910 + }, + { + "epoch": 1.525908808105946, + "grad_norm": 1.8080686330795288, + "learning_rate": 1.054076834559392e-05, + "loss": 1.654, + "step": 42920 + }, + { + "epoch": 1.5262643320593725, + "grad_norm": 1.900989055633545, + "learning_rate": 1.0536854935064543e-05, + "loss": 1.6557, + "step": 42930 + }, + { + "epoch": 1.5266198560127988, + "grad_norm": 1.904876947402954, + "learning_rate": 1.0532941442077613e-05, + "loss": 1.6762, + "step": 42940 + }, + { + "epoch": 1.5269753799662253, + "grad_norm": 1.8932329416275024, + "learning_rate": 1.052902786723422e-05, + "loss": 1.6663, + "step": 42950 + }, + { + "epoch": 1.5273309039196517, + "grad_norm": 1.8724309206008911, + "learning_rate": 1.0525114211135466e-05, + "loss": 1.7035, + "step": 42960 + }, + { + "epoch": 1.527686427873078, + "grad_norm": 1.8949474096298218, + "learning_rate": 1.0521200474382456e-05, + "loss": 1.6709, + "step": 42970 + }, + { + "epoch": 1.5280419518265043, + "grad_norm": 1.7738467454910278, + "learning_rate": 1.0517286657576324e-05, + "loss": 1.6792, + "step": 42980 + }, + { + "epoch": 1.5283974757799306, + "grad_norm": 1.849966049194336, + "learning_rate": 1.0513372761318204e-05, + "loss": 1.7021, + "step": 42990 + }, + { + "epoch": 1.5287529997333569, + "grad_norm": 1.932142972946167, + "learning_rate": 1.0509458786209248e-05, + "loss": 1.6444, + "step": 43000 + }, + { + "epoch": 1.5291085236867834, + "grad_norm": 1.8554933071136475, + "learning_rate": 1.0505544732850617e-05, + "loss": 1.6926, + "step": 43010 + }, + { + "epoch": 1.5294640476402097, + "grad_norm": 1.861149549484253, + "learning_rate": 1.0501630601843484e-05, + "loss": 1.7165, + "step": 43020 + }, + { + "epoch": 1.5298195715936362, + "grad_norm": 2.019615888595581, + "learning_rate": 1.0497716393789034e-05, + "loss": 1.6617, + "step": 43030 + }, + { + "epoch": 1.5301750955470625, + "grad_norm": 1.7444164752960205, + "learning_rate": 1.0493802109288472e-05, + "loss": 1.6463, + "step": 43040 + }, + { + "epoch": 1.5305306195004889, + "grad_norm": 2.0510358810424805, + "learning_rate": 1.0489887748942997e-05, + "loss": 1.6721, + "step": 43050 + }, + { + "epoch": 1.5308861434539152, + "grad_norm": 1.8747687339782715, + "learning_rate": 1.0485973313353837e-05, + "loss": 1.6777, + "step": 43060 + }, + { + "epoch": 1.5312416674073415, + "grad_norm": 1.7638757228851318, + "learning_rate": 1.0482058803122223e-05, + "loss": 1.6326, + "step": 43070 + }, + { + "epoch": 1.5315971913607678, + "grad_norm": 1.9330956935882568, + "learning_rate": 1.04781442188494e-05, + "loss": 1.6331, + "step": 43080 + }, + { + "epoch": 1.5319527153141943, + "grad_norm": 1.7835172414779663, + "learning_rate": 1.0474229561136622e-05, + "loss": 1.6277, + "step": 43090 + }, + { + "epoch": 1.5323082392676206, + "grad_norm": 1.929072380065918, + "learning_rate": 1.0470314830585158e-05, + "loss": 1.6596, + "step": 43100 + }, + { + "epoch": 1.5326637632210471, + "grad_norm": 1.9375463724136353, + "learning_rate": 1.0466400027796283e-05, + "loss": 1.6795, + "step": 43110 + }, + { + "epoch": 1.5330192871744734, + "grad_norm": 1.801561713218689, + "learning_rate": 1.0462485153371291e-05, + "loss": 1.675, + "step": 43120 + }, + { + "epoch": 1.5333748111278998, + "grad_norm": 1.8926273584365845, + "learning_rate": 1.0458570207911479e-05, + "loss": 1.6618, + "step": 43130 + }, + { + "epoch": 1.533730335081326, + "grad_norm": 1.8269914388656616, + "learning_rate": 1.0454655192018158e-05, + "loss": 1.7096, + "step": 43140 + }, + { + "epoch": 1.5340858590347524, + "grad_norm": 1.7992956638336182, + "learning_rate": 1.0450740106292648e-05, + "loss": 1.6604, + "step": 43150 + }, + { + "epoch": 1.5344413829881787, + "grad_norm": 1.9935426712036133, + "learning_rate": 1.0446824951336283e-05, + "loss": 1.6469, + "step": 43160 + }, + { + "epoch": 1.5347969069416052, + "grad_norm": 1.8476448059082031, + "learning_rate": 1.0442909727750407e-05, + "loss": 1.6753, + "step": 43170 + }, + { + "epoch": 1.5351524308950315, + "grad_norm": 2.0077481269836426, + "learning_rate": 1.0438994436136378e-05, + "loss": 1.7129, + "step": 43180 + }, + { + "epoch": 1.535507954848458, + "grad_norm": 1.8412647247314453, + "learning_rate": 1.0435079077095555e-05, + "loss": 1.6599, + "step": 43190 + }, + { + "epoch": 1.5358634788018843, + "grad_norm": 1.9217013120651245, + "learning_rate": 1.0431163651229313e-05, + "loss": 1.6532, + "step": 43200 + }, + { + "epoch": 1.5362190027553106, + "grad_norm": 1.7764896154403687, + "learning_rate": 1.0427248159139038e-05, + "loss": 1.6287, + "step": 43210 + }, + { + "epoch": 1.536574526708737, + "grad_norm": 1.7792390584945679, + "learning_rate": 1.0423332601426123e-05, + "loss": 1.6412, + "step": 43220 + }, + { + "epoch": 1.5369300506621633, + "grad_norm": 1.8325653076171875, + "learning_rate": 1.0419416978691977e-05, + "loss": 1.6553, + "step": 43230 + }, + { + "epoch": 1.5372855746155896, + "grad_norm": 1.853739619255066, + "learning_rate": 1.041550129153801e-05, + "loss": 1.6539, + "step": 43240 + }, + { + "epoch": 1.537641098569016, + "grad_norm": 2.0269429683685303, + "learning_rate": 1.0411585540565654e-05, + "loss": 1.6467, + "step": 43250 + }, + { + "epoch": 1.5379966225224424, + "grad_norm": 1.9269630908966064, + "learning_rate": 1.0407669726376335e-05, + "loss": 1.651, + "step": 43260 + }, + { + "epoch": 1.538352146475869, + "grad_norm": 1.8084664344787598, + "learning_rate": 1.0403753849571505e-05, + "loss": 1.6735, + "step": 43270 + }, + { + "epoch": 1.5387076704292952, + "grad_norm": 1.8354097604751587, + "learning_rate": 1.0399837910752613e-05, + "loss": 1.705, + "step": 43280 + }, + { + "epoch": 1.5390631943827215, + "grad_norm": 1.8562625646591187, + "learning_rate": 1.0395921910521127e-05, + "loss": 1.6455, + "step": 43290 + }, + { + "epoch": 1.5394187183361479, + "grad_norm": 1.8544988632202148, + "learning_rate": 1.0392005849478516e-05, + "loss": 1.6868, + "step": 43300 + }, + { + "epoch": 1.5397742422895742, + "grad_norm": 1.7014425992965698, + "learning_rate": 1.0388089728226268e-05, + "loss": 1.7071, + "step": 43310 + }, + { + "epoch": 1.5401297662430005, + "grad_norm": 1.7603251934051514, + "learning_rate": 1.038417354736587e-05, + "loss": 1.6432, + "step": 43320 + }, + { + "epoch": 1.540485290196427, + "grad_norm": 1.8588751554489136, + "learning_rate": 1.0380257307498822e-05, + "loss": 1.6296, + "step": 43330 + }, + { + "epoch": 1.5408408141498533, + "grad_norm": 1.8138442039489746, + "learning_rate": 1.0376341009226636e-05, + "loss": 1.6166, + "step": 43340 + }, + { + "epoch": 1.5411963381032798, + "grad_norm": 1.9501382112503052, + "learning_rate": 1.037242465315083e-05, + "loss": 1.7226, + "step": 43350 + }, + { + "epoch": 1.5415518620567061, + "grad_norm": 1.8408522605895996, + "learning_rate": 1.0368508239872933e-05, + "loss": 1.6767, + "step": 43360 + }, + { + "epoch": 1.5419073860101324, + "grad_norm": 1.803480625152588, + "learning_rate": 1.0364591769994484e-05, + "loss": 1.6946, + "step": 43370 + }, + { + "epoch": 1.5422629099635587, + "grad_norm": 1.8548345565795898, + "learning_rate": 1.0360675244117024e-05, + "loss": 1.6889, + "step": 43380 + }, + { + "epoch": 1.542618433916985, + "grad_norm": 1.9906445741653442, + "learning_rate": 1.0356758662842106e-05, + "loss": 1.6794, + "step": 43390 + }, + { + "epoch": 1.5429739578704114, + "grad_norm": 2.089935302734375, + "learning_rate": 1.0352842026771295e-05, + "loss": 1.6765, + "step": 43400 + }, + { + "epoch": 1.543329481823838, + "grad_norm": 1.9084968566894531, + "learning_rate": 1.034892533650616e-05, + "loss": 1.718, + "step": 43410 + }, + { + "epoch": 1.5436850057772642, + "grad_norm": 1.7907205820083618, + "learning_rate": 1.0345008592648282e-05, + "loss": 1.6484, + "step": 43420 + }, + { + "epoch": 1.5440405297306907, + "grad_norm": 1.8574295043945312, + "learning_rate": 1.0341091795799247e-05, + "loss": 1.6807, + "step": 43430 + }, + { + "epoch": 1.544396053684117, + "grad_norm": 1.7244060039520264, + "learning_rate": 1.0337174946560652e-05, + "loss": 1.6497, + "step": 43440 + }, + { + "epoch": 1.5447515776375433, + "grad_norm": 1.7605082988739014, + "learning_rate": 1.03332580455341e-05, + "loss": 1.6575, + "step": 43450 + }, + { + "epoch": 1.5451071015909696, + "grad_norm": 1.7906380891799927, + "learning_rate": 1.03293410933212e-05, + "loss": 1.6667, + "step": 43460 + }, + { + "epoch": 1.545462625544396, + "grad_norm": 2.0502443313598633, + "learning_rate": 1.0325424090523573e-05, + "loss": 1.6421, + "step": 43470 + }, + { + "epoch": 1.5458181494978223, + "grad_norm": 1.7937922477722168, + "learning_rate": 1.0321507037742846e-05, + "loss": 1.6061, + "step": 43480 + }, + { + "epoch": 1.5461736734512488, + "grad_norm": 1.8461755514144897, + "learning_rate": 1.0317589935580654e-05, + "loss": 1.6319, + "step": 43490 + }, + { + "epoch": 1.546529197404675, + "grad_norm": 1.803354263305664, + "learning_rate": 1.031367278463864e-05, + "loss": 1.7119, + "step": 43500 + }, + { + "epoch": 1.5468847213581016, + "grad_norm": 1.878377079963684, + "learning_rate": 1.0309755585518453e-05, + "loss": 1.6353, + "step": 43510 + }, + { + "epoch": 1.547240245311528, + "grad_norm": 1.8846303224563599, + "learning_rate": 1.030583833882175e-05, + "loss": 1.6527, + "step": 43520 + }, + { + "epoch": 1.5475957692649542, + "grad_norm": 1.7359410524368286, + "learning_rate": 1.0301921045150196e-05, + "loss": 1.6628, + "step": 43530 + }, + { + "epoch": 1.5479512932183805, + "grad_norm": 1.7824934720993042, + "learning_rate": 1.0298003705105462e-05, + "loss": 1.6601, + "step": 43540 + }, + { + "epoch": 1.5483068171718068, + "grad_norm": 1.8383424282073975, + "learning_rate": 1.0294086319289227e-05, + "loss": 1.6773, + "step": 43550 + }, + { + "epoch": 1.5486623411252332, + "grad_norm": 1.9958280324935913, + "learning_rate": 1.029016888830318e-05, + "loss": 1.651, + "step": 43560 + }, + { + "epoch": 1.5490178650786597, + "grad_norm": 1.8739871978759766, + "learning_rate": 1.0286251412749009e-05, + "loss": 1.6578, + "step": 43570 + }, + { + "epoch": 1.549373389032086, + "grad_norm": 1.660166621208191, + "learning_rate": 1.0282333893228413e-05, + "loss": 1.6737, + "step": 43580 + }, + { + "epoch": 1.5497289129855125, + "grad_norm": 1.8416285514831543, + "learning_rate": 1.0278416330343104e-05, + "loss": 1.6798, + "step": 43590 + }, + { + "epoch": 1.5500844369389388, + "grad_norm": 1.8943604230880737, + "learning_rate": 1.0274498724694792e-05, + "loss": 1.6736, + "step": 43600 + }, + { + "epoch": 1.5504399608923651, + "grad_norm": 1.836141586303711, + "learning_rate": 1.02705810768852e-05, + "loss": 1.6376, + "step": 43610 + }, + { + "epoch": 1.5507954848457914, + "grad_norm": 1.83307945728302, + "learning_rate": 1.0266663387516047e-05, + "loss": 1.6696, + "step": 43620 + }, + { + "epoch": 1.5511510087992177, + "grad_norm": 1.7649224996566772, + "learning_rate": 1.0262745657189073e-05, + "loss": 1.6532, + "step": 43630 + }, + { + "epoch": 1.551506532752644, + "grad_norm": 1.9184762239456177, + "learning_rate": 1.025882788650601e-05, + "loss": 1.624, + "step": 43640 + }, + { + "epoch": 1.5518620567060706, + "grad_norm": 1.6937661170959473, + "learning_rate": 1.025491007606861e-05, + "loss": 1.6886, + "step": 43650 + }, + { + "epoch": 1.5522175806594969, + "grad_norm": 1.9641646146774292, + "learning_rate": 1.0250992226478618e-05, + "loss": 1.6505, + "step": 43660 + }, + { + "epoch": 1.5525731046129234, + "grad_norm": 1.8957557678222656, + "learning_rate": 1.0247074338337793e-05, + "loss": 1.6816, + "step": 43670 + }, + { + "epoch": 1.5529286285663497, + "grad_norm": 1.9250448942184448, + "learning_rate": 1.0243156412247901e-05, + "loss": 1.6568, + "step": 43680 + }, + { + "epoch": 1.553284152519776, + "grad_norm": 1.8737907409667969, + "learning_rate": 1.0239238448810711e-05, + "loss": 1.6421, + "step": 43690 + }, + { + "epoch": 1.5536396764732023, + "grad_norm": 1.9514378309249878, + "learning_rate": 1.023532044862799e-05, + "loss": 1.6403, + "step": 43700 + }, + { + "epoch": 1.5539952004266286, + "grad_norm": 1.8503978252410889, + "learning_rate": 1.0231402412301526e-05, + "loss": 1.698, + "step": 43710 + }, + { + "epoch": 1.554350724380055, + "grad_norm": 1.8692011833190918, + "learning_rate": 1.0227484340433102e-05, + "loss": 1.6409, + "step": 43720 + }, + { + "epoch": 1.5547062483334815, + "grad_norm": 1.9184273481369019, + "learning_rate": 1.022356623362451e-05, + "loss": 1.6459, + "step": 43730 + }, + { + "epoch": 1.5550617722869078, + "grad_norm": 1.9160983562469482, + "learning_rate": 1.0219648092477545e-05, + "loss": 1.6866, + "step": 43740 + }, + { + "epoch": 1.5554172962403343, + "grad_norm": 1.8987330198287964, + "learning_rate": 1.0215729917594013e-05, + "loss": 1.6661, + "step": 43750 + }, + { + "epoch": 1.5557728201937606, + "grad_norm": 1.7964797019958496, + "learning_rate": 1.0211811709575717e-05, + "loss": 1.6423, + "step": 43760 + }, + { + "epoch": 1.556128344147187, + "grad_norm": 1.8204917907714844, + "learning_rate": 1.020789346902447e-05, + "loss": 1.7091, + "step": 43770 + }, + { + "epoch": 1.5564838681006132, + "grad_norm": 1.8866429328918457, + "learning_rate": 1.020397519654209e-05, + "loss": 1.6482, + "step": 43780 + }, + { + "epoch": 1.5568393920540395, + "grad_norm": 1.9378949403762817, + "learning_rate": 1.0200056892730399e-05, + "loss": 1.6639, + "step": 43790 + }, + { + "epoch": 1.5571949160074658, + "grad_norm": 1.861153483390808, + "learning_rate": 1.0196138558191222e-05, + "loss": 1.6352, + "step": 43800 + }, + { + "epoch": 1.5575504399608924, + "grad_norm": 1.7982897758483887, + "learning_rate": 1.0192220193526395e-05, + "loss": 1.6641, + "step": 43810 + }, + { + "epoch": 1.5579059639143187, + "grad_norm": 1.8335530757904053, + "learning_rate": 1.018830179933775e-05, + "loss": 1.6008, + "step": 43820 + }, + { + "epoch": 1.5582614878677452, + "grad_norm": 1.9274227619171143, + "learning_rate": 1.0184383376227128e-05, + "loss": 1.6588, + "step": 43830 + }, + { + "epoch": 1.5586170118211715, + "grad_norm": 1.942944884300232, + "learning_rate": 1.0180464924796376e-05, + "loss": 1.6425, + "step": 43840 + }, + { + "epoch": 1.5589725357745978, + "grad_norm": 1.8403658866882324, + "learning_rate": 1.0176546445647346e-05, + "loss": 1.6101, + "step": 43850 + }, + { + "epoch": 1.5593280597280241, + "grad_norm": 1.8369762897491455, + "learning_rate": 1.0172627939381885e-05, + "loss": 1.6762, + "step": 43860 + }, + { + "epoch": 1.5596835836814504, + "grad_norm": 1.7631001472473145, + "learning_rate": 1.016870940660186e-05, + "loss": 1.6593, + "step": 43870 + }, + { + "epoch": 1.5600391076348767, + "grad_norm": 2.2434041500091553, + "learning_rate": 1.0164790847909122e-05, + "loss": 1.6731, + "step": 43880 + }, + { + "epoch": 1.5603946315883033, + "grad_norm": 1.7569379806518555, + "learning_rate": 1.0160872263905545e-05, + "loss": 1.6719, + "step": 43890 + }, + { + "epoch": 1.5607501555417296, + "grad_norm": 1.9306408166885376, + "learning_rate": 1.0156953655192998e-05, + "loss": 1.7287, + "step": 43900 + }, + { + "epoch": 1.561105679495156, + "grad_norm": 1.816402554512024, + "learning_rate": 1.0153035022373352e-05, + "loss": 1.6788, + "step": 43910 + }, + { + "epoch": 1.5614612034485824, + "grad_norm": 1.8105896711349487, + "learning_rate": 1.0149116366048488e-05, + "loss": 1.6809, + "step": 43920 + }, + { + "epoch": 1.5618167274020087, + "grad_norm": 1.7978081703186035, + "learning_rate": 1.0145197686820285e-05, + "loss": 1.6415, + "step": 43930 + }, + { + "epoch": 1.562172251355435, + "grad_norm": 1.8552039861679077, + "learning_rate": 1.014127898529063e-05, + "loss": 1.6978, + "step": 43940 + }, + { + "epoch": 1.5625277753088613, + "grad_norm": 2.05350923538208, + "learning_rate": 1.0137360262061404e-05, + "loss": 1.6772, + "step": 43950 + }, + { + "epoch": 1.5628832992622876, + "grad_norm": 1.7288767099380493, + "learning_rate": 1.0133441517734504e-05, + "loss": 1.6627, + "step": 43960 + }, + { + "epoch": 1.5632388232157142, + "grad_norm": 1.8969210386276245, + "learning_rate": 1.0129522752911824e-05, + "loss": 1.6602, + "step": 43970 + }, + { + "epoch": 1.5635943471691405, + "grad_norm": 1.8368299007415771, + "learning_rate": 1.012560396819526e-05, + "loss": 1.7052, + "step": 43980 + }, + { + "epoch": 1.563949871122567, + "grad_norm": 2.012700319290161, + "learning_rate": 1.0121685164186719e-05, + "loss": 1.6564, + "step": 43990 + }, + { + "epoch": 1.5643053950759933, + "grad_norm": 2.0532262325286865, + "learning_rate": 1.0117766341488096e-05, + "loss": 1.674, + "step": 44000 + }, + { + "epoch": 1.5646609190294196, + "grad_norm": 1.783205270767212, + "learning_rate": 1.0113847500701304e-05, + "loss": 1.6896, + "step": 44010 + }, + { + "epoch": 1.565016442982846, + "grad_norm": 1.8086717128753662, + "learning_rate": 1.0109928642428245e-05, + "loss": 1.6704, + "step": 44020 + }, + { + "epoch": 1.5653719669362722, + "grad_norm": 1.8439959287643433, + "learning_rate": 1.0106009767270839e-05, + "loss": 1.6411, + "step": 44030 + }, + { + "epoch": 1.5657274908896985, + "grad_norm": 1.915297269821167, + "learning_rate": 1.0102090875830997e-05, + "loss": 1.6614, + "step": 44040 + }, + { + "epoch": 1.566083014843125, + "grad_norm": 1.895838737487793, + "learning_rate": 1.0098171968710634e-05, + "loss": 1.7175, + "step": 44050 + }, + { + "epoch": 1.5664385387965514, + "grad_norm": 1.8406438827514648, + "learning_rate": 1.0094253046511678e-05, + "loss": 1.6552, + "step": 44060 + }, + { + "epoch": 1.566794062749978, + "grad_norm": 2.7574586868286133, + "learning_rate": 1.009033410983604e-05, + "loss": 1.672, + "step": 44070 + }, + { + "epoch": 1.5671495867034042, + "grad_norm": 1.9110980033874512, + "learning_rate": 1.008641515928565e-05, + "loss": 1.6929, + "step": 44080 + }, + { + "epoch": 1.5675051106568305, + "grad_norm": 2.3096587657928467, + "learning_rate": 1.0082496195462434e-05, + "loss": 1.6427, + "step": 44090 + }, + { + "epoch": 1.5678606346102568, + "grad_norm": 1.798247218132019, + "learning_rate": 1.007857721896832e-05, + "loss": 1.6606, + "step": 44100 + }, + { + "epoch": 1.5682161585636831, + "grad_norm": 1.7407604455947876, + "learning_rate": 1.0074658230405237e-05, + "loss": 1.6417, + "step": 44110 + }, + { + "epoch": 1.5685716825171094, + "grad_norm": 1.8989253044128418, + "learning_rate": 1.007073923037512e-05, + "loss": 1.6634, + "step": 44120 + }, + { + "epoch": 1.568927206470536, + "grad_norm": 1.8761156797409058, + "learning_rate": 1.0066820219479899e-05, + "loss": 1.6444, + "step": 44130 + }, + { + "epoch": 1.5692827304239623, + "grad_norm": 1.8708429336547852, + "learning_rate": 1.006290119832151e-05, + "loss": 1.6565, + "step": 44140 + }, + { + "epoch": 1.5696382543773888, + "grad_norm": 1.939834475517273, + "learning_rate": 1.0058982167501892e-05, + "loss": 1.6318, + "step": 44150 + }, + { + "epoch": 1.569993778330815, + "grad_norm": 1.8468369245529175, + "learning_rate": 1.0055063127622985e-05, + "loss": 1.6896, + "step": 44160 + }, + { + "epoch": 1.5703493022842414, + "grad_norm": 1.8790068626403809, + "learning_rate": 1.0051144079286725e-05, + "loss": 1.6607, + "step": 44170 + }, + { + "epoch": 1.5707048262376677, + "grad_norm": 1.975175142288208, + "learning_rate": 1.0047225023095056e-05, + "loss": 1.6839, + "step": 44180 + }, + { + "epoch": 1.571060350191094, + "grad_norm": 1.8378052711486816, + "learning_rate": 1.0043305959649921e-05, + "loss": 1.6619, + "step": 44190 + }, + { + "epoch": 1.5714158741445203, + "grad_norm": 1.7882580757141113, + "learning_rate": 1.003938688955326e-05, + "loss": 1.7007, + "step": 44200 + }, + { + "epoch": 1.5717713980979469, + "grad_norm": 1.7554378509521484, + "learning_rate": 1.0035467813407019e-05, + "loss": 1.6374, + "step": 44210 + }, + { + "epoch": 1.5721269220513732, + "grad_norm": 1.930601716041565, + "learning_rate": 1.0031548731813143e-05, + "loss": 1.6368, + "step": 44220 + }, + { + "epoch": 1.5724824460047997, + "grad_norm": 1.8473033905029297, + "learning_rate": 1.0027629645373582e-05, + "loss": 1.6585, + "step": 44230 + }, + { + "epoch": 1.572837969958226, + "grad_norm": 1.7615290880203247, + "learning_rate": 1.0023710554690282e-05, + "loss": 1.6343, + "step": 44240 + }, + { + "epoch": 1.5731934939116523, + "grad_norm": 1.8404291868209839, + "learning_rate": 1.001979146036519e-05, + "loss": 1.6345, + "step": 44250 + }, + { + "epoch": 1.5735490178650786, + "grad_norm": 1.7335777282714844, + "learning_rate": 1.0015872363000252e-05, + "loss": 1.66, + "step": 44260 + }, + { + "epoch": 1.573904541818505, + "grad_norm": 1.8358724117279053, + "learning_rate": 1.0011953263197418e-05, + "loss": 1.6428, + "step": 44270 + }, + { + "epoch": 1.5742600657719312, + "grad_norm": 1.8799185752868652, + "learning_rate": 1.0008034161558637e-05, + "loss": 1.6114, + "step": 44280 + }, + { + "epoch": 1.5746155897253578, + "grad_norm": 1.9736826419830322, + "learning_rate": 1.0004115058685859e-05, + "loss": 1.6545, + "step": 44290 + }, + { + "epoch": 1.574971113678784, + "grad_norm": 1.757722020149231, + "learning_rate": 1.0000195955181037e-05, + "loss": 1.6981, + "step": 44300 + }, + { + "epoch": 1.5753266376322106, + "grad_norm": 1.8227778673171997, + "learning_rate": 9.996276851646118e-06, + "loss": 1.6525, + "step": 44310 + }, + { + "epoch": 1.575682161585637, + "grad_norm": 1.8586345911026, + "learning_rate": 9.992357748683047e-06, + "loss": 1.6699, + "step": 44320 + }, + { + "epoch": 1.5760376855390632, + "grad_norm": 1.9969902038574219, + "learning_rate": 9.98843864689378e-06, + "loss": 1.6382, + "step": 44330 + }, + { + "epoch": 1.5763932094924895, + "grad_norm": 1.8302338123321533, + "learning_rate": 9.984519546880262e-06, + "loss": 1.6258, + "step": 44340 + }, + { + "epoch": 1.5767487334459158, + "grad_norm": 1.8657716512680054, + "learning_rate": 9.980600449244446e-06, + "loss": 1.6407, + "step": 44350 + }, + { + "epoch": 1.5771042573993421, + "grad_norm": 1.9813754558563232, + "learning_rate": 9.97668135458828e-06, + "loss": 1.6302, + "step": 44360 + }, + { + "epoch": 1.5774597813527687, + "grad_norm": 1.9710670709609985, + "learning_rate": 9.972762263513713e-06, + "loss": 1.6526, + "step": 44370 + }, + { + "epoch": 1.577815305306195, + "grad_norm": 1.9619462490081787, + "learning_rate": 9.96884317662269e-06, + "loss": 1.6125, + "step": 44380 + }, + { + "epoch": 1.5781708292596215, + "grad_norm": 1.7616794109344482, + "learning_rate": 9.964924094517158e-06, + "loss": 1.6731, + "step": 44390 + }, + { + "epoch": 1.5785263532130478, + "grad_norm": 1.8521478176116943, + "learning_rate": 9.961005017799068e-06, + "loss": 1.6455, + "step": 44400 + }, + { + "epoch": 1.578881877166474, + "grad_norm": 1.7463682889938354, + "learning_rate": 9.957085947070361e-06, + "loss": 1.6594, + "step": 44410 + }, + { + "epoch": 1.5792374011199004, + "grad_norm": 1.8019137382507324, + "learning_rate": 9.953166882932985e-06, + "loss": 1.6461, + "step": 44420 + }, + { + "epoch": 1.5795929250733267, + "grad_norm": 1.9217017889022827, + "learning_rate": 9.949247825988881e-06, + "loss": 1.6513, + "step": 44430 + }, + { + "epoch": 1.579948449026753, + "grad_norm": 1.771942138671875, + "learning_rate": 9.945328776839994e-06, + "loss": 1.6564, + "step": 44440 + }, + { + "epoch": 1.5803039729801795, + "grad_norm": 1.7770013809204102, + "learning_rate": 9.941409736088266e-06, + "loss": 1.6698, + "step": 44450 + }, + { + "epoch": 1.5806594969336059, + "grad_norm": 1.9023828506469727, + "learning_rate": 9.937490704335631e-06, + "loss": 1.6992, + "step": 44460 + }, + { + "epoch": 1.5810150208870324, + "grad_norm": 1.9144151210784912, + "learning_rate": 9.933571682184034e-06, + "loss": 1.6413, + "step": 44470 + }, + { + "epoch": 1.5813705448404587, + "grad_norm": 1.8934605121612549, + "learning_rate": 9.929652670235409e-06, + "loss": 1.6479, + "step": 44480 + }, + { + "epoch": 1.581726068793885, + "grad_norm": 1.8610419034957886, + "learning_rate": 9.925733669091692e-06, + "loss": 1.6547, + "step": 44490 + }, + { + "epoch": 1.5820815927473113, + "grad_norm": 1.870526909828186, + "learning_rate": 9.92181467935482e-06, + "loss": 1.6422, + "step": 44500 + }, + { + "epoch": 1.5824371167007376, + "grad_norm": 1.9301140308380127, + "learning_rate": 9.917895701626721e-06, + "loss": 1.635, + "step": 44510 + }, + { + "epoch": 1.582792640654164, + "grad_norm": 1.8191717863082886, + "learning_rate": 9.913976736509328e-06, + "loss": 1.6568, + "step": 44520 + }, + { + "epoch": 1.5831481646075904, + "grad_norm": 1.8834309577941895, + "learning_rate": 9.910057784604568e-06, + "loss": 1.666, + "step": 44530 + }, + { + "epoch": 1.5835036885610168, + "grad_norm": 1.832620620727539, + "learning_rate": 9.90613884651437e-06, + "loss": 1.6647, + "step": 44540 + }, + { + "epoch": 1.5838592125144433, + "grad_norm": 1.8681204319000244, + "learning_rate": 9.902219922840658e-06, + "loss": 1.6673, + "step": 44550 + }, + { + "epoch": 1.5842147364678696, + "grad_norm": 1.897397756576538, + "learning_rate": 9.898301014185351e-06, + "loss": 1.6643, + "step": 44560 + }, + { + "epoch": 1.584570260421296, + "grad_norm": 1.8625174760818481, + "learning_rate": 9.89438212115037e-06, + "loss": 1.6968, + "step": 44570 + }, + { + "epoch": 1.5849257843747222, + "grad_norm": 1.749588131904602, + "learning_rate": 9.890463244337633e-06, + "loss": 1.6315, + "step": 44580 + }, + { + "epoch": 1.5852813083281485, + "grad_norm": 1.8423128128051758, + "learning_rate": 9.886544384349053e-06, + "loss": 1.6436, + "step": 44590 + }, + { + "epoch": 1.5856368322815748, + "grad_norm": 1.7982268333435059, + "learning_rate": 9.882625541786548e-06, + "loss": 1.6562, + "step": 44600 + }, + { + "epoch": 1.5859923562350013, + "grad_norm": 1.850053310394287, + "learning_rate": 9.878706717252025e-06, + "loss": 1.6519, + "step": 44610 + }, + { + "epoch": 1.5863478801884277, + "grad_norm": 1.954933524131775, + "learning_rate": 9.874787911347388e-06, + "loss": 1.6159, + "step": 44620 + }, + { + "epoch": 1.5867034041418542, + "grad_norm": 2.061009645462036, + "learning_rate": 9.870869124674543e-06, + "loss": 1.6882, + "step": 44630 + }, + { + "epoch": 1.5870589280952805, + "grad_norm": 1.9090464115142822, + "learning_rate": 9.866950357835388e-06, + "loss": 1.6539, + "step": 44640 + }, + { + "epoch": 1.5874144520487068, + "grad_norm": 1.8968024253845215, + "learning_rate": 9.863031611431826e-06, + "loss": 1.6469, + "step": 44650 + }, + { + "epoch": 1.587769976002133, + "grad_norm": 1.9923118352890015, + "learning_rate": 9.859112886065748e-06, + "loss": 1.6424, + "step": 44660 + }, + { + "epoch": 1.5881254999555594, + "grad_norm": 1.6754106283187866, + "learning_rate": 9.855194182339048e-06, + "loss": 1.6609, + "step": 44670 + }, + { + "epoch": 1.5884810239089857, + "grad_norm": 1.7623523473739624, + "learning_rate": 9.851275500853616e-06, + "loss": 1.647, + "step": 44680 + }, + { + "epoch": 1.5888365478624122, + "grad_norm": 1.912752628326416, + "learning_rate": 9.847356842211332e-06, + "loss": 1.652, + "step": 44690 + }, + { + "epoch": 1.5891920718158385, + "grad_norm": 2.1574203968048096, + "learning_rate": 9.84343820701408e-06, + "loss": 1.6735, + "step": 44700 + }, + { + "epoch": 1.589547595769265, + "grad_norm": 1.8257741928100586, + "learning_rate": 9.839519595863738e-06, + "loss": 1.6568, + "step": 44710 + }, + { + "epoch": 1.5899031197226914, + "grad_norm": 2.158803701400757, + "learning_rate": 9.83560100936218e-06, + "loss": 1.6592, + "step": 44720 + }, + { + "epoch": 1.5902586436761177, + "grad_norm": 1.89064359664917, + "learning_rate": 9.831682448111278e-06, + "loss": 1.6566, + "step": 44730 + }, + { + "epoch": 1.590614167629544, + "grad_norm": 1.7633954286575317, + "learning_rate": 9.827763912712895e-06, + "loss": 1.6794, + "step": 44740 + }, + { + "epoch": 1.5909696915829703, + "grad_norm": 1.973594069480896, + "learning_rate": 9.823845403768894e-06, + "loss": 1.6717, + "step": 44750 + }, + { + "epoch": 1.5913252155363966, + "grad_norm": 1.7964091300964355, + "learning_rate": 9.819926921881136e-06, + "loss": 1.6958, + "step": 44760 + }, + { + "epoch": 1.5916807394898231, + "grad_norm": 1.911806344985962, + "learning_rate": 9.816008467651477e-06, + "loss": 1.6587, + "step": 44770 + }, + { + "epoch": 1.5920362634432494, + "grad_norm": 1.8351384401321411, + "learning_rate": 9.81209004168176e-06, + "loss": 1.6602, + "step": 44780 + }, + { + "epoch": 1.592391787396676, + "grad_norm": 1.9146497249603271, + "learning_rate": 9.808171644573832e-06, + "loss": 1.6896, + "step": 44790 + }, + { + "epoch": 1.5927473113501023, + "grad_norm": 1.922183871269226, + "learning_rate": 9.80425327692954e-06, + "loss": 1.6549, + "step": 44800 + }, + { + "epoch": 1.5931028353035286, + "grad_norm": 1.9478572607040405, + "learning_rate": 9.800334939350717e-06, + "loss": 1.6179, + "step": 44810 + }, + { + "epoch": 1.593458359256955, + "grad_norm": 1.950107455253601, + "learning_rate": 9.796416632439193e-06, + "loss": 1.6645, + "step": 44820 + }, + { + "epoch": 1.5938138832103812, + "grad_norm": 2.008389472961426, + "learning_rate": 9.7924983567968e-06, + "loss": 1.6387, + "step": 44830 + }, + { + "epoch": 1.5941694071638075, + "grad_norm": 1.8459104299545288, + "learning_rate": 9.788580113025358e-06, + "loss": 1.6924, + "step": 44840 + }, + { + "epoch": 1.594524931117234, + "grad_norm": 1.7717188596725464, + "learning_rate": 9.784661901726684e-06, + "loss": 1.6483, + "step": 44850 + }, + { + "epoch": 1.5948804550706603, + "grad_norm": 1.813963532447815, + "learning_rate": 9.780743723502594e-06, + "loss": 1.6854, + "step": 44860 + }, + { + "epoch": 1.5952359790240869, + "grad_norm": 1.8415963649749756, + "learning_rate": 9.776825578954891e-06, + "loss": 1.6045, + "step": 44870 + }, + { + "epoch": 1.5955915029775132, + "grad_norm": 2.128248453140259, + "learning_rate": 9.772907468685381e-06, + "loss": 1.6182, + "step": 44880 + }, + { + "epoch": 1.5959470269309395, + "grad_norm": 2.024554491043091, + "learning_rate": 9.768989393295858e-06, + "loss": 1.6417, + "step": 44890 + }, + { + "epoch": 1.5963025508843658, + "grad_norm": 1.7415904998779297, + "learning_rate": 9.765071353388119e-06, + "loss": 1.6527, + "step": 44900 + }, + { + "epoch": 1.596658074837792, + "grad_norm": 1.8566675186157227, + "learning_rate": 9.761153349563947e-06, + "loss": 1.6412, + "step": 44910 + }, + { + "epoch": 1.5970135987912184, + "grad_norm": 1.8245688676834106, + "learning_rate": 9.75723538242512e-06, + "loss": 1.6264, + "step": 44920 + }, + { + "epoch": 1.597369122744645, + "grad_norm": 1.8760472536087036, + "learning_rate": 9.75331745257342e-06, + "loss": 1.6614, + "step": 44930 + }, + { + "epoch": 1.5977246466980712, + "grad_norm": 2.0607192516326904, + "learning_rate": 9.749399560610612e-06, + "loss": 1.6491, + "step": 44940 + }, + { + "epoch": 1.5980801706514978, + "grad_norm": 2.002857208251953, + "learning_rate": 9.745481707138458e-06, + "loss": 1.6838, + "step": 44950 + }, + { + "epoch": 1.598435694604924, + "grad_norm": 1.7652947902679443, + "learning_rate": 9.741563892758717e-06, + "loss": 1.6937, + "step": 44960 + }, + { + "epoch": 1.5987912185583504, + "grad_norm": 1.86318838596344, + "learning_rate": 9.737646118073143e-06, + "loss": 1.6976, + "step": 44970 + }, + { + "epoch": 1.5991467425117767, + "grad_norm": 1.9008982181549072, + "learning_rate": 9.73372838368348e-06, + "loss": 1.6898, + "step": 44980 + }, + { + "epoch": 1.599502266465203, + "grad_norm": 1.8608051538467407, + "learning_rate": 9.729810690191468e-06, + "loss": 1.6505, + "step": 44990 + }, + { + "epoch": 1.5998577904186293, + "grad_norm": 1.8600138425827026, + "learning_rate": 9.72589303819884e-06, + "loss": 1.6205, + "step": 45000 + }, + { + "epoch": 1.6002133143720558, + "grad_norm": 1.96610689163208, + "learning_rate": 9.721975428307322e-06, + "loss": 1.6532, + "step": 45010 + }, + { + "epoch": 1.6005688383254821, + "grad_norm": 1.90450119972229, + "learning_rate": 9.718057861118634e-06, + "loss": 1.6325, + "step": 45020 + }, + { + "epoch": 1.6009243622789087, + "grad_norm": 1.9000378847122192, + "learning_rate": 9.714140337234493e-06, + "loss": 1.6688, + "step": 45030 + }, + { + "epoch": 1.601279886232335, + "grad_norm": 1.8246891498565674, + "learning_rate": 9.710222857256605e-06, + "loss": 1.6716, + "step": 45040 + }, + { + "epoch": 1.6016354101857613, + "grad_norm": 2.0795390605926514, + "learning_rate": 9.706305421786666e-06, + "loss": 1.6275, + "step": 45050 + }, + { + "epoch": 1.6019909341391876, + "grad_norm": 1.9774959087371826, + "learning_rate": 9.702388031426373e-06, + "loss": 1.626, + "step": 45060 + }, + { + "epoch": 1.602346458092614, + "grad_norm": 1.8395792245864868, + "learning_rate": 9.698470686777414e-06, + "loss": 1.6671, + "step": 45070 + }, + { + "epoch": 1.6027019820460402, + "grad_norm": 1.8831536769866943, + "learning_rate": 9.694553388441466e-06, + "loss": 1.6759, + "step": 45080 + }, + { + "epoch": 1.6030575059994667, + "grad_norm": 1.9469261169433594, + "learning_rate": 9.690636137020207e-06, + "loss": 1.6633, + "step": 45090 + }, + { + "epoch": 1.603413029952893, + "grad_norm": 1.8184782266616821, + "learning_rate": 9.68671893311529e-06, + "loss": 1.6192, + "step": 45100 + }, + { + "epoch": 1.6037685539063196, + "grad_norm": 1.8013615608215332, + "learning_rate": 9.682801777328386e-06, + "loss": 1.6433, + "step": 45110 + }, + { + "epoch": 1.6041240778597459, + "grad_norm": 1.8400076627731323, + "learning_rate": 9.678884670261138e-06, + "loss": 1.6392, + "step": 45120 + }, + { + "epoch": 1.6044796018131722, + "grad_norm": 1.9208611249923706, + "learning_rate": 9.674967612515192e-06, + "loss": 1.6094, + "step": 45130 + }, + { + "epoch": 1.6048351257665985, + "grad_norm": 1.9665645360946655, + "learning_rate": 9.67105060469218e-06, + "loss": 1.6943, + "step": 45140 + }, + { + "epoch": 1.6051906497200248, + "grad_norm": 1.8760857582092285, + "learning_rate": 9.667133647393736e-06, + "loss": 1.7079, + "step": 45150 + }, + { + "epoch": 1.605546173673451, + "grad_norm": 1.9586379528045654, + "learning_rate": 9.663216741221474e-06, + "loss": 1.6676, + "step": 45160 + }, + { + "epoch": 1.6059016976268776, + "grad_norm": 1.9238276481628418, + "learning_rate": 9.659299886777012e-06, + "loss": 1.6716, + "step": 45170 + }, + { + "epoch": 1.606257221580304, + "grad_norm": 1.8094438314437866, + "learning_rate": 9.655383084661949e-06, + "loss": 1.6769, + "step": 45180 + }, + { + "epoch": 1.6066127455337305, + "grad_norm": 1.9990025758743286, + "learning_rate": 9.651466335477883e-06, + "loss": 1.6519, + "step": 45190 + }, + { + "epoch": 1.6069682694871568, + "grad_norm": 2.1307549476623535, + "learning_rate": 9.647549639826402e-06, + "loss": 1.644, + "step": 45200 + }, + { + "epoch": 1.607323793440583, + "grad_norm": 1.9053353071212769, + "learning_rate": 9.643632998309087e-06, + "loss": 1.6658, + "step": 45210 + }, + { + "epoch": 1.6076793173940094, + "grad_norm": 1.9088976383209229, + "learning_rate": 9.63971641152751e-06, + "loss": 1.6273, + "step": 45220 + }, + { + "epoch": 1.6080348413474357, + "grad_norm": 1.8036609888076782, + "learning_rate": 9.635799880083233e-06, + "loss": 1.6782, + "step": 45230 + }, + { + "epoch": 1.608390365300862, + "grad_norm": 2.0587072372436523, + "learning_rate": 9.631883404577812e-06, + "loss": 1.682, + "step": 45240 + }, + { + "epoch": 1.6087458892542885, + "grad_norm": 1.741700530052185, + "learning_rate": 9.627966985612794e-06, + "loss": 1.6443, + "step": 45250 + }, + { + "epoch": 1.6091014132077148, + "grad_norm": 1.8790136575698853, + "learning_rate": 9.624050623789711e-06, + "loss": 1.6095, + "step": 45260 + }, + { + "epoch": 1.6094569371611414, + "grad_norm": 1.8529118299484253, + "learning_rate": 9.620134319710095e-06, + "loss": 1.6545, + "step": 45270 + }, + { + "epoch": 1.6098124611145677, + "grad_norm": 1.9063934087753296, + "learning_rate": 9.616218073975465e-06, + "loss": 1.6814, + "step": 45280 + }, + { + "epoch": 1.610167985067994, + "grad_norm": 1.8092721700668335, + "learning_rate": 9.612301887187332e-06, + "loss": 1.6598, + "step": 45290 + }, + { + "epoch": 1.6105235090214203, + "grad_norm": 1.9182853698730469, + "learning_rate": 9.608385759947201e-06, + "loss": 1.6163, + "step": 45300 + }, + { + "epoch": 1.6108790329748466, + "grad_norm": 1.8925962448120117, + "learning_rate": 9.60446969285656e-06, + "loss": 1.6523, + "step": 45310 + }, + { + "epoch": 1.6112345569282729, + "grad_norm": 1.7887133359909058, + "learning_rate": 9.600553686516894e-06, + "loss": 1.6825, + "step": 45320 + }, + { + "epoch": 1.6115900808816994, + "grad_norm": 1.8483270406723022, + "learning_rate": 9.596637741529678e-06, + "loss": 1.6532, + "step": 45330 + }, + { + "epoch": 1.6119456048351257, + "grad_norm": 2.0222012996673584, + "learning_rate": 9.592721858496376e-06, + "loss": 1.6712, + "step": 45340 + }, + { + "epoch": 1.6123011287885523, + "grad_norm": 1.7934224605560303, + "learning_rate": 9.588806038018444e-06, + "loss": 1.6788, + "step": 45350 + }, + { + "epoch": 1.6126566527419786, + "grad_norm": 1.889206051826477, + "learning_rate": 9.584890280697325e-06, + "loss": 1.6501, + "step": 45360 + }, + { + "epoch": 1.6130121766954049, + "grad_norm": 1.8354737758636475, + "learning_rate": 9.580974587134454e-06, + "loss": 1.6405, + "step": 45370 + }, + { + "epoch": 1.6133677006488312, + "grad_norm": 1.8703527450561523, + "learning_rate": 9.577058957931261e-06, + "loss": 1.6734, + "step": 45380 + }, + { + "epoch": 1.6137232246022575, + "grad_norm": 2.211743116378784, + "learning_rate": 9.57314339368916e-06, + "loss": 1.6122, + "step": 45390 + }, + { + "epoch": 1.6140787485556838, + "grad_norm": 1.9559645652770996, + "learning_rate": 9.569227895009556e-06, + "loss": 1.6778, + "step": 45400 + }, + { + "epoch": 1.6144342725091103, + "grad_norm": 1.844721794128418, + "learning_rate": 9.565312462493853e-06, + "loss": 1.6679, + "step": 45410 + }, + { + "epoch": 1.6147897964625366, + "grad_norm": 1.7419184446334839, + "learning_rate": 9.561397096743424e-06, + "loss": 1.6205, + "step": 45420 + }, + { + "epoch": 1.6151453204159631, + "grad_norm": 1.9916986227035522, + "learning_rate": 9.557481798359653e-06, + "loss": 1.6342, + "step": 45430 + }, + { + "epoch": 1.6155008443693895, + "grad_norm": 1.785646915435791, + "learning_rate": 9.553566567943902e-06, + "loss": 1.6274, + "step": 45440 + }, + { + "epoch": 1.6158563683228158, + "grad_norm": 1.9873098134994507, + "learning_rate": 9.549651406097528e-06, + "loss": 1.6946, + "step": 45450 + }, + { + "epoch": 1.616211892276242, + "grad_norm": 1.9251453876495361, + "learning_rate": 9.545736313421873e-06, + "loss": 1.6585, + "step": 45460 + }, + { + "epoch": 1.6165674162296684, + "grad_norm": 1.9129831790924072, + "learning_rate": 9.541821290518272e-06, + "loss": 1.6093, + "step": 45470 + }, + { + "epoch": 1.6169229401830947, + "grad_norm": 1.938437581062317, + "learning_rate": 9.53790633798805e-06, + "loss": 1.6951, + "step": 45480 + }, + { + "epoch": 1.6172784641365212, + "grad_norm": 1.880732774734497, + "learning_rate": 9.533991456432514e-06, + "loss": 1.6648, + "step": 45490 + }, + { + "epoch": 1.6176339880899475, + "grad_norm": 1.8452805280685425, + "learning_rate": 9.53007664645297e-06, + "loss": 1.7125, + "step": 45500 + }, + { + "epoch": 1.617989512043374, + "grad_norm": 1.8619343042373657, + "learning_rate": 9.526161908650707e-06, + "loss": 1.6481, + "step": 45510 + }, + { + "epoch": 1.6183450359968004, + "grad_norm": 1.9093763828277588, + "learning_rate": 9.522247243627003e-06, + "loss": 1.6185, + "step": 45520 + }, + { + "epoch": 1.6187005599502267, + "grad_norm": 1.8485217094421387, + "learning_rate": 9.518332651983126e-06, + "loss": 1.644, + "step": 45530 + }, + { + "epoch": 1.619056083903653, + "grad_norm": 2.0003044605255127, + "learning_rate": 9.514418134320338e-06, + "loss": 1.7058, + "step": 45540 + }, + { + "epoch": 1.6194116078570793, + "grad_norm": 1.8701895475387573, + "learning_rate": 9.510503691239874e-06, + "loss": 1.6464, + "step": 45550 + }, + { + "epoch": 1.6197671318105056, + "grad_norm": 1.814052939414978, + "learning_rate": 9.506589323342977e-06, + "loss": 1.6613, + "step": 45560 + }, + { + "epoch": 1.620122655763932, + "grad_norm": 1.9273310899734497, + "learning_rate": 9.50267503123087e-06, + "loss": 1.6323, + "step": 45570 + }, + { + "epoch": 1.6204781797173584, + "grad_norm": 1.8216038942337036, + "learning_rate": 9.498760815504757e-06, + "loss": 1.6257, + "step": 45580 + }, + { + "epoch": 1.620833703670785, + "grad_norm": 1.8107649087905884, + "learning_rate": 9.494846676765837e-06, + "loss": 1.6397, + "step": 45590 + }, + { + "epoch": 1.6211892276242112, + "grad_norm": 1.8750101327896118, + "learning_rate": 9.490932615615303e-06, + "loss": 1.6932, + "step": 45600 + }, + { + "epoch": 1.6215447515776376, + "grad_norm": 1.7151609659194946, + "learning_rate": 9.487018632654326e-06, + "loss": 1.6466, + "step": 45610 + }, + { + "epoch": 1.6219002755310639, + "grad_norm": 2.0541727542877197, + "learning_rate": 9.483104728484071e-06, + "loss": 1.6535, + "step": 45620 + }, + { + "epoch": 1.6222557994844902, + "grad_norm": 1.9208871126174927, + "learning_rate": 9.479190903705689e-06, + "loss": 1.6895, + "step": 45630 + }, + { + "epoch": 1.6226113234379165, + "grad_norm": 1.9287028312683105, + "learning_rate": 9.475277158920317e-06, + "loss": 1.6471, + "step": 45640 + }, + { + "epoch": 1.622966847391343, + "grad_norm": 1.8937067985534668, + "learning_rate": 9.471363494729084e-06, + "loss": 1.6675, + "step": 45650 + }, + { + "epoch": 1.6233223713447693, + "grad_norm": 1.8728097677230835, + "learning_rate": 9.467449911733106e-06, + "loss": 1.6872, + "step": 45660 + }, + { + "epoch": 1.6236778952981958, + "grad_norm": 1.8582308292388916, + "learning_rate": 9.463536410533482e-06, + "loss": 1.6465, + "step": 45670 + }, + { + "epoch": 1.6240334192516221, + "grad_norm": 1.7875969409942627, + "learning_rate": 9.4596229917313e-06, + "loss": 1.6823, + "step": 45680 + }, + { + "epoch": 1.6243889432050485, + "grad_norm": 2.1125075817108154, + "learning_rate": 9.455709655927637e-06, + "loss": 1.6869, + "step": 45690 + }, + { + "epoch": 1.6247444671584748, + "grad_norm": 2.0550649166107178, + "learning_rate": 9.45179640372356e-06, + "loss": 1.676, + "step": 45700 + }, + { + "epoch": 1.625099991111901, + "grad_norm": 1.9072517156600952, + "learning_rate": 9.447883235720118e-06, + "loss": 1.6536, + "step": 45710 + }, + { + "epoch": 1.6254555150653274, + "grad_norm": 1.7906911373138428, + "learning_rate": 9.443970152518351e-06, + "loss": 1.6242, + "step": 45720 + }, + { + "epoch": 1.625811039018754, + "grad_norm": 1.8642396926879883, + "learning_rate": 9.440057154719282e-06, + "loss": 1.6641, + "step": 45730 + }, + { + "epoch": 1.6261665629721802, + "grad_norm": 1.772557258605957, + "learning_rate": 9.436144242923924e-06, + "loss": 1.6442, + "step": 45740 + }, + { + "epoch": 1.6265220869256067, + "grad_norm": 1.9297966957092285, + "learning_rate": 9.432231417733272e-06, + "loss": 1.6824, + "step": 45750 + }, + { + "epoch": 1.626877610879033, + "grad_norm": 1.942812204360962, + "learning_rate": 9.428318679748314e-06, + "loss": 1.6509, + "step": 45760 + }, + { + "epoch": 1.6272331348324593, + "grad_norm": 1.9203059673309326, + "learning_rate": 9.424406029570022e-06, + "loss": 1.656, + "step": 45770 + }, + { + "epoch": 1.6275886587858857, + "grad_norm": 1.7564131021499634, + "learning_rate": 9.420493467799356e-06, + "loss": 1.6264, + "step": 45780 + }, + { + "epoch": 1.627944182739312, + "grad_norm": 1.772635817527771, + "learning_rate": 9.416580995037261e-06, + "loss": 1.6301, + "step": 45790 + }, + { + "epoch": 1.6282997066927383, + "grad_norm": 1.7491180896759033, + "learning_rate": 9.412668611884666e-06, + "loss": 1.6372, + "step": 45800 + }, + { + "epoch": 1.6286552306461648, + "grad_norm": 1.8844412565231323, + "learning_rate": 9.408756318942489e-06, + "loss": 1.7025, + "step": 45810 + }, + { + "epoch": 1.629010754599591, + "grad_norm": 1.8353954553604126, + "learning_rate": 9.404844116811634e-06, + "loss": 1.6664, + "step": 45820 + }, + { + "epoch": 1.6293662785530176, + "grad_norm": 1.952030897140503, + "learning_rate": 9.400932006092992e-06, + "loss": 1.6324, + "step": 45830 + }, + { + "epoch": 1.629721802506444, + "grad_norm": 1.7468416690826416, + "learning_rate": 9.397019987387437e-06, + "loss": 1.6635, + "step": 45840 + }, + { + "epoch": 1.6300773264598702, + "grad_norm": 1.8611869812011719, + "learning_rate": 9.393108061295832e-06, + "loss": 1.6152, + "step": 45850 + }, + { + "epoch": 1.6304328504132966, + "grad_norm": 1.8374412059783936, + "learning_rate": 9.389196228419025e-06, + "loss": 1.6389, + "step": 45860 + }, + { + "epoch": 1.6307883743667229, + "grad_norm": 1.7448116540908813, + "learning_rate": 9.385284489357845e-06, + "loss": 1.6487, + "step": 45870 + }, + { + "epoch": 1.6311438983201492, + "grad_norm": 2.0443167686462402, + "learning_rate": 9.381372844713112e-06, + "loss": 1.6542, + "step": 45880 + }, + { + "epoch": 1.6314994222735757, + "grad_norm": 1.8457872867584229, + "learning_rate": 9.377461295085633e-06, + "loss": 1.6289, + "step": 45890 + }, + { + "epoch": 1.631854946227002, + "grad_norm": 1.85100257396698, + "learning_rate": 9.3735498410762e-06, + "loss": 1.6703, + "step": 45900 + }, + { + "epoch": 1.6322104701804285, + "grad_norm": 1.9700928926467896, + "learning_rate": 9.36963848328558e-06, + "loss": 1.6933, + "step": 45910 + }, + { + "epoch": 1.6325659941338548, + "grad_norm": 1.913554072380066, + "learning_rate": 9.365727222314537e-06, + "loss": 1.6647, + "step": 45920 + }, + { + "epoch": 1.6329215180872811, + "grad_norm": 1.9028133153915405, + "learning_rate": 9.361816058763815e-06, + "loss": 1.69, + "step": 45930 + }, + { + "epoch": 1.6332770420407074, + "grad_norm": 2.0402705669403076, + "learning_rate": 9.357904993234144e-06, + "loss": 1.666, + "step": 45940 + }, + { + "epoch": 1.6336325659941338, + "grad_norm": 1.7308645248413086, + "learning_rate": 9.353994026326239e-06, + "loss": 1.6441, + "step": 45950 + }, + { + "epoch": 1.63398808994756, + "grad_norm": 1.9061516523361206, + "learning_rate": 9.350083158640803e-06, + "loss": 1.6332, + "step": 45960 + }, + { + "epoch": 1.6343436139009866, + "grad_norm": 1.780673861503601, + "learning_rate": 9.346172390778519e-06, + "loss": 1.6511, + "step": 45970 + }, + { + "epoch": 1.634699137854413, + "grad_norm": 1.911218285560608, + "learning_rate": 9.342261723340057e-06, + "loss": 1.6229, + "step": 45980 + }, + { + "epoch": 1.6350546618078394, + "grad_norm": 1.9093784093856812, + "learning_rate": 9.338351156926068e-06, + "loss": 1.6529, + "step": 45990 + }, + { + "epoch": 1.6354101857612657, + "grad_norm": 1.9924782514572144, + "learning_rate": 9.334440692137194e-06, + "loss": 1.6688, + "step": 46000 + }, + { + "epoch": 1.635765709714692, + "grad_norm": 1.9529900550842285, + "learning_rate": 9.330530329574055e-06, + "loss": 1.6208, + "step": 46010 + }, + { + "epoch": 1.6361212336681183, + "grad_norm": 1.913313388824463, + "learning_rate": 9.326620069837261e-06, + "loss": 1.6739, + "step": 46020 + }, + { + "epoch": 1.6364767576215447, + "grad_norm": 1.8248666524887085, + "learning_rate": 9.322709913527405e-06, + "loss": 1.6198, + "step": 46030 + }, + { + "epoch": 1.636832281574971, + "grad_norm": 2.080410957336426, + "learning_rate": 9.318799861245056e-06, + "loss": 1.6236, + "step": 46040 + }, + { + "epoch": 1.6371878055283975, + "grad_norm": 1.839089274406433, + "learning_rate": 9.314889913590778e-06, + "loss": 1.6578, + "step": 46050 + }, + { + "epoch": 1.6375433294818238, + "grad_norm": 1.9410159587860107, + "learning_rate": 9.310980071165118e-06, + "loss": 1.6916, + "step": 46060 + }, + { + "epoch": 1.6378988534352503, + "grad_norm": 1.8802642822265625, + "learning_rate": 9.307070334568592e-06, + "loss": 1.6286, + "step": 46070 + }, + { + "epoch": 1.6382543773886766, + "grad_norm": 1.7182576656341553, + "learning_rate": 9.303160704401721e-06, + "loss": 1.6485, + "step": 46080 + }, + { + "epoch": 1.638609901342103, + "grad_norm": 1.786268949508667, + "learning_rate": 9.299251181264997e-06, + "loss": 1.699, + "step": 46090 + }, + { + "epoch": 1.6389654252955292, + "grad_norm": 1.8786602020263672, + "learning_rate": 9.295341765758897e-06, + "loss": 1.6475, + "step": 46100 + }, + { + "epoch": 1.6393209492489556, + "grad_norm": 1.8300297260284424, + "learning_rate": 9.291432458483884e-06, + "loss": 1.6512, + "step": 46110 + }, + { + "epoch": 1.6396764732023819, + "grad_norm": 1.9037065505981445, + "learning_rate": 9.287523260040402e-06, + "loss": 1.6868, + "step": 46120 + }, + { + "epoch": 1.6400319971558084, + "grad_norm": 2.003335952758789, + "learning_rate": 9.28361417102888e-06, + "loss": 1.6196, + "step": 46130 + }, + { + "epoch": 1.6403875211092347, + "grad_norm": 1.788839340209961, + "learning_rate": 9.279705192049729e-06, + "loss": 1.6135, + "step": 46140 + }, + { + "epoch": 1.6407430450626612, + "grad_norm": 2.0094990730285645, + "learning_rate": 9.275796323703344e-06, + "loss": 1.673, + "step": 46150 + }, + { + "epoch": 1.6410985690160875, + "grad_norm": 1.7968368530273438, + "learning_rate": 9.271887566590106e-06, + "loss": 1.6354, + "step": 46160 + }, + { + "epoch": 1.6414540929695138, + "grad_norm": 1.926692247390747, + "learning_rate": 9.267978921310369e-06, + "loss": 1.65, + "step": 46170 + }, + { + "epoch": 1.6418096169229401, + "grad_norm": 1.8809410333633423, + "learning_rate": 9.264070388464481e-06, + "loss": 1.6341, + "step": 46180 + }, + { + "epoch": 1.6421651408763664, + "grad_norm": 1.714543104171753, + "learning_rate": 9.260161968652767e-06, + "loss": 1.6531, + "step": 46190 + }, + { + "epoch": 1.6425206648297928, + "grad_norm": 1.8635157346725464, + "learning_rate": 9.256253662475535e-06, + "loss": 1.6606, + "step": 46200 + }, + { + "epoch": 1.6428761887832193, + "grad_norm": 1.8542648553848267, + "learning_rate": 9.25234547053308e-06, + "loss": 1.6439, + "step": 46210 + }, + { + "epoch": 1.6432317127366456, + "grad_norm": 1.8535089492797852, + "learning_rate": 9.248437393425673e-06, + "loss": 1.6184, + "step": 46220 + }, + { + "epoch": 1.6435872366900721, + "grad_norm": 2.300245761871338, + "learning_rate": 9.24452943175357e-06, + "loss": 1.6301, + "step": 46230 + }, + { + "epoch": 1.6439427606434984, + "grad_norm": 1.8947709798812866, + "learning_rate": 9.240621586117006e-06, + "loss": 1.6802, + "step": 46240 + }, + { + "epoch": 1.6442982845969247, + "grad_norm": 1.6864510774612427, + "learning_rate": 9.236713857116207e-06, + "loss": 1.6322, + "step": 46250 + }, + { + "epoch": 1.644653808550351, + "grad_norm": 1.9338759183883667, + "learning_rate": 9.232806245351373e-06, + "loss": 1.6979, + "step": 46260 + }, + { + "epoch": 1.6450093325037773, + "grad_norm": 1.9570294618606567, + "learning_rate": 9.228898751422687e-06, + "loss": 1.6405, + "step": 46270 + }, + { + "epoch": 1.6453648564572037, + "grad_norm": 1.834808349609375, + "learning_rate": 9.224991375930321e-06, + "loss": 1.6452, + "step": 46280 + }, + { + "epoch": 1.6457203804106302, + "grad_norm": 1.7941116094589233, + "learning_rate": 9.22108411947442e-06, + "loss": 1.6734, + "step": 46290 + }, + { + "epoch": 1.6460759043640565, + "grad_norm": 1.899541974067688, + "learning_rate": 9.217176982655116e-06, + "loss": 1.6549, + "step": 46300 + }, + { + "epoch": 1.646431428317483, + "grad_norm": 1.819176435470581, + "learning_rate": 9.213269966072515e-06, + "loss": 1.681, + "step": 46310 + }, + { + "epoch": 1.6467869522709093, + "grad_norm": 1.7770761251449585, + "learning_rate": 9.209363070326717e-06, + "loss": 1.6408, + "step": 46320 + }, + { + "epoch": 1.6471424762243356, + "grad_norm": 2.044375419616699, + "learning_rate": 9.205456296017795e-06, + "loss": 1.7003, + "step": 46330 + }, + { + "epoch": 1.647498000177762, + "grad_norm": 1.8414000272750854, + "learning_rate": 9.201549643745803e-06, + "loss": 1.6818, + "step": 46340 + }, + { + "epoch": 1.6478535241311882, + "grad_norm": 1.8396013975143433, + "learning_rate": 9.197643114110779e-06, + "loss": 1.6984, + "step": 46350 + }, + { + "epoch": 1.6482090480846145, + "grad_norm": 1.8234518766403198, + "learning_rate": 9.193736707712741e-06, + "loss": 1.6635, + "step": 46360 + }, + { + "epoch": 1.648564572038041, + "grad_norm": 1.8317780494689941, + "learning_rate": 9.189830425151691e-06, + "loss": 1.656, + "step": 46370 + }, + { + "epoch": 1.6489200959914674, + "grad_norm": 1.935102939605713, + "learning_rate": 9.185924267027611e-06, + "loss": 1.6474, + "step": 46380 + }, + { + "epoch": 1.649275619944894, + "grad_norm": 1.9568835496902466, + "learning_rate": 9.182018233940455e-06, + "loss": 1.6568, + "step": 46390 + }, + { + "epoch": 1.6496311438983202, + "grad_norm": 1.7801934480667114, + "learning_rate": 9.17811232649017e-06, + "loss": 1.6024, + "step": 46400 + }, + { + "epoch": 1.6499866678517465, + "grad_norm": 1.8486586809158325, + "learning_rate": 9.174206545276678e-06, + "loss": 1.6467, + "step": 46410 + }, + { + "epoch": 1.6503421918051728, + "grad_norm": 1.9290732145309448, + "learning_rate": 9.170300890899885e-06, + "loss": 1.6414, + "step": 46420 + }, + { + "epoch": 1.6506977157585991, + "grad_norm": 1.742576241493225, + "learning_rate": 9.166395363959672e-06, + "loss": 1.6737, + "step": 46430 + }, + { + "epoch": 1.6510532397120254, + "grad_norm": 1.829534888267517, + "learning_rate": 9.162489965055901e-06, + "loss": 1.6578, + "step": 46440 + }, + { + "epoch": 1.651408763665452, + "grad_norm": 1.8022770881652832, + "learning_rate": 9.158584694788425e-06, + "loss": 1.6884, + "step": 46450 + }, + { + "epoch": 1.6517642876188783, + "grad_norm": 1.9050766229629517, + "learning_rate": 9.154679553757059e-06, + "loss": 1.6478, + "step": 46460 + }, + { + "epoch": 1.6521198115723048, + "grad_norm": 1.8912161588668823, + "learning_rate": 9.15077454256162e-06, + "loss": 1.696, + "step": 46470 + }, + { + "epoch": 1.6524753355257311, + "grad_norm": 1.9854507446289062, + "learning_rate": 9.146869661801882e-06, + "loss": 1.6734, + "step": 46480 + }, + { + "epoch": 1.6528308594791574, + "grad_norm": 2.0170986652374268, + "learning_rate": 9.142964912077615e-06, + "loss": 1.6255, + "step": 46490 + }, + { + "epoch": 1.6531863834325837, + "grad_norm": 1.7834538221359253, + "learning_rate": 9.139060293988564e-06, + "loss": 1.6196, + "step": 46500 + }, + { + "epoch": 1.65354190738601, + "grad_norm": 1.8555934429168701, + "learning_rate": 9.135155808134454e-06, + "loss": 1.6007, + "step": 46510 + }, + { + "epoch": 1.6538974313394363, + "grad_norm": 1.8404572010040283, + "learning_rate": 9.131251455114991e-06, + "loss": 1.6705, + "step": 46520 + }, + { + "epoch": 1.6542529552928629, + "grad_norm": 1.8445826768875122, + "learning_rate": 9.127347235529856e-06, + "loss": 1.634, + "step": 46530 + }, + { + "epoch": 1.6546084792462892, + "grad_norm": 1.8148281574249268, + "learning_rate": 9.123443149978714e-06, + "loss": 1.6293, + "step": 46540 + }, + { + "epoch": 1.6549640031997157, + "grad_norm": 2.113463878631592, + "learning_rate": 9.119539199061211e-06, + "loss": 1.6654, + "step": 46550 + }, + { + "epoch": 1.655319527153142, + "grad_norm": 1.8898741006851196, + "learning_rate": 9.115635383376963e-06, + "loss": 1.6806, + "step": 46560 + }, + { + "epoch": 1.6556750511065683, + "grad_norm": 1.9250837564468384, + "learning_rate": 9.111731703525575e-06, + "loss": 1.6601, + "step": 46570 + }, + { + "epoch": 1.6560305750599946, + "grad_norm": 1.8493584394454956, + "learning_rate": 9.10782816010663e-06, + "loss": 1.6724, + "step": 46580 + }, + { + "epoch": 1.656386099013421, + "grad_norm": 1.9017980098724365, + "learning_rate": 9.103924753719681e-06, + "loss": 1.6255, + "step": 46590 + }, + { + "epoch": 1.6567416229668472, + "grad_norm": 1.790822982788086, + "learning_rate": 9.100021484964277e-06, + "loss": 1.624, + "step": 46600 + }, + { + "epoch": 1.6570971469202738, + "grad_norm": 1.837369441986084, + "learning_rate": 9.096118354439927e-06, + "loss": 1.6454, + "step": 46610 + }, + { + "epoch": 1.6574526708737, + "grad_norm": 1.9058386087417603, + "learning_rate": 9.09221536274613e-06, + "loss": 1.6457, + "step": 46620 + }, + { + "epoch": 1.6578081948271266, + "grad_norm": 1.996495008468628, + "learning_rate": 9.088312510482363e-06, + "loss": 1.691, + "step": 46630 + }, + { + "epoch": 1.658163718780553, + "grad_norm": 1.959643006324768, + "learning_rate": 9.084409798248076e-06, + "loss": 1.6305, + "step": 46640 + }, + { + "epoch": 1.6585192427339792, + "grad_norm": 1.8285119533538818, + "learning_rate": 9.080507226642708e-06, + "loss": 1.6712, + "step": 46650 + }, + { + "epoch": 1.6588747666874055, + "grad_norm": 2.047224521636963, + "learning_rate": 9.07660479626566e-06, + "loss": 1.5961, + "step": 46660 + }, + { + "epoch": 1.6592302906408318, + "grad_norm": 1.8310611248016357, + "learning_rate": 9.072702507716327e-06, + "loss": 1.6286, + "step": 46670 + }, + { + "epoch": 1.6595858145942581, + "grad_norm": 1.9075833559036255, + "learning_rate": 9.068800361594073e-06, + "loss": 1.6366, + "step": 46680 + }, + { + "epoch": 1.6599413385476847, + "grad_norm": 1.903961420059204, + "learning_rate": 9.064898358498246e-06, + "loss": 1.696, + "step": 46690 + }, + { + "epoch": 1.660296862501111, + "grad_norm": 2.237349033355713, + "learning_rate": 9.060996499028173e-06, + "loss": 1.6458, + "step": 46700 + }, + { + "epoch": 1.6606523864545375, + "grad_norm": 1.8397843837738037, + "learning_rate": 9.057094783783144e-06, + "loss": 1.6819, + "step": 46710 + }, + { + "epoch": 1.6610079104079638, + "grad_norm": 1.9820095300674438, + "learning_rate": 9.053193213362445e-06, + "loss": 1.658, + "step": 46720 + }, + { + "epoch": 1.6613634343613901, + "grad_norm": 1.8050580024719238, + "learning_rate": 9.049291788365335e-06, + "loss": 1.6458, + "step": 46730 + }, + { + "epoch": 1.6617189583148164, + "grad_norm": 1.89781653881073, + "learning_rate": 9.045390509391043e-06, + "loss": 1.7096, + "step": 46740 + }, + { + "epoch": 1.6620744822682427, + "grad_norm": 1.8570796251296997, + "learning_rate": 9.041489377038783e-06, + "loss": 1.6281, + "step": 46750 + }, + { + "epoch": 1.662430006221669, + "grad_norm": 1.7772490978240967, + "learning_rate": 9.037588391907744e-06, + "loss": 1.6876, + "step": 46760 + }, + { + "epoch": 1.6627855301750956, + "grad_norm": 2.075768232345581, + "learning_rate": 9.033687554597093e-06, + "loss": 1.6348, + "step": 46770 + }, + { + "epoch": 1.6631410541285219, + "grad_norm": 1.9685065746307373, + "learning_rate": 9.029786865705978e-06, + "loss": 1.6813, + "step": 46780 + }, + { + "epoch": 1.6634965780819484, + "grad_norm": 1.8220576047897339, + "learning_rate": 9.025886325833517e-06, + "loss": 1.6675, + "step": 46790 + }, + { + "epoch": 1.6638521020353747, + "grad_norm": 1.6773161888122559, + "learning_rate": 9.021985935578805e-06, + "loss": 1.6648, + "step": 46800 + }, + { + "epoch": 1.664207625988801, + "grad_norm": 2.089994192123413, + "learning_rate": 9.018085695540923e-06, + "loss": 1.5902, + "step": 46810 + }, + { + "epoch": 1.6645631499422273, + "grad_norm": 1.9748122692108154, + "learning_rate": 9.014185606318921e-06, + "loss": 1.6401, + "step": 46820 + }, + { + "epoch": 1.6649186738956536, + "grad_norm": 1.8910719156265259, + "learning_rate": 9.01028566851183e-06, + "loss": 1.6716, + "step": 46830 + }, + { + "epoch": 1.66527419784908, + "grad_norm": 1.948042392730713, + "learning_rate": 9.006385882718655e-06, + "loss": 1.6564, + "step": 46840 + }, + { + "epoch": 1.6656297218025065, + "grad_norm": 1.7150905132293701, + "learning_rate": 9.002486249538379e-06, + "loss": 1.6166, + "step": 46850 + }, + { + "epoch": 1.6659852457559328, + "grad_norm": 1.9290721416473389, + "learning_rate": 8.998586769569962e-06, + "loss": 1.6315, + "step": 46860 + }, + { + "epoch": 1.6663407697093593, + "grad_norm": 1.9267395734786987, + "learning_rate": 8.994687443412336e-06, + "loss": 1.6725, + "step": 46870 + }, + { + "epoch": 1.6666962936627856, + "grad_norm": 2.0027971267700195, + "learning_rate": 8.990788271664414e-06, + "loss": 1.6515, + "step": 46880 + }, + { + "epoch": 1.667051817616212, + "grad_norm": 1.913419246673584, + "learning_rate": 8.986889254925086e-06, + "loss": 1.643, + "step": 46890 + }, + { + "epoch": 1.6674073415696382, + "grad_norm": 1.940077304840088, + "learning_rate": 8.982990393793216e-06, + "loss": 1.6508, + "step": 46900 + }, + { + "epoch": 1.6677628655230645, + "grad_norm": 1.7701328992843628, + "learning_rate": 8.979091688867648e-06, + "loss": 1.6323, + "step": 46910 + }, + { + "epoch": 1.6681183894764908, + "grad_norm": 1.9629327058792114, + "learning_rate": 8.975193140747192e-06, + "loss": 1.6514, + "step": 46920 + }, + { + "epoch": 1.6684739134299174, + "grad_norm": 1.7670210599899292, + "learning_rate": 8.971294750030643e-06, + "loss": 1.6442, + "step": 46930 + }, + { + "epoch": 1.6688294373833437, + "grad_norm": 1.852724313735962, + "learning_rate": 8.967396517316772e-06, + "loss": 1.6637, + "step": 46940 + }, + { + "epoch": 1.6691849613367702, + "grad_norm": 1.8711625337600708, + "learning_rate": 8.963498443204322e-06, + "loss": 1.6879, + "step": 46950 + }, + { + "epoch": 1.6695404852901965, + "grad_norm": 1.9303196668624878, + "learning_rate": 8.959600528292015e-06, + "loss": 1.6889, + "step": 46960 + }, + { + "epoch": 1.6698960092436228, + "grad_norm": 2.0949487686157227, + "learning_rate": 8.955702773178539e-06, + "loss": 1.6167, + "step": 46970 + }, + { + "epoch": 1.670251533197049, + "grad_norm": 1.7735263109207153, + "learning_rate": 8.951805178462571e-06, + "loss": 1.6337, + "step": 46980 + }, + { + "epoch": 1.6706070571504754, + "grad_norm": 1.8674527406692505, + "learning_rate": 8.947907744742754e-06, + "loss": 1.6506, + "step": 46990 + }, + { + "epoch": 1.6709625811039017, + "grad_norm": 1.892725944519043, + "learning_rate": 8.944010472617712e-06, + "loss": 1.5994, + "step": 47000 + }, + { + "epoch": 1.6713181050573283, + "grad_norm": 1.900651216506958, + "learning_rate": 8.94011336268604e-06, + "loss": 1.6443, + "step": 47010 + }, + { + "epoch": 1.6716736290107546, + "grad_norm": 1.9501687288284302, + "learning_rate": 8.936216415546313e-06, + "loss": 1.6676, + "step": 47020 + }, + { + "epoch": 1.672029152964181, + "grad_norm": 1.8256280422210693, + "learning_rate": 8.932319631797072e-06, + "loss": 1.7012, + "step": 47030 + }, + { + "epoch": 1.6723846769176074, + "grad_norm": 1.8548064231872559, + "learning_rate": 8.928423012036842e-06, + "loss": 1.6551, + "step": 47040 + }, + { + "epoch": 1.6727402008710337, + "grad_norm": 1.8793046474456787, + "learning_rate": 8.924526556864117e-06, + "loss": 1.666, + "step": 47050 + }, + { + "epoch": 1.67309572482446, + "grad_norm": 1.877183198928833, + "learning_rate": 8.920630266877369e-06, + "loss": 1.6759, + "step": 47060 + }, + { + "epoch": 1.6734512487778863, + "grad_norm": 1.9366111755371094, + "learning_rate": 8.916734142675045e-06, + "loss": 1.6687, + "step": 47070 + }, + { + "epoch": 1.6738067727313126, + "grad_norm": 1.7445632219314575, + "learning_rate": 8.912838184855565e-06, + "loss": 1.6967, + "step": 47080 + }, + { + "epoch": 1.6741622966847391, + "grad_norm": 1.8954089879989624, + "learning_rate": 8.908942394017325e-06, + "loss": 1.6688, + "step": 47090 + }, + { + "epoch": 1.6745178206381655, + "grad_norm": 1.915601134300232, + "learning_rate": 8.90504677075869e-06, + "loss": 1.6158, + "step": 47100 + }, + { + "epoch": 1.674873344591592, + "grad_norm": 1.992204189300537, + "learning_rate": 8.901151315678005e-06, + "loss": 1.6899, + "step": 47110 + }, + { + "epoch": 1.6752288685450183, + "grad_norm": 2.290611505508423, + "learning_rate": 8.897256029373588e-06, + "loss": 1.6479, + "step": 47120 + }, + { + "epoch": 1.6755843924984446, + "grad_norm": 1.8793078660964966, + "learning_rate": 8.89336091244373e-06, + "loss": 1.6834, + "step": 47130 + }, + { + "epoch": 1.675939916451871, + "grad_norm": 2.002035140991211, + "learning_rate": 8.8894659654867e-06, + "loss": 1.6357, + "step": 47140 + }, + { + "epoch": 1.6762954404052972, + "grad_norm": 1.8884364366531372, + "learning_rate": 8.885571189100736e-06, + "loss": 1.6243, + "step": 47150 + }, + { + "epoch": 1.6766509643587235, + "grad_norm": 1.9206151962280273, + "learning_rate": 8.881676583884047e-06, + "loss": 1.6377, + "step": 47160 + }, + { + "epoch": 1.67700648831215, + "grad_norm": 1.73343026638031, + "learning_rate": 8.877782150434822e-06, + "loss": 1.6473, + "step": 47170 + }, + { + "epoch": 1.6773620122655764, + "grad_norm": 1.940598487854004, + "learning_rate": 8.873887889351228e-06, + "loss": 1.6213, + "step": 47180 + }, + { + "epoch": 1.6777175362190029, + "grad_norm": 1.9257320165634155, + "learning_rate": 8.869993801231388e-06, + "loss": 1.6867, + "step": 47190 + }, + { + "epoch": 1.6780730601724292, + "grad_norm": 1.8307676315307617, + "learning_rate": 8.866099886673415e-06, + "loss": 1.6532, + "step": 47200 + }, + { + "epoch": 1.6784285841258555, + "grad_norm": 1.7601826190948486, + "learning_rate": 8.86220614627539e-06, + "loss": 1.6535, + "step": 47210 + }, + { + "epoch": 1.6787841080792818, + "grad_norm": 1.8694700002670288, + "learning_rate": 8.858312580635369e-06, + "loss": 1.6278, + "step": 47220 + }, + { + "epoch": 1.679139632032708, + "grad_norm": 1.9378466606140137, + "learning_rate": 8.854419190351376e-06, + "loss": 1.5898, + "step": 47230 + }, + { + "epoch": 1.6794951559861344, + "grad_norm": 1.9514213800430298, + "learning_rate": 8.850525976021411e-06, + "loss": 1.6166, + "step": 47240 + }, + { + "epoch": 1.679850679939561, + "grad_norm": 1.9289764165878296, + "learning_rate": 8.846632938243449e-06, + "loss": 1.647, + "step": 47250 + }, + { + "epoch": 1.6802062038929872, + "grad_norm": 1.8685369491577148, + "learning_rate": 8.842740077615437e-06, + "loss": 1.608, + "step": 47260 + }, + { + "epoch": 1.6805617278464138, + "grad_norm": 1.9230856895446777, + "learning_rate": 8.838847394735295e-06, + "loss": 1.6099, + "step": 47270 + }, + { + "epoch": 1.68091725179984, + "grad_norm": 1.829556941986084, + "learning_rate": 8.83495489020091e-06, + "loss": 1.679, + "step": 47280 + }, + { + "epoch": 1.6812727757532664, + "grad_norm": 1.9097543954849243, + "learning_rate": 8.831062564610149e-06, + "loss": 1.6295, + "step": 47290 + }, + { + "epoch": 1.6816282997066927, + "grad_norm": 1.8748795986175537, + "learning_rate": 8.827170418560848e-06, + "loss": 1.6632, + "step": 47300 + }, + { + "epoch": 1.681983823660119, + "grad_norm": 1.975656270980835, + "learning_rate": 8.823278452650818e-06, + "loss": 1.6355, + "step": 47310 + }, + { + "epoch": 1.6823393476135453, + "grad_norm": 1.9507516622543335, + "learning_rate": 8.81938666747784e-06, + "loss": 1.6695, + "step": 47320 + }, + { + "epoch": 1.6826948715669718, + "grad_norm": 2.046321153640747, + "learning_rate": 8.81549506363967e-06, + "loss": 1.6775, + "step": 47330 + }, + { + "epoch": 1.6830503955203981, + "grad_norm": 1.8780962228775024, + "learning_rate": 8.81160364173403e-06, + "loss": 1.6728, + "step": 47340 + }, + { + "epoch": 1.6834059194738247, + "grad_norm": 1.9072388410568237, + "learning_rate": 8.80771240235862e-06, + "loss": 1.6772, + "step": 47350 + }, + { + "epoch": 1.683761443427251, + "grad_norm": 1.8728219270706177, + "learning_rate": 8.803821346111107e-06, + "loss": 1.6678, + "step": 47360 + }, + { + "epoch": 1.6841169673806773, + "grad_norm": 1.8046735525131226, + "learning_rate": 8.799930473589135e-06, + "loss": 1.6423, + "step": 47370 + }, + { + "epoch": 1.6844724913341036, + "grad_norm": 1.9991408586502075, + "learning_rate": 8.796039785390318e-06, + "loss": 1.6361, + "step": 47380 + }, + { + "epoch": 1.68482801528753, + "grad_norm": 1.8107160329818726, + "learning_rate": 8.792149282112242e-06, + "loss": 1.6728, + "step": 47390 + }, + { + "epoch": 1.6851835392409562, + "grad_norm": 1.8943120241165161, + "learning_rate": 8.788258964352462e-06, + "loss": 1.6312, + "step": 47400 + }, + { + "epoch": 1.6855390631943827, + "grad_norm": 1.8581575155258179, + "learning_rate": 8.784368832708508e-06, + "loss": 1.6802, + "step": 47410 + }, + { + "epoch": 1.685894587147809, + "grad_norm": 1.838925838470459, + "learning_rate": 8.780478887777878e-06, + "loss": 1.6856, + "step": 47420 + }, + { + "epoch": 1.6862501111012356, + "grad_norm": 1.986754298210144, + "learning_rate": 8.776589130158045e-06, + "loss": 1.6369, + "step": 47430 + }, + { + "epoch": 1.6866056350546619, + "grad_norm": 1.8603802919387817, + "learning_rate": 8.77269956044645e-06, + "loss": 1.6509, + "step": 47440 + }, + { + "epoch": 1.6869611590080882, + "grad_norm": 1.826055884361267, + "learning_rate": 8.768810179240509e-06, + "loss": 1.6599, + "step": 47450 + }, + { + "epoch": 1.6873166829615145, + "grad_norm": 1.924135684967041, + "learning_rate": 8.764920987137607e-06, + "loss": 1.6413, + "step": 47460 + }, + { + "epoch": 1.6876722069149408, + "grad_norm": 1.8157100677490234, + "learning_rate": 8.761031984735093e-06, + "loss": 1.6616, + "step": 47470 + }, + { + "epoch": 1.688027730868367, + "grad_norm": 1.8275957107543945, + "learning_rate": 8.757143172630297e-06, + "loss": 1.653, + "step": 47480 + }, + { + "epoch": 1.6883832548217936, + "grad_norm": 1.926822304725647, + "learning_rate": 8.753254551420521e-06, + "loss": 1.6631, + "step": 47490 + }, + { + "epoch": 1.68873877877522, + "grad_norm": 1.8407974243164062, + "learning_rate": 8.74936612170303e-06, + "loss": 1.6275, + "step": 47500 + }, + { + "epoch": 1.6890943027286465, + "grad_norm": 2.0156941413879395, + "learning_rate": 8.745477884075058e-06, + "loss": 1.6607, + "step": 47510 + }, + { + "epoch": 1.6894498266820728, + "grad_norm": 1.8555457592010498, + "learning_rate": 8.741589839133817e-06, + "loss": 1.6482, + "step": 47520 + }, + { + "epoch": 1.689805350635499, + "grad_norm": 2.3219103813171387, + "learning_rate": 8.737701987476487e-06, + "loss": 1.682, + "step": 47530 + }, + { + "epoch": 1.6901608745889254, + "grad_norm": 1.843186378479004, + "learning_rate": 8.733814329700218e-06, + "loss": 1.6759, + "step": 47540 + }, + { + "epoch": 1.6905163985423517, + "grad_norm": 2.1189942359924316, + "learning_rate": 8.729926866402126e-06, + "loss": 1.6852, + "step": 47550 + }, + { + "epoch": 1.690871922495778, + "grad_norm": 2.0449013710021973, + "learning_rate": 8.726039598179305e-06, + "loss": 1.6711, + "step": 47560 + }, + { + "epoch": 1.6912274464492045, + "grad_norm": 1.9948835372924805, + "learning_rate": 8.722152525628816e-06, + "loss": 1.6492, + "step": 47570 + }, + { + "epoch": 1.6915829704026308, + "grad_norm": 1.9276567697525024, + "learning_rate": 8.718265649347684e-06, + "loss": 1.6344, + "step": 47580 + }, + { + "epoch": 1.6919384943560574, + "grad_norm": 1.9229627847671509, + "learning_rate": 8.714378969932914e-06, + "loss": 1.645, + "step": 47590 + }, + { + "epoch": 1.6922940183094837, + "grad_norm": 1.7437576055526733, + "learning_rate": 8.710492487981472e-06, + "loss": 1.6547, + "step": 47600 + }, + { + "epoch": 1.69264954226291, + "grad_norm": 1.8802158832550049, + "learning_rate": 8.706606204090299e-06, + "loss": 1.6974, + "step": 47610 + }, + { + "epoch": 1.6930050662163363, + "grad_norm": 1.8326317071914673, + "learning_rate": 8.702720118856302e-06, + "loss": 1.6751, + "step": 47620 + }, + { + "epoch": 1.6933605901697626, + "grad_norm": 1.897667646408081, + "learning_rate": 8.698834232876362e-06, + "loss": 1.6785, + "step": 47630 + }, + { + "epoch": 1.693716114123189, + "grad_norm": 2.0973215103149414, + "learning_rate": 8.694948546747328e-06, + "loss": 1.6683, + "step": 47640 + }, + { + "epoch": 1.6940716380766154, + "grad_norm": 1.8899247646331787, + "learning_rate": 8.691063061066011e-06, + "loss": 1.6666, + "step": 47650 + }, + { + "epoch": 1.6944271620300417, + "grad_norm": 1.9390369653701782, + "learning_rate": 8.687177776429205e-06, + "loss": 1.6676, + "step": 47660 + }, + { + "epoch": 1.6947826859834683, + "grad_norm": 1.9885674715042114, + "learning_rate": 8.683292693433658e-06, + "loss": 1.6651, + "step": 47670 + }, + { + "epoch": 1.6951382099368946, + "grad_norm": 1.8569226264953613, + "learning_rate": 8.679407812676098e-06, + "loss": 1.6536, + "step": 47680 + }, + { + "epoch": 1.6954937338903209, + "grad_norm": 2.06219482421875, + "learning_rate": 8.675523134753216e-06, + "loss": 1.6487, + "step": 47690 + }, + { + "epoch": 1.6958492578437472, + "grad_norm": 1.8178149461746216, + "learning_rate": 8.671638660261677e-06, + "loss": 1.6418, + "step": 47700 + }, + { + "epoch": 1.6962047817971735, + "grad_norm": 1.7728937864303589, + "learning_rate": 8.667754389798112e-06, + "loss": 1.6617, + "step": 47710 + }, + { + "epoch": 1.6965603057505998, + "grad_norm": 1.7685102224349976, + "learning_rate": 8.663870323959118e-06, + "loss": 1.6747, + "step": 47720 + }, + { + "epoch": 1.6969158297040263, + "grad_norm": 1.9068578481674194, + "learning_rate": 8.659986463341266e-06, + "loss": 1.6484, + "step": 47730 + }, + { + "epoch": 1.6972713536574526, + "grad_norm": 1.7723239660263062, + "learning_rate": 8.656102808541089e-06, + "loss": 1.6443, + "step": 47740 + }, + { + "epoch": 1.6976268776108792, + "grad_norm": 1.8427810668945312, + "learning_rate": 8.652219360155096e-06, + "loss": 1.6363, + "step": 47750 + }, + { + "epoch": 1.6979824015643055, + "grad_norm": 1.959257960319519, + "learning_rate": 8.648336118779757e-06, + "loss": 1.643, + "step": 47760 + }, + { + "epoch": 1.6983379255177318, + "grad_norm": 2.000145196914673, + "learning_rate": 8.644453085011518e-06, + "loss": 1.6638, + "step": 47770 + }, + { + "epoch": 1.698693449471158, + "grad_norm": 1.7659945487976074, + "learning_rate": 8.640570259446782e-06, + "loss": 1.6807, + "step": 47780 + }, + { + "epoch": 1.6990489734245844, + "grad_norm": 1.9426976442337036, + "learning_rate": 8.636687642681935e-06, + "loss": 1.65, + "step": 47790 + }, + { + "epoch": 1.6994044973780107, + "grad_norm": 1.8343504667282104, + "learning_rate": 8.632805235313315e-06, + "loss": 1.6104, + "step": 47800 + }, + { + "epoch": 1.6997600213314372, + "grad_norm": 1.7841132879257202, + "learning_rate": 8.628923037937238e-06, + "loss": 1.6606, + "step": 47810 + }, + { + "epoch": 1.7001155452848635, + "grad_norm": 1.7915278673171997, + "learning_rate": 8.625041051149992e-06, + "loss": 1.6563, + "step": 47820 + }, + { + "epoch": 1.70047106923829, + "grad_norm": 1.7958619594573975, + "learning_rate": 8.621159275547813e-06, + "loss": 1.6554, + "step": 47830 + }, + { + "epoch": 1.7008265931917164, + "grad_norm": 1.9656076431274414, + "learning_rate": 8.617277711726927e-06, + "loss": 1.6628, + "step": 47840 + }, + { + "epoch": 1.7011821171451427, + "grad_norm": 1.9339438676834106, + "learning_rate": 8.613396360283513e-06, + "loss": 1.6671, + "step": 47850 + }, + { + "epoch": 1.701537641098569, + "grad_norm": 1.8888641595840454, + "learning_rate": 8.609515221813723e-06, + "loss": 1.6346, + "step": 47860 + }, + { + "epoch": 1.7018931650519953, + "grad_norm": 1.8836134672164917, + "learning_rate": 8.605634296913677e-06, + "loss": 1.6757, + "step": 47870 + }, + { + "epoch": 1.7022486890054216, + "grad_norm": 1.9890522956848145, + "learning_rate": 8.60175358617946e-06, + "loss": 1.6638, + "step": 47880 + }, + { + "epoch": 1.7026042129588481, + "grad_norm": 1.8154431581497192, + "learning_rate": 8.597873090207126e-06, + "loss": 1.6747, + "step": 47890 + }, + { + "epoch": 1.7029597369122744, + "grad_norm": 1.8716108798980713, + "learning_rate": 8.593992809592696e-06, + "loss": 1.6357, + "step": 47900 + }, + { + "epoch": 1.703315260865701, + "grad_norm": 1.9345163106918335, + "learning_rate": 8.590112744932152e-06, + "loss": 1.6609, + "step": 47910 + }, + { + "epoch": 1.7036707848191273, + "grad_norm": 1.8247530460357666, + "learning_rate": 8.58623289682145e-06, + "loss": 1.6321, + "step": 47920 + }, + { + "epoch": 1.7040263087725536, + "grad_norm": 1.8243434429168701, + "learning_rate": 8.58235326585651e-06, + "loss": 1.6495, + "step": 47930 + }, + { + "epoch": 1.7043818327259799, + "grad_norm": 2.006085157394409, + "learning_rate": 8.578473852633223e-06, + "loss": 1.6593, + "step": 47940 + }, + { + "epoch": 1.7047373566794062, + "grad_norm": 1.8560900688171387, + "learning_rate": 8.574594657747438e-06, + "loss": 1.6531, + "step": 47950 + }, + { + "epoch": 1.7050928806328325, + "grad_norm": 2.0169386863708496, + "learning_rate": 8.570715681794975e-06, + "loss": 1.6395, + "step": 47960 + }, + { + "epoch": 1.705448404586259, + "grad_norm": 1.9506243467330933, + "learning_rate": 8.566836925371622e-06, + "loss": 1.6153, + "step": 47970 + }, + { + "epoch": 1.7058039285396853, + "grad_norm": 1.9792253971099854, + "learning_rate": 8.562958389073135e-06, + "loss": 1.659, + "step": 47980 + }, + { + "epoch": 1.7061594524931118, + "grad_norm": 1.7830065488815308, + "learning_rate": 8.559080073495225e-06, + "loss": 1.6293, + "step": 47990 + }, + { + "epoch": 1.7065149764465382, + "grad_norm": 1.8974437713623047, + "learning_rate": 8.555201979233582e-06, + "loss": 1.6888, + "step": 48000 + }, + { + "epoch": 1.7068705003999645, + "grad_norm": 1.8268883228302002, + "learning_rate": 8.551324106883855e-06, + "loss": 1.6514, + "step": 48010 + }, + { + "epoch": 1.7072260243533908, + "grad_norm": 1.7850176095962524, + "learning_rate": 8.547446457041661e-06, + "loss": 1.6396, + "step": 48020 + }, + { + "epoch": 1.707581548306817, + "grad_norm": 1.9473520517349243, + "learning_rate": 8.543569030302588e-06, + "loss": 1.6424, + "step": 48030 + }, + { + "epoch": 1.7079370722602434, + "grad_norm": 1.8418095111846924, + "learning_rate": 8.539691827262176e-06, + "loss": 1.651, + "step": 48040 + }, + { + "epoch": 1.70829259621367, + "grad_norm": 1.8214665651321411, + "learning_rate": 8.535814848515945e-06, + "loss": 1.6395, + "step": 48050 + }, + { + "epoch": 1.7086481201670962, + "grad_norm": 1.9348556995391846, + "learning_rate": 8.531938094659372e-06, + "loss": 1.671, + "step": 48060 + }, + { + "epoch": 1.7090036441205227, + "grad_norm": 1.7536616325378418, + "learning_rate": 8.528061566287903e-06, + "loss": 1.652, + "step": 48070 + }, + { + "epoch": 1.709359168073949, + "grad_norm": 1.8494676351547241, + "learning_rate": 8.524185263996949e-06, + "loss": 1.6481, + "step": 48080 + }, + { + "epoch": 1.7097146920273754, + "grad_norm": 1.9387456178665161, + "learning_rate": 8.520309188381883e-06, + "loss": 1.6661, + "step": 48090 + }, + { + "epoch": 1.7100702159808017, + "grad_norm": 2.008708953857422, + "learning_rate": 8.516433340038048e-06, + "loss": 1.6913, + "step": 48100 + }, + { + "epoch": 1.710425739934228, + "grad_norm": 1.864981770515442, + "learning_rate": 8.512557719560753e-06, + "loss": 1.7076, + "step": 48110 + }, + { + "epoch": 1.7107812638876543, + "grad_norm": 1.906530737876892, + "learning_rate": 8.508682327545262e-06, + "loss": 1.6635, + "step": 48120 + }, + { + "epoch": 1.7111367878410808, + "grad_norm": 1.8444490432739258, + "learning_rate": 8.504807164586819e-06, + "loss": 1.6394, + "step": 48130 + }, + { + "epoch": 1.7114923117945071, + "grad_norm": 1.902989149093628, + "learning_rate": 8.500932231280621e-06, + "loss": 1.6327, + "step": 48140 + }, + { + "epoch": 1.7118478357479336, + "grad_norm": 1.8431181907653809, + "learning_rate": 8.49705752822183e-06, + "loss": 1.6779, + "step": 48150 + }, + { + "epoch": 1.71220335970136, + "grad_norm": 1.8293133974075317, + "learning_rate": 8.49318305600558e-06, + "loss": 1.6268, + "step": 48160 + }, + { + "epoch": 1.7125588836547863, + "grad_norm": 1.8206945657730103, + "learning_rate": 8.489308815226964e-06, + "loss": 1.6714, + "step": 48170 + }, + { + "epoch": 1.7129144076082126, + "grad_norm": 1.8169163465499878, + "learning_rate": 8.485434806481043e-06, + "loss": 1.6115, + "step": 48180 + }, + { + "epoch": 1.7132699315616389, + "grad_norm": 1.916739583015442, + "learning_rate": 8.481561030362838e-06, + "loss": 1.6676, + "step": 48190 + }, + { + "epoch": 1.7136254555150652, + "grad_norm": 1.8636739253997803, + "learning_rate": 8.477687487467339e-06, + "loss": 1.6472, + "step": 48200 + }, + { + "epoch": 1.7139809794684917, + "grad_norm": 1.800962209701538, + "learning_rate": 8.473814178389498e-06, + "loss": 1.6537, + "step": 48210 + }, + { + "epoch": 1.714336503421918, + "grad_norm": 1.931645393371582, + "learning_rate": 8.469941103724228e-06, + "loss": 1.6173, + "step": 48220 + }, + { + "epoch": 1.7146920273753445, + "grad_norm": 1.9857007265090942, + "learning_rate": 8.466068264066412e-06, + "loss": 1.6992, + "step": 48230 + }, + { + "epoch": 1.7150475513287708, + "grad_norm": 2.0565314292907715, + "learning_rate": 8.462195660010891e-06, + "loss": 1.6542, + "step": 48240 + }, + { + "epoch": 1.7154030752821972, + "grad_norm": 1.9118432998657227, + "learning_rate": 8.458323292152475e-06, + "loss": 1.6868, + "step": 48250 + }, + { + "epoch": 1.7157585992356235, + "grad_norm": 1.9311240911483765, + "learning_rate": 8.454451161085939e-06, + "loss": 1.6531, + "step": 48260 + }, + { + "epoch": 1.7161141231890498, + "grad_norm": 1.836564064025879, + "learning_rate": 8.450579267406009e-06, + "loss": 1.6497, + "step": 48270 + }, + { + "epoch": 1.716469647142476, + "grad_norm": 1.8827811479568481, + "learning_rate": 8.44670761170739e-06, + "loss": 1.6564, + "step": 48280 + }, + { + "epoch": 1.7168251710959026, + "grad_norm": 1.8383026123046875, + "learning_rate": 8.442836194584742e-06, + "loss": 1.6792, + "step": 48290 + }, + { + "epoch": 1.717180695049329, + "grad_norm": 1.927247166633606, + "learning_rate": 8.438965016632696e-06, + "loss": 1.6611, + "step": 48300 + }, + { + "epoch": 1.7175362190027554, + "grad_norm": 1.8960721492767334, + "learning_rate": 8.435094078445832e-06, + "loss": 1.6543, + "step": 48310 + }, + { + "epoch": 1.7178917429561817, + "grad_norm": 2.0390431880950928, + "learning_rate": 8.431223380618705e-06, + "loss": 1.6784, + "step": 48320 + }, + { + "epoch": 1.718247266909608, + "grad_norm": 1.7168264389038086, + "learning_rate": 8.42735292374583e-06, + "loss": 1.6135, + "step": 48330 + }, + { + "epoch": 1.7186027908630344, + "grad_norm": 2.055449962615967, + "learning_rate": 8.423482708421688e-06, + "loss": 1.6575, + "step": 48340 + }, + { + "epoch": 1.7189583148164607, + "grad_norm": 1.9829033613204956, + "learning_rate": 8.419612735240715e-06, + "loss": 1.6444, + "step": 48350 + }, + { + "epoch": 1.719313838769887, + "grad_norm": 1.8641419410705566, + "learning_rate": 8.415743004797316e-06, + "loss": 1.6166, + "step": 48360 + }, + { + "epoch": 1.7196693627233135, + "grad_norm": 1.9465245008468628, + "learning_rate": 8.411873517685857e-06, + "loss": 1.6515, + "step": 48370 + }, + { + "epoch": 1.7200248866767398, + "grad_norm": 1.771341323852539, + "learning_rate": 8.408004274500672e-06, + "loss": 1.6284, + "step": 48380 + }, + { + "epoch": 1.7203804106301663, + "grad_norm": 1.7727006673812866, + "learning_rate": 8.404135275836048e-06, + "loss": 1.6571, + "step": 48390 + }, + { + "epoch": 1.7207359345835926, + "grad_norm": 1.8482820987701416, + "learning_rate": 8.400266522286236e-06, + "loss": 1.6374, + "step": 48400 + }, + { + "epoch": 1.721091458537019, + "grad_norm": 1.762482762336731, + "learning_rate": 8.396398014445458e-06, + "loss": 1.6637, + "step": 48410 + }, + { + "epoch": 1.7214469824904453, + "grad_norm": 1.965485692024231, + "learning_rate": 8.392529752907889e-06, + "loss": 1.6498, + "step": 48420 + }, + { + "epoch": 1.7218025064438716, + "grad_norm": 1.8650978803634644, + "learning_rate": 8.388661738267672e-06, + "loss": 1.6588, + "step": 48430 + }, + { + "epoch": 1.7221580303972979, + "grad_norm": 1.8982642889022827, + "learning_rate": 8.384793971118912e-06, + "loss": 1.6392, + "step": 48440 + }, + { + "epoch": 1.7225135543507244, + "grad_norm": 1.8832117319107056, + "learning_rate": 8.380926452055667e-06, + "loss": 1.658, + "step": 48450 + }, + { + "epoch": 1.7228690783041507, + "grad_norm": 1.882262110710144, + "learning_rate": 8.37705918167197e-06, + "loss": 1.6739, + "step": 48460 + }, + { + "epoch": 1.7232246022575772, + "grad_norm": 1.9449636936187744, + "learning_rate": 8.373192160561807e-06, + "loss": 1.6682, + "step": 48470 + }, + { + "epoch": 1.7235801262110035, + "grad_norm": 1.9521130323410034, + "learning_rate": 8.369325389319126e-06, + "loss": 1.6181, + "step": 48480 + }, + { + "epoch": 1.7239356501644298, + "grad_norm": 1.724274754524231, + "learning_rate": 8.36545886853784e-06, + "loss": 1.6648, + "step": 48490 + }, + { + "epoch": 1.7242911741178562, + "grad_norm": 1.8489983081817627, + "learning_rate": 8.361592598811822e-06, + "loss": 1.6589, + "step": 48500 + }, + { + "epoch": 1.7246466980712825, + "grad_norm": 2.001936197280884, + "learning_rate": 8.357726580734909e-06, + "loss": 1.6294, + "step": 48510 + }, + { + "epoch": 1.7250022220247088, + "grad_norm": 2.116563320159912, + "learning_rate": 8.353860814900898e-06, + "loss": 1.6479, + "step": 48520 + }, + { + "epoch": 1.7253577459781353, + "grad_norm": 1.769295573234558, + "learning_rate": 8.349995301903543e-06, + "loss": 1.6134, + "step": 48530 + }, + { + "epoch": 1.7257132699315616, + "grad_norm": 2.013415813446045, + "learning_rate": 8.346130042336563e-06, + "loss": 1.7189, + "step": 48540 + }, + { + "epoch": 1.7260687938849881, + "grad_norm": 1.9870202541351318, + "learning_rate": 8.34226503679364e-06, + "loss": 1.6479, + "step": 48550 + }, + { + "epoch": 1.7264243178384144, + "grad_norm": 1.8795701265335083, + "learning_rate": 8.338400285868412e-06, + "loss": 1.6145, + "step": 48560 + }, + { + "epoch": 1.7267798417918407, + "grad_norm": 1.7985905408859253, + "learning_rate": 8.334535790154485e-06, + "loss": 1.6709, + "step": 48570 + }, + { + "epoch": 1.727135365745267, + "grad_norm": 1.8081409931182861, + "learning_rate": 8.330671550245415e-06, + "loss": 1.6225, + "step": 48580 + }, + { + "epoch": 1.7274908896986934, + "grad_norm": 1.790563702583313, + "learning_rate": 8.32680756673473e-06, + "loss": 1.5796, + "step": 48590 + }, + { + "epoch": 1.7278464136521197, + "grad_norm": 1.9314616918563843, + "learning_rate": 8.32294384021591e-06, + "loss": 1.6238, + "step": 48600 + }, + { + "epoch": 1.7282019376055462, + "grad_norm": 1.8861994743347168, + "learning_rate": 8.319080371282401e-06, + "loss": 1.644, + "step": 48610 + }, + { + "epoch": 1.7285574615589725, + "grad_norm": 1.9431285858154297, + "learning_rate": 8.315217160527608e-06, + "loss": 1.6395, + "step": 48620 + }, + { + "epoch": 1.728912985512399, + "grad_norm": 1.8947607278823853, + "learning_rate": 8.3113542085449e-06, + "loss": 1.647, + "step": 48630 + }, + { + "epoch": 1.7292685094658253, + "grad_norm": 2.009699583053589, + "learning_rate": 8.307491515927592e-06, + "loss": 1.6631, + "step": 48640 + }, + { + "epoch": 1.7296240334192516, + "grad_norm": 2.165419340133667, + "learning_rate": 8.303629083268979e-06, + "loss": 1.6434, + "step": 48650 + }, + { + "epoch": 1.729979557372678, + "grad_norm": 1.774941325187683, + "learning_rate": 8.2997669111623e-06, + "loss": 1.6216, + "step": 48660 + }, + { + "epoch": 1.7303350813261043, + "grad_norm": 2.1260578632354736, + "learning_rate": 8.295905000200762e-06, + "loss": 1.6332, + "step": 48670 + }, + { + "epoch": 1.7306906052795306, + "grad_norm": 1.8239442110061646, + "learning_rate": 8.29204335097753e-06, + "loss": 1.6628, + "step": 48680 + }, + { + "epoch": 1.731046129232957, + "grad_norm": 1.8172132968902588, + "learning_rate": 8.288181964085732e-06, + "loss": 1.6164, + "step": 48690 + }, + { + "epoch": 1.7314016531863834, + "grad_norm": 1.971981406211853, + "learning_rate": 8.284320840118454e-06, + "loss": 1.6506, + "step": 48700 + }, + { + "epoch": 1.73175717713981, + "grad_norm": 1.8251330852508545, + "learning_rate": 8.280459979668733e-06, + "loss": 1.6201, + "step": 48710 + }, + { + "epoch": 1.7321127010932362, + "grad_norm": 1.8595893383026123, + "learning_rate": 8.27659938332958e-06, + "loss": 1.6461, + "step": 48720 + }, + { + "epoch": 1.7324682250466625, + "grad_norm": 1.7532918453216553, + "learning_rate": 8.272739051693956e-06, + "loss": 1.606, + "step": 48730 + }, + { + "epoch": 1.7328237490000888, + "grad_norm": 1.788098931312561, + "learning_rate": 8.268878985354782e-06, + "loss": 1.6624, + "step": 48740 + }, + { + "epoch": 1.7331792729535151, + "grad_norm": 1.756616473197937, + "learning_rate": 8.265019184904944e-06, + "loss": 1.6264, + "step": 48750 + }, + { + "epoch": 1.7335347969069415, + "grad_norm": 1.7843860387802124, + "learning_rate": 8.261159650937279e-06, + "loss": 1.6808, + "step": 48760 + }, + { + "epoch": 1.733890320860368, + "grad_norm": 2.107708692550659, + "learning_rate": 8.25730038404459e-06, + "loss": 1.6513, + "step": 48770 + }, + { + "epoch": 1.7342458448137943, + "grad_norm": 1.9271059036254883, + "learning_rate": 8.253441384819633e-06, + "loss": 1.6843, + "step": 48780 + }, + { + "epoch": 1.7346013687672208, + "grad_norm": 1.7932971715927124, + "learning_rate": 8.249582653855134e-06, + "loss": 1.6347, + "step": 48790 + }, + { + "epoch": 1.7349568927206471, + "grad_norm": 1.8732764720916748, + "learning_rate": 8.24572419174376e-06, + "loss": 1.6266, + "step": 48800 + }, + { + "epoch": 1.7353124166740734, + "grad_norm": 1.7979843616485596, + "learning_rate": 8.241865999078152e-06, + "loss": 1.6351, + "step": 48810 + }, + { + "epoch": 1.7356679406274997, + "grad_norm": 1.7244874238967896, + "learning_rate": 8.2380080764509e-06, + "loss": 1.6142, + "step": 48820 + }, + { + "epoch": 1.736023464580926, + "grad_norm": 1.789110779762268, + "learning_rate": 8.234150424454564e-06, + "loss": 1.6237, + "step": 48830 + }, + { + "epoch": 1.7363789885343524, + "grad_norm": 1.8341730833053589, + "learning_rate": 8.230293043681647e-06, + "loss": 1.6738, + "step": 48840 + }, + { + "epoch": 1.7367345124877789, + "grad_norm": 1.818542718887329, + "learning_rate": 8.226435934724624e-06, + "loss": 1.6691, + "step": 48850 + }, + { + "epoch": 1.7370900364412052, + "grad_norm": 1.8525408506393433, + "learning_rate": 8.22257909817592e-06, + "loss": 1.6211, + "step": 48860 + }, + { + "epoch": 1.7374455603946317, + "grad_norm": 1.92518150806427, + "learning_rate": 8.218722534627923e-06, + "loss": 1.6193, + "step": 48870 + }, + { + "epoch": 1.737801084348058, + "grad_norm": 2.171698808670044, + "learning_rate": 8.214866244672977e-06, + "loss": 1.6144, + "step": 48880 + }, + { + "epoch": 1.7381566083014843, + "grad_norm": 2.083942174911499, + "learning_rate": 8.211010228903382e-06, + "loss": 1.6741, + "step": 48890 + }, + { + "epoch": 1.7385121322549106, + "grad_norm": 1.959802508354187, + "learning_rate": 8.207154487911397e-06, + "loss": 1.6517, + "step": 48900 + }, + { + "epoch": 1.738867656208337, + "grad_norm": 1.8355531692504883, + "learning_rate": 8.203299022289244e-06, + "loss": 1.6382, + "step": 48910 + }, + { + "epoch": 1.7392231801617632, + "grad_norm": 2.048764705657959, + "learning_rate": 8.199443832629093e-06, + "loss": 1.6289, + "step": 48920 + }, + { + "epoch": 1.7395787041151898, + "grad_norm": 1.98831045627594, + "learning_rate": 8.195588919523083e-06, + "loss": 1.6617, + "step": 48930 + }, + { + "epoch": 1.739934228068616, + "grad_norm": 2.0530476570129395, + "learning_rate": 8.1917342835633e-06, + "loss": 1.6808, + "step": 48940 + }, + { + "epoch": 1.7402897520220426, + "grad_norm": 1.8697683811187744, + "learning_rate": 8.187879925341795e-06, + "loss": 1.6542, + "step": 48950 + }, + { + "epoch": 1.740645275975469, + "grad_norm": 1.8175350427627563, + "learning_rate": 8.184025845450571e-06, + "loss": 1.6497, + "step": 48960 + }, + { + "epoch": 1.7410007999288952, + "grad_norm": 2.0738396644592285, + "learning_rate": 8.18017204448159e-06, + "loss": 1.6428, + "step": 48970 + }, + { + "epoch": 1.7413563238823215, + "grad_norm": 1.8094022274017334, + "learning_rate": 8.176318523026769e-06, + "loss": 1.6206, + "step": 48980 + }, + { + "epoch": 1.7417118478357478, + "grad_norm": 1.8019884824752808, + "learning_rate": 8.172465281677992e-06, + "loss": 1.6582, + "step": 48990 + }, + { + "epoch": 1.7420673717891741, + "grad_norm": 1.8854589462280273, + "learning_rate": 8.168612321027087e-06, + "loss": 1.669, + "step": 49000 + }, + { + "epoch": 1.7424228957426007, + "grad_norm": 1.8984565734863281, + "learning_rate": 8.16475964166585e-06, + "loss": 1.665, + "step": 49010 + }, + { + "epoch": 1.742778419696027, + "grad_norm": 2.357611656188965, + "learning_rate": 8.160907244186022e-06, + "loss": 1.6877, + "step": 49020 + }, + { + "epoch": 1.7431339436494535, + "grad_norm": 2.006403923034668, + "learning_rate": 8.15705512917931e-06, + "loss": 1.6531, + "step": 49030 + }, + { + "epoch": 1.7434894676028798, + "grad_norm": 2.006674289703369, + "learning_rate": 8.153203297237375e-06, + "loss": 1.659, + "step": 49040 + }, + { + "epoch": 1.7438449915563061, + "grad_norm": 1.8706060647964478, + "learning_rate": 8.149351748951834e-06, + "loss": 1.6299, + "step": 49050 + }, + { + "epoch": 1.7442005155097324, + "grad_norm": 1.9062683582305908, + "learning_rate": 8.14550048491426e-06, + "loss": 1.6134, + "step": 49060 + }, + { + "epoch": 1.7445560394631587, + "grad_norm": 1.959368348121643, + "learning_rate": 8.141649505716187e-06, + "loss": 1.596, + "step": 49070 + }, + { + "epoch": 1.744911563416585, + "grad_norm": 1.8529287576675415, + "learning_rate": 8.137798811949096e-06, + "loss": 1.6149, + "step": 49080 + }, + { + "epoch": 1.7452670873700116, + "grad_norm": 1.9746596813201904, + "learning_rate": 8.13394840420443e-06, + "loss": 1.6894, + "step": 49090 + }, + { + "epoch": 1.7456226113234379, + "grad_norm": 1.8950711488723755, + "learning_rate": 8.130098283073591e-06, + "loss": 1.6461, + "step": 49100 + }, + { + "epoch": 1.7459781352768644, + "grad_norm": 1.800689697265625, + "learning_rate": 8.126248449147933e-06, + "loss": 1.6526, + "step": 49110 + }, + { + "epoch": 1.7463336592302907, + "grad_norm": 1.7372263669967651, + "learning_rate": 8.122398903018762e-06, + "loss": 1.6448, + "step": 49120 + }, + { + "epoch": 1.746689183183717, + "grad_norm": 2.0515880584716797, + "learning_rate": 8.118549645277347e-06, + "loss": 1.6301, + "step": 49130 + }, + { + "epoch": 1.7470447071371433, + "grad_norm": 1.994960069656372, + "learning_rate": 8.114700676514912e-06, + "loss": 1.6879, + "step": 49140 + }, + { + "epoch": 1.7474002310905696, + "grad_norm": 2.0096182823181152, + "learning_rate": 8.110851997322628e-06, + "loss": 1.6471, + "step": 49150 + }, + { + "epoch": 1.747755755043996, + "grad_norm": 2.0000274181365967, + "learning_rate": 8.107003608291634e-06, + "loss": 1.6305, + "step": 49160 + }, + { + "epoch": 1.7481112789974225, + "grad_norm": 1.9292528629302979, + "learning_rate": 8.103155510013016e-06, + "loss": 1.702, + "step": 49170 + }, + { + "epoch": 1.7484668029508488, + "grad_norm": 1.8559280633926392, + "learning_rate": 8.09930770307782e-06, + "loss": 1.6611, + "step": 49180 + }, + { + "epoch": 1.7488223269042753, + "grad_norm": 2.0235233306884766, + "learning_rate": 8.095460188077043e-06, + "loss": 1.6847, + "step": 49190 + }, + { + "epoch": 1.7491778508577016, + "grad_norm": 1.8203125, + "learning_rate": 8.091612965601639e-06, + "loss": 1.6576, + "step": 49200 + }, + { + "epoch": 1.749533374811128, + "grad_norm": 1.8425822257995605, + "learning_rate": 8.087766036242516e-06, + "loss": 1.6762, + "step": 49210 + }, + { + "epoch": 1.7498888987645542, + "grad_norm": 1.8479509353637695, + "learning_rate": 8.083919400590544e-06, + "loss": 1.6412, + "step": 49220 + }, + { + "epoch": 1.7502444227179805, + "grad_norm": 1.9452064037322998, + "learning_rate": 8.080073059236535e-06, + "loss": 1.6381, + "step": 49230 + }, + { + "epoch": 1.7505999466714068, + "grad_norm": 1.8753700256347656, + "learning_rate": 8.076227012771266e-06, + "loss": 1.6642, + "step": 49240 + }, + { + "epoch": 1.7509554706248334, + "grad_norm": 1.87269127368927, + "learning_rate": 8.072381261785469e-06, + "loss": 1.6624, + "step": 49250 + }, + { + "epoch": 1.7513109945782597, + "grad_norm": 1.8374950885772705, + "learning_rate": 8.068535806869821e-06, + "loss": 1.6988, + "step": 49260 + }, + { + "epoch": 1.7516665185316862, + "grad_norm": 2.0532567501068115, + "learning_rate": 8.064690648614966e-06, + "loss": 1.6251, + "step": 49270 + }, + { + "epoch": 1.7520220424851125, + "grad_norm": 1.9432693719863892, + "learning_rate": 8.060845787611491e-06, + "loss": 1.6493, + "step": 49280 + }, + { + "epoch": 1.7523775664385388, + "grad_norm": 1.6951652765274048, + "learning_rate": 8.057001224449943e-06, + "loss": 1.6139, + "step": 49290 + }, + { + "epoch": 1.7527330903919651, + "grad_norm": 1.902296543121338, + "learning_rate": 8.053156959720826e-06, + "loss": 1.6348, + "step": 49300 + }, + { + "epoch": 1.7530886143453914, + "grad_norm": 2.1470212936401367, + "learning_rate": 8.049312994014589e-06, + "loss": 1.6944, + "step": 49310 + }, + { + "epoch": 1.7534441382988177, + "grad_norm": 1.822934627532959, + "learning_rate": 8.045469327921651e-06, + "loss": 1.6366, + "step": 49320 + }, + { + "epoch": 1.7537996622522443, + "grad_norm": 1.9577301740646362, + "learning_rate": 8.041625962032367e-06, + "loss": 1.6651, + "step": 49330 + }, + { + "epoch": 1.7541551862056706, + "grad_norm": 1.9063875675201416, + "learning_rate": 8.037782896937054e-06, + "loss": 1.6522, + "step": 49340 + }, + { + "epoch": 1.754510710159097, + "grad_norm": 1.7476140260696411, + "learning_rate": 8.033940133225986e-06, + "loss": 1.6984, + "step": 49350 + }, + { + "epoch": 1.7548662341125234, + "grad_norm": 2.013690233230591, + "learning_rate": 8.030097671489387e-06, + "loss": 1.6184, + "step": 49360 + }, + { + "epoch": 1.7552217580659497, + "grad_norm": 1.9201568365097046, + "learning_rate": 8.026255512317434e-06, + "loss": 1.6145, + "step": 49370 + }, + { + "epoch": 1.755577282019376, + "grad_norm": 1.874620795249939, + "learning_rate": 8.02241365630026e-06, + "loss": 1.6559, + "step": 49380 + }, + { + "epoch": 1.7559328059728023, + "grad_norm": 1.8998016119003296, + "learning_rate": 8.018572104027948e-06, + "loss": 1.667, + "step": 49390 + }, + { + "epoch": 1.7562883299262286, + "grad_norm": 1.9586131572723389, + "learning_rate": 8.014730856090535e-06, + "loss": 1.6391, + "step": 49400 + }, + { + "epoch": 1.7566438538796552, + "grad_norm": 1.9174072742462158, + "learning_rate": 8.010889913078017e-06, + "loss": 1.6337, + "step": 49410 + }, + { + "epoch": 1.7569993778330815, + "grad_norm": 1.8285397291183472, + "learning_rate": 8.007049275580335e-06, + "loss": 1.6383, + "step": 49420 + }, + { + "epoch": 1.757354901786508, + "grad_norm": 1.9597249031066895, + "learning_rate": 8.003208944187394e-06, + "loss": 1.6446, + "step": 49430 + }, + { + "epoch": 1.7577104257399343, + "grad_norm": 1.9941136837005615, + "learning_rate": 7.999368919489034e-06, + "loss": 1.6533, + "step": 49440 + }, + { + "epoch": 1.7580659496933606, + "grad_norm": 1.950408697128296, + "learning_rate": 7.995529202075066e-06, + "loss": 1.6704, + "step": 49450 + }, + { + "epoch": 1.758421473646787, + "grad_norm": 1.843684434890747, + "learning_rate": 7.991689792535244e-06, + "loss": 1.6248, + "step": 49460 + }, + { + "epoch": 1.7587769976002132, + "grad_norm": 2.0115206241607666, + "learning_rate": 7.987850691459275e-06, + "loss": 1.6599, + "step": 49470 + }, + { + "epoch": 1.7591325215536395, + "grad_norm": 1.9936758279800415, + "learning_rate": 7.984011899436826e-06, + "loss": 1.684, + "step": 49480 + }, + { + "epoch": 1.759488045507066, + "grad_norm": 1.991235613822937, + "learning_rate": 7.980173417057507e-06, + "loss": 1.6603, + "step": 49490 + }, + { + "epoch": 1.7598435694604924, + "grad_norm": 1.8691331148147583, + "learning_rate": 7.976335244910887e-06, + "loss": 1.7209, + "step": 49500 + }, + { + "epoch": 1.760199093413919, + "grad_norm": 1.801890254020691, + "learning_rate": 7.972497383586489e-06, + "loss": 1.6109, + "step": 49510 + }, + { + "epoch": 1.7605546173673452, + "grad_norm": 1.8995479345321655, + "learning_rate": 7.968659833673776e-06, + "loss": 1.6582, + "step": 49520 + }, + { + "epoch": 1.7609101413207715, + "grad_norm": 2.027247428894043, + "learning_rate": 7.964822595762176e-06, + "loss": 1.7072, + "step": 49530 + }, + { + "epoch": 1.7612656652741978, + "grad_norm": 1.877119541168213, + "learning_rate": 7.960985670441067e-06, + "loss": 1.6737, + "step": 49540 + }, + { + "epoch": 1.7616211892276241, + "grad_norm": 2.048544406890869, + "learning_rate": 7.957149058299771e-06, + "loss": 1.6738, + "step": 49550 + }, + { + "epoch": 1.7619767131810504, + "grad_norm": 1.965006709098816, + "learning_rate": 7.953312759927576e-06, + "loss": 1.6252, + "step": 49560 + }, + { + "epoch": 1.762332237134477, + "grad_norm": 1.9446678161621094, + "learning_rate": 7.949476775913703e-06, + "loss": 1.6972, + "step": 49570 + }, + { + "epoch": 1.7626877610879033, + "grad_norm": 2.228520154953003, + "learning_rate": 7.945641106847343e-06, + "loss": 1.6422, + "step": 49580 + }, + { + "epoch": 1.7630432850413298, + "grad_norm": 1.7643654346466064, + "learning_rate": 7.941805753317631e-06, + "loss": 1.6238, + "step": 49590 + }, + { + "epoch": 1.763398808994756, + "grad_norm": 1.9489067792892456, + "learning_rate": 7.937970715913647e-06, + "loss": 1.6082, + "step": 49600 + }, + { + "epoch": 1.7637543329481824, + "grad_norm": 2.0185320377349854, + "learning_rate": 7.934135995224431e-06, + "loss": 1.6458, + "step": 49610 + }, + { + "epoch": 1.7641098569016087, + "grad_norm": 1.8964686393737793, + "learning_rate": 7.930301591838973e-06, + "loss": 1.6666, + "step": 49620 + }, + { + "epoch": 1.764465380855035, + "grad_norm": 1.9244173765182495, + "learning_rate": 7.926467506346215e-06, + "loss": 1.6743, + "step": 49630 + }, + { + "epoch": 1.7648209048084613, + "grad_norm": 1.8678314685821533, + "learning_rate": 7.922633739335047e-06, + "loss": 1.6113, + "step": 49640 + }, + { + "epoch": 1.7651764287618879, + "grad_norm": 1.8226903676986694, + "learning_rate": 7.91880029139431e-06, + "loss": 1.6496, + "step": 49650 + }, + { + "epoch": 1.7655319527153142, + "grad_norm": 1.890203833580017, + "learning_rate": 7.914967163112799e-06, + "loss": 1.6457, + "step": 49660 + }, + { + "epoch": 1.7658874766687407, + "grad_norm": 1.8880730867385864, + "learning_rate": 7.91113435507926e-06, + "loss": 1.678, + "step": 49670 + }, + { + "epoch": 1.766243000622167, + "grad_norm": 1.901037573814392, + "learning_rate": 7.907301867882384e-06, + "loss": 1.6444, + "step": 49680 + }, + { + "epoch": 1.7665985245755933, + "grad_norm": 1.8280609846115112, + "learning_rate": 7.903469702110824e-06, + "loss": 1.6791, + "step": 49690 + }, + { + "epoch": 1.7669540485290196, + "grad_norm": 1.799757719039917, + "learning_rate": 7.89963785835317e-06, + "loss": 1.6189, + "step": 49700 + }, + { + "epoch": 1.767309572482446, + "grad_norm": 1.8317128419876099, + "learning_rate": 7.895806337197971e-06, + "loss": 1.6828, + "step": 49710 + }, + { + "epoch": 1.7676650964358722, + "grad_norm": 1.954054832458496, + "learning_rate": 7.891975139233726e-06, + "loss": 1.6297, + "step": 49720 + }, + { + "epoch": 1.7680206203892987, + "grad_norm": 1.7993971109390259, + "learning_rate": 7.88814426504888e-06, + "loss": 1.6793, + "step": 49730 + }, + { + "epoch": 1.768376144342725, + "grad_norm": 1.8766379356384277, + "learning_rate": 7.884313715231838e-06, + "loss": 1.6927, + "step": 49740 + }, + { + "epoch": 1.7687316682961516, + "grad_norm": 2.0255801677703857, + "learning_rate": 7.880483490370943e-06, + "loss": 1.6619, + "step": 49750 + }, + { + "epoch": 1.7690871922495779, + "grad_norm": 1.9055756330490112, + "learning_rate": 7.876653591054495e-06, + "loss": 1.6556, + "step": 49760 + }, + { + "epoch": 1.7694427162030042, + "grad_norm": 1.8836147785186768, + "learning_rate": 7.87282401787074e-06, + "loss": 1.6847, + "step": 49770 + }, + { + "epoch": 1.7697982401564305, + "grad_norm": 2.059903860092163, + "learning_rate": 7.868994771407876e-06, + "loss": 1.6061, + "step": 49780 + }, + { + "epoch": 1.7701537641098568, + "grad_norm": 1.9796390533447266, + "learning_rate": 7.865165852254056e-06, + "loss": 1.6801, + "step": 49790 + }, + { + "epoch": 1.7705092880632831, + "grad_norm": 1.8722137212753296, + "learning_rate": 7.861337260997375e-06, + "loss": 1.6349, + "step": 49800 + }, + { + "epoch": 1.7708648120167096, + "grad_norm": 1.951656460762024, + "learning_rate": 7.857508998225881e-06, + "loss": 1.6252, + "step": 49810 + }, + { + "epoch": 1.771220335970136, + "grad_norm": 1.750725507736206, + "learning_rate": 7.853681064527573e-06, + "loss": 1.6021, + "step": 49820 + }, + { + "epoch": 1.7715758599235625, + "grad_norm": 1.8703793287277222, + "learning_rate": 7.849853460490396e-06, + "loss": 1.6572, + "step": 49830 + }, + { + "epoch": 1.7719313838769888, + "grad_norm": 1.9249235391616821, + "learning_rate": 7.846026186702242e-06, + "loss": 1.644, + "step": 49840 + }, + { + "epoch": 1.772286907830415, + "grad_norm": 1.7748090028762817, + "learning_rate": 7.842199243750962e-06, + "loss": 1.6518, + "step": 49850 + }, + { + "epoch": 1.7726424317838414, + "grad_norm": 1.9108740091323853, + "learning_rate": 7.838372632224351e-06, + "loss": 1.6537, + "step": 49860 + }, + { + "epoch": 1.7729979557372677, + "grad_norm": 1.9827098846435547, + "learning_rate": 7.83454635271015e-06, + "loss": 1.665, + "step": 49870 + }, + { + "epoch": 1.773353479690694, + "grad_norm": 1.978049874305725, + "learning_rate": 7.83072040579605e-06, + "loss": 1.6618, + "step": 49880 + }, + { + "epoch": 1.7737090036441205, + "grad_norm": 1.8595516681671143, + "learning_rate": 7.826894792069694e-06, + "loss": 1.6266, + "step": 49890 + }, + { + "epoch": 1.7740645275975468, + "grad_norm": 2.042062759399414, + "learning_rate": 7.823069512118673e-06, + "loss": 1.6352, + "step": 49900 + }, + { + "epoch": 1.7744200515509734, + "grad_norm": 1.8743336200714111, + "learning_rate": 7.81924456653053e-06, + "loss": 1.63, + "step": 49910 + }, + { + "epoch": 1.7747755755043997, + "grad_norm": 2.0691182613372803, + "learning_rate": 7.815419955892744e-06, + "loss": 1.6679, + "step": 49920 + }, + { + "epoch": 1.775131099457826, + "grad_norm": 1.824062705039978, + "learning_rate": 7.811595680792755e-06, + "loss": 1.6083, + "step": 49930 + }, + { + "epoch": 1.7754866234112523, + "grad_norm": 1.7824209928512573, + "learning_rate": 7.807771741817947e-06, + "loss": 1.6047, + "step": 49940 + }, + { + "epoch": 1.7758421473646786, + "grad_norm": 2.0143516063690186, + "learning_rate": 7.803948139555657e-06, + "loss": 1.6744, + "step": 49950 + }, + { + "epoch": 1.776197671318105, + "grad_norm": 1.9708036184310913, + "learning_rate": 7.80012487459316e-06, + "loss": 1.6705, + "step": 49960 + }, + { + "epoch": 1.7765531952715314, + "grad_norm": 1.932173252105713, + "learning_rate": 7.79630194751769e-06, + "loss": 1.6251, + "step": 49970 + }, + { + "epoch": 1.7769087192249577, + "grad_norm": 1.9063539505004883, + "learning_rate": 7.792479358916425e-06, + "loss": 1.6432, + "step": 49980 + }, + { + "epoch": 1.7772642431783843, + "grad_norm": 2.011800527572632, + "learning_rate": 7.788657109376488e-06, + "loss": 1.6514, + "step": 49990 + }, + { + "epoch": 1.7776197671318106, + "grad_norm": 2.0261800289154053, + "learning_rate": 7.784835199484954e-06, + "loss": 1.6612, + "step": 50000 + }, + { + "epoch": 1.7779752910852369, + "grad_norm": 1.8654749393463135, + "learning_rate": 7.781013629828845e-06, + "loss": 1.6984, + "step": 50010 + }, + { + "epoch": 1.7783308150386632, + "grad_norm": 1.7497426271438599, + "learning_rate": 7.777192400995128e-06, + "loss": 1.6403, + "step": 50020 + }, + { + "epoch": 1.7786863389920895, + "grad_norm": 1.7313170433044434, + "learning_rate": 7.773371513570723e-06, + "loss": 1.6368, + "step": 50030 + }, + { + "epoch": 1.7790418629455158, + "grad_norm": 1.8992712497711182, + "learning_rate": 7.76955096814249e-06, + "loss": 1.6526, + "step": 50040 + }, + { + "epoch": 1.7793973868989423, + "grad_norm": 1.8442968130111694, + "learning_rate": 7.765730765297246e-06, + "loss": 1.6573, + "step": 50050 + }, + { + "epoch": 1.7797529108523686, + "grad_norm": 1.8860867023468018, + "learning_rate": 7.761910905621745e-06, + "loss": 1.5908, + "step": 50060 + }, + { + "epoch": 1.7801084348057952, + "grad_norm": 1.8785138130187988, + "learning_rate": 7.7580913897027e-06, + "loss": 1.6388, + "step": 50070 + }, + { + "epoch": 1.7804639587592215, + "grad_norm": 1.8686470985412598, + "learning_rate": 7.754272218126757e-06, + "loss": 1.622, + "step": 50080 + }, + { + "epoch": 1.7808194827126478, + "grad_norm": 1.92991042137146, + "learning_rate": 7.750453391480522e-06, + "loss": 1.6813, + "step": 50090 + }, + { + "epoch": 1.781175006666074, + "grad_norm": 1.7424781322479248, + "learning_rate": 7.746634910350538e-06, + "loss": 1.6498, + "step": 50100 + }, + { + "epoch": 1.7815305306195004, + "grad_norm": 2.404031991958618, + "learning_rate": 7.742816775323306e-06, + "loss": 1.6185, + "step": 50110 + }, + { + "epoch": 1.7818860545729267, + "grad_norm": 1.7803823947906494, + "learning_rate": 7.738998986985263e-06, + "loss": 1.6103, + "step": 50120 + }, + { + "epoch": 1.7822415785263532, + "grad_norm": 1.9528794288635254, + "learning_rate": 7.735181545922804e-06, + "loss": 1.6373, + "step": 50130 + }, + { + "epoch": 1.7825971024797795, + "grad_norm": 1.7719610929489136, + "learning_rate": 7.731364452722253e-06, + "loss": 1.6575, + "step": 50140 + }, + { + "epoch": 1.782952626433206, + "grad_norm": 1.8989477157592773, + "learning_rate": 7.727547707969899e-06, + "loss": 1.6341, + "step": 50150 + }, + { + "epoch": 1.7833081503866324, + "grad_norm": 1.91252601146698, + "learning_rate": 7.723731312251969e-06, + "loss": 1.6367, + "step": 50160 + }, + { + "epoch": 1.7836636743400587, + "grad_norm": 1.777572512626648, + "learning_rate": 7.719915266154637e-06, + "loss": 1.6309, + "step": 50170 + }, + { + "epoch": 1.784019198293485, + "grad_norm": 1.8915362358093262, + "learning_rate": 7.716099570264027e-06, + "loss": 1.66, + "step": 50180 + }, + { + "epoch": 1.7843747222469113, + "grad_norm": 2.0040836334228516, + "learning_rate": 7.7122842251662e-06, + "loss": 1.6722, + "step": 50190 + }, + { + "epoch": 1.7847302462003376, + "grad_norm": 1.9604754447937012, + "learning_rate": 7.708469231447171e-06, + "loss": 1.679, + "step": 50200 + }, + { + "epoch": 1.7850857701537641, + "grad_norm": 2.096215009689331, + "learning_rate": 7.7046545896929e-06, + "loss": 1.6247, + "step": 50210 + }, + { + "epoch": 1.7854412941071904, + "grad_norm": 2.1358866691589355, + "learning_rate": 7.700840300489292e-06, + "loss": 1.6332, + "step": 50220 + }, + { + "epoch": 1.785796818060617, + "grad_norm": 1.9483267068862915, + "learning_rate": 7.697026364422204e-06, + "loss": 1.6584, + "step": 50230 + }, + { + "epoch": 1.7861523420140433, + "grad_norm": 1.8866466283798218, + "learning_rate": 7.693212782077422e-06, + "loss": 1.6412, + "step": 50240 + }, + { + "epoch": 1.7865078659674696, + "grad_norm": 1.8588370084762573, + "learning_rate": 7.689399554040692e-06, + "loss": 1.5853, + "step": 50250 + }, + { + "epoch": 1.7868633899208959, + "grad_norm": 2.025243043899536, + "learning_rate": 7.685586680897706e-06, + "loss": 1.6427, + "step": 50260 + }, + { + "epoch": 1.7872189138743222, + "grad_norm": 1.9626860618591309, + "learning_rate": 7.681774163234091e-06, + "loss": 1.6968, + "step": 50270 + }, + { + "epoch": 1.7875744378277485, + "grad_norm": 1.9678057432174683, + "learning_rate": 7.67796200163543e-06, + "loss": 1.6988, + "step": 50280 + }, + { + "epoch": 1.787929961781175, + "grad_norm": 1.9846853017807007, + "learning_rate": 7.674150196687247e-06, + "loss": 1.659, + "step": 50290 + }, + { + "epoch": 1.7882854857346013, + "grad_norm": 1.9511381387710571, + "learning_rate": 7.67033874897501e-06, + "loss": 1.6484, + "step": 50300 + }, + { + "epoch": 1.7886410096880279, + "grad_norm": 2.210939884185791, + "learning_rate": 7.666527659084135e-06, + "loss": 1.6239, + "step": 50310 + }, + { + "epoch": 1.7889965336414542, + "grad_norm": 1.7664453983306885, + "learning_rate": 7.66271692759998e-06, + "loss": 1.6703, + "step": 50320 + }, + { + "epoch": 1.7893520575948805, + "grad_norm": 1.9270297288894653, + "learning_rate": 7.658906555107848e-06, + "loss": 1.6343, + "step": 50330 + }, + { + "epoch": 1.7897075815483068, + "grad_norm": 1.9423515796661377, + "learning_rate": 7.655096542192992e-06, + "loss": 1.6383, + "step": 50340 + }, + { + "epoch": 1.790063105501733, + "grad_norm": 1.9486314058303833, + "learning_rate": 7.651286889440605e-06, + "loss": 1.6595, + "step": 50350 + }, + { + "epoch": 1.7904186294551594, + "grad_norm": 1.8882137537002563, + "learning_rate": 7.647477597435826e-06, + "loss": 1.6311, + "step": 50360 + }, + { + "epoch": 1.790774153408586, + "grad_norm": 1.9961556196212769, + "learning_rate": 7.643668666763736e-06, + "loss": 1.656, + "step": 50370 + }, + { + "epoch": 1.7911296773620122, + "grad_norm": 1.8319236040115356, + "learning_rate": 7.639860098009366e-06, + "loss": 1.6111, + "step": 50380 + }, + { + "epoch": 1.7914852013154388, + "grad_norm": 2.002304792404175, + "learning_rate": 7.636051891757688e-06, + "loss": 1.6362, + "step": 50390 + }, + { + "epoch": 1.791840725268865, + "grad_norm": 1.8890831470489502, + "learning_rate": 7.632244048593616e-06, + "loss": 1.6935, + "step": 50400 + }, + { + "epoch": 1.7921962492222914, + "grad_norm": 1.926177978515625, + "learning_rate": 7.62843656910201e-06, + "loss": 1.6639, + "step": 50410 + }, + { + "epoch": 1.7925517731757177, + "grad_norm": 1.9095516204833984, + "learning_rate": 7.624629453867678e-06, + "loss": 1.6437, + "step": 50420 + }, + { + "epoch": 1.792907297129144, + "grad_norm": 1.785805106163025, + "learning_rate": 7.620822703475368e-06, + "loss": 1.66, + "step": 50430 + }, + { + "epoch": 1.7932628210825703, + "grad_norm": 1.992326259613037, + "learning_rate": 7.617016318509776e-06, + "loss": 1.6632, + "step": 50440 + }, + { + "epoch": 1.7936183450359968, + "grad_norm": 1.7599471807479858, + "learning_rate": 7.613210299555534e-06, + "loss": 1.6556, + "step": 50450 + }, + { + "epoch": 1.7939738689894231, + "grad_norm": 2.0583577156066895, + "learning_rate": 7.609404647197224e-06, + "loss": 1.6079, + "step": 50460 + }, + { + "epoch": 1.7943293929428497, + "grad_norm": 1.8822389841079712, + "learning_rate": 7.605599362019371e-06, + "loss": 1.6571, + "step": 50470 + }, + { + "epoch": 1.794684916896276, + "grad_norm": 2.0984292030334473, + "learning_rate": 7.601794444606443e-06, + "loss": 1.6544, + "step": 50480 + }, + { + "epoch": 1.7950404408497023, + "grad_norm": 1.8730183839797974, + "learning_rate": 7.597989895542854e-06, + "loss": 1.6628, + "step": 50490 + }, + { + "epoch": 1.7953959648031286, + "grad_norm": 1.9191683530807495, + "learning_rate": 7.594185715412954e-06, + "loss": 1.6725, + "step": 50500 + }, + { + "epoch": 1.7957514887565549, + "grad_norm": 1.9092954397201538, + "learning_rate": 7.590381904801043e-06, + "loss": 1.6753, + "step": 50510 + }, + { + "epoch": 1.7961070127099812, + "grad_norm": 1.960010290145874, + "learning_rate": 7.586578464291364e-06, + "loss": 1.5978, + "step": 50520 + }, + { + "epoch": 1.7964625366634077, + "grad_norm": 1.9142476320266724, + "learning_rate": 7.5827753944681e-06, + "loss": 1.657, + "step": 50530 + }, + { + "epoch": 1.796818060616834, + "grad_norm": 1.9966776371002197, + "learning_rate": 7.5789726959153795e-06, + "loss": 1.6301, + "step": 50540 + }, + { + "epoch": 1.7971735845702606, + "grad_norm": 1.8700714111328125, + "learning_rate": 7.575170369217277e-06, + "loss": 1.6687, + "step": 50550 + }, + { + "epoch": 1.7975291085236869, + "grad_norm": 1.9684398174285889, + "learning_rate": 7.571368414957798e-06, + "loss": 1.6068, + "step": 50560 + }, + { + "epoch": 1.7978846324771132, + "grad_norm": 1.9292086362838745, + "learning_rate": 7.567566833720905e-06, + "loss": 1.644, + "step": 50570 + }, + { + "epoch": 1.7982401564305395, + "grad_norm": 1.9431310892105103, + "learning_rate": 7.563765626090493e-06, + "loss": 1.684, + "step": 50580 + }, + { + "epoch": 1.7985956803839658, + "grad_norm": 2.0605413913726807, + "learning_rate": 7.559964792650405e-06, + "loss": 1.6587, + "step": 50590 + }, + { + "epoch": 1.798951204337392, + "grad_norm": 1.8475911617279053, + "learning_rate": 7.556164333984425e-06, + "loss": 1.6829, + "step": 50600 + }, + { + "epoch": 1.7993067282908186, + "grad_norm": 1.995463490486145, + "learning_rate": 7.552364250676282e-06, + "loss": 1.6511, + "step": 50610 + }, + { + "epoch": 1.799662252244245, + "grad_norm": 1.9126129150390625, + "learning_rate": 7.548564543309645e-06, + "loss": 1.6134, + "step": 50620 + }, + { + "epoch": 1.8000177761976714, + "grad_norm": 2.0450148582458496, + "learning_rate": 7.544765212468119e-06, + "loss": 1.623, + "step": 50630 + }, + { + "epoch": 1.8003733001510978, + "grad_norm": 1.9064500331878662, + "learning_rate": 7.540966258735265e-06, + "loss": 1.6364, + "step": 50640 + }, + { + "epoch": 1.800728824104524, + "grad_norm": 1.8855499029159546, + "learning_rate": 7.537167682694574e-06, + "loss": 1.6494, + "step": 50650 + }, + { + "epoch": 1.8010843480579504, + "grad_norm": 1.9371328353881836, + "learning_rate": 7.533369484929484e-06, + "loss": 1.6667, + "step": 50660 + }, + { + "epoch": 1.8014398720113767, + "grad_norm": 1.9383577108383179, + "learning_rate": 7.52957166602338e-06, + "loss": 1.6576, + "step": 50670 + }, + { + "epoch": 1.801795395964803, + "grad_norm": 1.8724509477615356, + "learning_rate": 7.525774226559575e-06, + "loss": 1.6832, + "step": 50680 + }, + { + "epoch": 1.8021509199182295, + "grad_norm": 1.9099206924438477, + "learning_rate": 7.521977167121335e-06, + "loss": 1.6408, + "step": 50690 + }, + { + "epoch": 1.8025064438716558, + "grad_norm": 2.0748062133789062, + "learning_rate": 7.5181804882918645e-06, + "loss": 1.6146, + "step": 50700 + }, + { + "epoch": 1.8028619678250823, + "grad_norm": 1.8202855587005615, + "learning_rate": 7.5143841906543135e-06, + "loss": 1.6116, + "step": 50710 + }, + { + "epoch": 1.8032174917785087, + "grad_norm": 2.272087574005127, + "learning_rate": 7.510588274791763e-06, + "loss": 1.6305, + "step": 50720 + }, + { + "epoch": 1.803573015731935, + "grad_norm": 1.9922109842300415, + "learning_rate": 7.506792741287245e-06, + "loss": 1.6432, + "step": 50730 + }, + { + "epoch": 1.8039285396853613, + "grad_norm": 1.9014875888824463, + "learning_rate": 7.502997590723729e-06, + "loss": 1.6595, + "step": 50740 + }, + { + "epoch": 1.8042840636387876, + "grad_norm": 1.8036856651306152, + "learning_rate": 7.499202823684129e-06, + "loss": 1.634, + "step": 50750 + }, + { + "epoch": 1.8046395875922139, + "grad_norm": 2.0609259605407715, + "learning_rate": 7.4954084407512915e-06, + "loss": 1.6135, + "step": 50760 + }, + { + "epoch": 1.8049951115456404, + "grad_norm": 2.009648084640503, + "learning_rate": 7.491614442508015e-06, + "loss": 1.6143, + "step": 50770 + }, + { + "epoch": 1.8053506354990667, + "grad_norm": 1.8452153205871582, + "learning_rate": 7.487820829537031e-06, + "loss": 1.5965, + "step": 50780 + }, + { + "epoch": 1.8057061594524932, + "grad_norm": 1.9561113119125366, + "learning_rate": 7.4840276024210175e-06, + "loss": 1.5791, + "step": 50790 + }, + { + "epoch": 1.8060616834059195, + "grad_norm": 1.8616101741790771, + "learning_rate": 7.480234761742592e-06, + "loss": 1.6199, + "step": 50800 + }, + { + "epoch": 1.8064172073593459, + "grad_norm": 1.8184536695480347, + "learning_rate": 7.476442308084304e-06, + "loss": 1.6182, + "step": 50810 + }, + { + "epoch": 1.8067727313127722, + "grad_norm": 1.9425073862075806, + "learning_rate": 7.472650242028656e-06, + "loss": 1.6653, + "step": 50820 + }, + { + "epoch": 1.8071282552661985, + "grad_norm": 1.9306671619415283, + "learning_rate": 7.468858564158083e-06, + "loss": 1.6021, + "step": 50830 + }, + { + "epoch": 1.8074837792196248, + "grad_norm": 2.0422468185424805, + "learning_rate": 7.4650672750549655e-06, + "loss": 1.634, + "step": 50840 + }, + { + "epoch": 1.8078393031730513, + "grad_norm": 1.7657324075698853, + "learning_rate": 7.46127637530162e-06, + "loss": 1.6569, + "step": 50850 + }, + { + "epoch": 1.8081948271264776, + "grad_norm": 2.0055558681488037, + "learning_rate": 7.4574858654803075e-06, + "loss": 1.6297, + "step": 50860 + }, + { + "epoch": 1.8085503510799041, + "grad_norm": 1.8196109533309937, + "learning_rate": 7.453695746173224e-06, + "loss": 1.6497, + "step": 50870 + }, + { + "epoch": 1.8089058750333304, + "grad_norm": 1.7514034509658813, + "learning_rate": 7.449906017962508e-06, + "loss": 1.6158, + "step": 50880 + }, + { + "epoch": 1.8092613989867568, + "grad_norm": 2.044034004211426, + "learning_rate": 7.446116681430238e-06, + "loss": 1.6472, + "step": 50890 + }, + { + "epoch": 1.809616922940183, + "grad_norm": 1.816916584968567, + "learning_rate": 7.44232773715843e-06, + "loss": 1.6352, + "step": 50900 + }, + { + "epoch": 1.8099724468936094, + "grad_norm": 2.0290634632110596, + "learning_rate": 7.438539185729048e-06, + "loss": 1.6396, + "step": 50910 + }, + { + "epoch": 1.8103279708470357, + "grad_norm": 1.9274016618728638, + "learning_rate": 7.434751027723984e-06, + "loss": 1.6257, + "step": 50920 + }, + { + "epoch": 1.8106834948004622, + "grad_norm": 1.9005857706069946, + "learning_rate": 7.430963263725081e-06, + "loss": 1.6422, + "step": 50930 + }, + { + "epoch": 1.8110390187538885, + "grad_norm": 1.9277817010879517, + "learning_rate": 7.427175894314112e-06, + "loss": 1.6347, + "step": 50940 + }, + { + "epoch": 1.811394542707315, + "grad_norm": 1.816381573677063, + "learning_rate": 7.423388920072792e-06, + "loss": 1.6504, + "step": 50950 + }, + { + "epoch": 1.8117500666607413, + "grad_norm": 1.9121003150939941, + "learning_rate": 7.419602341582779e-06, + "loss": 1.6757, + "step": 50960 + }, + { + "epoch": 1.8121055906141676, + "grad_norm": 2.0025718212127686, + "learning_rate": 7.415816159425666e-06, + "loss": 1.6708, + "step": 50970 + }, + { + "epoch": 1.812461114567594, + "grad_norm": 1.9708784818649292, + "learning_rate": 7.412030374182989e-06, + "loss": 1.6861, + "step": 50980 + }, + { + "epoch": 1.8128166385210203, + "grad_norm": 2.0150258541107178, + "learning_rate": 7.408244986436222e-06, + "loss": 1.6494, + "step": 50990 + }, + { + "epoch": 1.8131721624744466, + "grad_norm": 1.8205463886260986, + "learning_rate": 7.404459996766773e-06, + "loss": 1.6255, + "step": 51000 + }, + { + "epoch": 1.813527686427873, + "grad_norm": 1.8556621074676514, + "learning_rate": 7.400675405755994e-06, + "loss": 1.5942, + "step": 51010 + }, + { + "epoch": 1.8138832103812994, + "grad_norm": 1.997451663017273, + "learning_rate": 7.3968912139851735e-06, + "loss": 1.6468, + "step": 51020 + }, + { + "epoch": 1.814238734334726, + "grad_norm": 1.9072209596633911, + "learning_rate": 7.393107422035547e-06, + "loss": 1.6418, + "step": 51030 + }, + { + "epoch": 1.8145942582881522, + "grad_norm": 1.880759835243225, + "learning_rate": 7.3893240304882694e-06, + "loss": 1.6816, + "step": 51040 + }, + { + "epoch": 1.8149497822415785, + "grad_norm": 1.8945934772491455, + "learning_rate": 7.385541039924453e-06, + "loss": 1.6307, + "step": 51050 + }, + { + "epoch": 1.8153053061950049, + "grad_norm": 1.8671655654907227, + "learning_rate": 7.381758450925141e-06, + "loss": 1.6668, + "step": 51060 + }, + { + "epoch": 1.8156608301484312, + "grad_norm": 1.8045915365219116, + "learning_rate": 7.377976264071314e-06, + "loss": 1.6522, + "step": 51070 + }, + { + "epoch": 1.8160163541018575, + "grad_norm": 1.8873698711395264, + "learning_rate": 7.374194479943892e-06, + "loss": 1.6658, + "step": 51080 + }, + { + "epoch": 1.816371878055284, + "grad_norm": 2.0109992027282715, + "learning_rate": 7.370413099123732e-06, + "loss": 1.6037, + "step": 51090 + }, + { + "epoch": 1.8167274020087103, + "grad_norm": 1.9579392671585083, + "learning_rate": 7.366632122191635e-06, + "loss": 1.6547, + "step": 51100 + }, + { + "epoch": 1.8170829259621368, + "grad_norm": 1.8878474235534668, + "learning_rate": 7.362851549728334e-06, + "loss": 1.6412, + "step": 51110 + }, + { + "epoch": 1.8174384499155631, + "grad_norm": 2.0014569759368896, + "learning_rate": 7.359071382314497e-06, + "loss": 1.6627, + "step": 51120 + }, + { + "epoch": 1.8177939738689894, + "grad_norm": 1.850886344909668, + "learning_rate": 7.3552916205307375e-06, + "loss": 1.6991, + "step": 51130 + }, + { + "epoch": 1.8181494978224157, + "grad_norm": 1.8765913248062134, + "learning_rate": 7.351512264957602e-06, + "loss": 1.6695, + "step": 51140 + }, + { + "epoch": 1.818505021775842, + "grad_norm": 1.9025858640670776, + "learning_rate": 7.347733316175577e-06, + "loss": 1.6472, + "step": 51150 + }, + { + "epoch": 1.8188605457292684, + "grad_norm": 1.9211353063583374, + "learning_rate": 7.343954774765085e-06, + "loss": 1.6251, + "step": 51160 + }, + { + "epoch": 1.819216069682695, + "grad_norm": 1.9756635427474976, + "learning_rate": 7.340176641306488e-06, + "loss": 1.6854, + "step": 51170 + }, + { + "epoch": 1.8195715936361212, + "grad_norm": 1.8511515855789185, + "learning_rate": 7.3363989163800786e-06, + "loss": 1.6373, + "step": 51180 + }, + { + "epoch": 1.8199271175895477, + "grad_norm": 1.8389215469360352, + "learning_rate": 7.332621600566101e-06, + "loss": 1.6088, + "step": 51190 + }, + { + "epoch": 1.820282641542974, + "grad_norm": 2.011458396911621, + "learning_rate": 7.328844694444714e-06, + "loss": 1.6239, + "step": 51200 + }, + { + "epoch": 1.8206381654964003, + "grad_norm": 1.9376658201217651, + "learning_rate": 7.325068198596037e-06, + "loss": 1.6479, + "step": 51210 + }, + { + "epoch": 1.8209936894498266, + "grad_norm": 1.9281283617019653, + "learning_rate": 7.32129211360011e-06, + "loss": 1.6244, + "step": 51220 + }, + { + "epoch": 1.821349213403253, + "grad_norm": 1.9247887134552002, + "learning_rate": 7.317516440036921e-06, + "loss": 1.6828, + "step": 51230 + }, + { + "epoch": 1.8217047373566793, + "grad_norm": 2.0128231048583984, + "learning_rate": 7.3137411784863875e-06, + "loss": 1.671, + "step": 51240 + }, + { + "epoch": 1.8220602613101058, + "grad_norm": 1.8623316287994385, + "learning_rate": 7.309966329528364e-06, + "loss": 1.6425, + "step": 51250 + }, + { + "epoch": 1.822415785263532, + "grad_norm": 1.8655879497528076, + "learning_rate": 7.306191893742647e-06, + "loss": 1.6237, + "step": 51260 + }, + { + "epoch": 1.8227713092169586, + "grad_norm": 2.163519859313965, + "learning_rate": 7.302417871708965e-06, + "loss": 1.624, + "step": 51270 + }, + { + "epoch": 1.823126833170385, + "grad_norm": 1.7776243686676025, + "learning_rate": 7.2986442640069825e-06, + "loss": 1.6252, + "step": 51280 + }, + { + "epoch": 1.8234823571238112, + "grad_norm": 1.8731697797775269, + "learning_rate": 7.294871071216304e-06, + "loss": 1.6459, + "step": 51290 + }, + { + "epoch": 1.8238378810772375, + "grad_norm": 2.144305944442749, + "learning_rate": 7.29109829391647e-06, + "loss": 1.6274, + "step": 51300 + }, + { + "epoch": 1.8241934050306639, + "grad_norm": 2.031801223754883, + "learning_rate": 7.287325932686951e-06, + "loss": 1.6162, + "step": 51310 + }, + { + "epoch": 1.8245489289840902, + "grad_norm": 1.8657842874526978, + "learning_rate": 7.283553988107159e-06, + "loss": 1.6696, + "step": 51320 + }, + { + "epoch": 1.8249044529375167, + "grad_norm": 1.7387397289276123, + "learning_rate": 7.279782460756444e-06, + "loss": 1.6042, + "step": 51330 + }, + { + "epoch": 1.825259976890943, + "grad_norm": 2.1133036613464355, + "learning_rate": 7.276011351214086e-06, + "loss": 1.6408, + "step": 51340 + }, + { + "epoch": 1.8256155008443695, + "grad_norm": 1.9837652444839478, + "learning_rate": 7.2722406600593085e-06, + "loss": 1.6293, + "step": 51350 + }, + { + "epoch": 1.8259710247977958, + "grad_norm": 1.8695124387741089, + "learning_rate": 7.26847038787126e-06, + "loss": 1.6729, + "step": 51360 + }, + { + "epoch": 1.8263265487512221, + "grad_norm": 1.9198970794677734, + "learning_rate": 7.264700535229034e-06, + "loss": 1.5999, + "step": 51370 + }, + { + "epoch": 1.8266820727046484, + "grad_norm": 2.110482692718506, + "learning_rate": 7.260931102711655e-06, + "loss": 1.6974, + "step": 51380 + }, + { + "epoch": 1.8270375966580747, + "grad_norm": 1.60737943649292, + "learning_rate": 7.257162090898082e-06, + "loss": 1.6974, + "step": 51390 + }, + { + "epoch": 1.827393120611501, + "grad_norm": 1.9014869928359985, + "learning_rate": 7.2533935003672155e-06, + "loss": 1.6543, + "step": 51400 + }, + { + "epoch": 1.8277486445649276, + "grad_norm": 1.9607372283935547, + "learning_rate": 7.2496253316978845e-06, + "loss": 1.6592, + "step": 51410 + }, + { + "epoch": 1.828104168518354, + "grad_norm": 1.8342626094818115, + "learning_rate": 7.245857585468859e-06, + "loss": 1.6359, + "step": 51420 + }, + { + "epoch": 1.8284596924717804, + "grad_norm": 1.9489827156066895, + "learning_rate": 7.242090262258843e-06, + "loss": 1.6203, + "step": 51430 + }, + { + "epoch": 1.8288152164252067, + "grad_norm": 1.9720518589019775, + "learning_rate": 7.238323362646467e-06, + "loss": 1.6486, + "step": 51440 + }, + { + "epoch": 1.829170740378633, + "grad_norm": 1.8984580039978027, + "learning_rate": 7.2345568872103066e-06, + "loss": 1.6462, + "step": 51450 + }, + { + "epoch": 1.8295262643320593, + "grad_norm": 1.8438483476638794, + "learning_rate": 7.230790836528868e-06, + "loss": 1.6396, + "step": 51460 + }, + { + "epoch": 1.8298817882854856, + "grad_norm": 2.130063772201538, + "learning_rate": 7.227025211180595e-06, + "loss": 1.636, + "step": 51470 + }, + { + "epoch": 1.830237312238912, + "grad_norm": 1.7676109075546265, + "learning_rate": 7.223260011743864e-06, + "loss": 1.668, + "step": 51480 + }, + { + "epoch": 1.8305928361923385, + "grad_norm": 1.8324123620986938, + "learning_rate": 7.219495238796984e-06, + "loss": 1.6186, + "step": 51490 + }, + { + "epoch": 1.8309483601457648, + "grad_norm": 1.9791982173919678, + "learning_rate": 7.2157308929182015e-06, + "loss": 1.6493, + "step": 51500 + }, + { + "epoch": 1.8313038840991913, + "grad_norm": 1.9997572898864746, + "learning_rate": 7.211966974685696e-06, + "loss": 1.6644, + "step": 51510 + }, + { + "epoch": 1.8316594080526176, + "grad_norm": 1.9054361581802368, + "learning_rate": 7.208203484677585e-06, + "loss": 1.6271, + "step": 51520 + }, + { + "epoch": 1.832014932006044, + "grad_norm": 1.9085414409637451, + "learning_rate": 7.204440423471912e-06, + "loss": 1.6195, + "step": 51530 + }, + { + "epoch": 1.8323704559594702, + "grad_norm": 1.890094518661499, + "learning_rate": 7.20067779164666e-06, + "loss": 1.639, + "step": 51540 + }, + { + "epoch": 1.8327259799128965, + "grad_norm": 1.8796864748001099, + "learning_rate": 7.196915589779751e-06, + "loss": 1.6624, + "step": 51550 + }, + { + "epoch": 1.8330815038663228, + "grad_norm": 2.056006669998169, + "learning_rate": 7.193153818449028e-06, + "loss": 1.6749, + "step": 51560 + }, + { + "epoch": 1.8334370278197494, + "grad_norm": 1.8881081342697144, + "learning_rate": 7.18939247823228e-06, + "loss": 1.612, + "step": 51570 + }, + { + "epoch": 1.8337925517731757, + "grad_norm": 1.8354383707046509, + "learning_rate": 7.185631569707225e-06, + "loss": 1.6587, + "step": 51580 + }, + { + "epoch": 1.8341480757266022, + "grad_norm": 1.985830307006836, + "learning_rate": 7.181871093451516e-06, + "loss": 1.6561, + "step": 51590 + }, + { + "epoch": 1.8345035996800285, + "grad_norm": 1.9907962083816528, + "learning_rate": 7.178111050042735e-06, + "loss": 1.6557, + "step": 51600 + }, + { + "epoch": 1.8348591236334548, + "grad_norm": 1.9100240468978882, + "learning_rate": 7.174351440058407e-06, + "loss": 1.6453, + "step": 51610 + }, + { + "epoch": 1.8352146475868811, + "grad_norm": 1.8884332180023193, + "learning_rate": 7.1705922640759775e-06, + "loss": 1.6647, + "step": 51620 + }, + { + "epoch": 1.8355701715403074, + "grad_norm": 1.9803351163864136, + "learning_rate": 7.1668335226728355e-06, + "loss": 1.6602, + "step": 51630 + }, + { + "epoch": 1.8359256954937337, + "grad_norm": 2.021226406097412, + "learning_rate": 7.1630752164263015e-06, + "loss": 1.6323, + "step": 51640 + }, + { + "epoch": 1.8362812194471603, + "grad_norm": 2.012850761413574, + "learning_rate": 7.159317345913626e-06, + "loss": 1.6705, + "step": 51650 + }, + { + "epoch": 1.8366367434005866, + "grad_norm": 1.9528616666793823, + "learning_rate": 7.155559911711998e-06, + "loss": 1.6693, + "step": 51660 + }, + { + "epoch": 1.836992267354013, + "grad_norm": 2.012749433517456, + "learning_rate": 7.151802914398529e-06, + "loss": 1.6858, + "step": 51670 + }, + { + "epoch": 1.8373477913074394, + "grad_norm": 1.9602713584899902, + "learning_rate": 7.14804635455028e-06, + "loss": 1.6632, + "step": 51680 + }, + { + "epoch": 1.8377033152608657, + "grad_norm": 1.8004664182662964, + "learning_rate": 7.144290232744224e-06, + "loss": 1.6309, + "step": 51690 + }, + { + "epoch": 1.838058839214292, + "grad_norm": 1.745529294013977, + "learning_rate": 7.140534549557283e-06, + "loss": 1.6526, + "step": 51700 + }, + { + "epoch": 1.8384143631677183, + "grad_norm": 1.8921000957489014, + "learning_rate": 7.136779305566306e-06, + "loss": 1.6416, + "step": 51710 + }, + { + "epoch": 1.8387698871211446, + "grad_norm": 2.0046639442443848, + "learning_rate": 7.133024501348075e-06, + "loss": 1.6262, + "step": 51720 + }, + { + "epoch": 1.8391254110745712, + "grad_norm": 2.1108291149139404, + "learning_rate": 7.129270137479305e-06, + "loss": 1.6445, + "step": 51730 + }, + { + "epoch": 1.8394809350279975, + "grad_norm": 1.8767834901809692, + "learning_rate": 7.125516214536643e-06, + "loss": 1.6542, + "step": 51740 + }, + { + "epoch": 1.839836458981424, + "grad_norm": 1.8338385820388794, + "learning_rate": 7.121762733096666e-06, + "loss": 1.675, + "step": 51750 + }, + { + "epoch": 1.8401919829348503, + "grad_norm": 1.8881033658981323, + "learning_rate": 7.118009693735888e-06, + "loss": 1.6364, + "step": 51760 + }, + { + "epoch": 1.8405475068882766, + "grad_norm": 2.1846683025360107, + "learning_rate": 7.114257097030749e-06, + "loss": 1.6522, + "step": 51770 + }, + { + "epoch": 1.840903030841703, + "grad_norm": 1.8050435781478882, + "learning_rate": 7.110504943557627e-06, + "loss": 1.6297, + "step": 51780 + }, + { + "epoch": 1.8412585547951292, + "grad_norm": 1.8365222215652466, + "learning_rate": 7.10675323389283e-06, + "loss": 1.6818, + "step": 51790 + }, + { + "epoch": 1.8416140787485555, + "grad_norm": 1.8047209978103638, + "learning_rate": 7.103001968612593e-06, + "loss": 1.6475, + "step": 51800 + }, + { + "epoch": 1.841969602701982, + "grad_norm": 1.920447826385498, + "learning_rate": 7.0992511482930905e-06, + "loss": 1.6629, + "step": 51810 + }, + { + "epoch": 1.8423251266554084, + "grad_norm": 1.8633935451507568, + "learning_rate": 7.095500773510423e-06, + "loss": 1.6475, + "step": 51820 + }, + { + "epoch": 1.842680650608835, + "grad_norm": 2.0102744102478027, + "learning_rate": 7.0917508448406256e-06, + "loss": 1.6218, + "step": 51830 + }, + { + "epoch": 1.8430361745622612, + "grad_norm": 2.0085699558258057, + "learning_rate": 7.0880013628596675e-06, + "loss": 1.6499, + "step": 51840 + }, + { + "epoch": 1.8433916985156875, + "grad_norm": 1.9968560934066772, + "learning_rate": 7.084252328143437e-06, + "loss": 1.6404, + "step": 51850 + }, + { + "epoch": 1.8437472224691138, + "grad_norm": 1.7302302122116089, + "learning_rate": 7.080503741267768e-06, + "loss": 1.6511, + "step": 51860 + }, + { + "epoch": 1.8441027464225401, + "grad_norm": 1.8278783559799194, + "learning_rate": 7.07675560280842e-06, + "loss": 1.6605, + "step": 51870 + }, + { + "epoch": 1.8444582703759664, + "grad_norm": 1.9455891847610474, + "learning_rate": 7.0730079133410825e-06, + "loss": 1.6489, + "step": 51880 + }, + { + "epoch": 1.844813794329393, + "grad_norm": 2.237320899963379, + "learning_rate": 7.069260673441376e-06, + "loss": 1.6566, + "step": 51890 + }, + { + "epoch": 1.8451693182828193, + "grad_norm": 2.096794843673706, + "learning_rate": 7.065513883684853e-06, + "loss": 1.6156, + "step": 51900 + }, + { + "epoch": 1.8455248422362458, + "grad_norm": 1.753090500831604, + "learning_rate": 7.061767544647e-06, + "loss": 1.6459, + "step": 51910 + }, + { + "epoch": 1.845880366189672, + "grad_norm": 1.887935996055603, + "learning_rate": 7.058021656903231e-06, + "loss": 1.6211, + "step": 51920 + }, + { + "epoch": 1.8462358901430984, + "grad_norm": 1.9207141399383545, + "learning_rate": 7.054276221028886e-06, + "loss": 1.6498, + "step": 51930 + }, + { + "epoch": 1.8465914140965247, + "grad_norm": 2.0247364044189453, + "learning_rate": 7.050531237599244e-06, + "loss": 1.6259, + "step": 51940 + }, + { + "epoch": 1.846946938049951, + "grad_norm": 1.835836410522461, + "learning_rate": 7.04678670718951e-06, + "loss": 1.6194, + "step": 51950 + }, + { + "epoch": 1.8473024620033773, + "grad_norm": 1.896708369255066, + "learning_rate": 7.043042630374822e-06, + "loss": 1.6359, + "step": 51960 + }, + { + "epoch": 1.8476579859568039, + "grad_norm": 1.8522411584854126, + "learning_rate": 7.039299007730248e-06, + "loss": 1.6118, + "step": 51970 + }, + { + "epoch": 1.8480135099102302, + "grad_norm": 1.8102995157241821, + "learning_rate": 7.03555583983078e-06, + "loss": 1.7098, + "step": 51980 + }, + { + "epoch": 1.8483690338636567, + "grad_norm": 1.7815966606140137, + "learning_rate": 7.031813127251348e-06, + "loss": 1.6347, + "step": 51990 + }, + { + "epoch": 1.848724557817083, + "grad_norm": 1.9562181234359741, + "learning_rate": 7.028070870566813e-06, + "loss": 1.6436, + "step": 52000 + }, + { + "epoch": 1.8490800817705093, + "grad_norm": 2.038663387298584, + "learning_rate": 7.024329070351954e-06, + "loss": 1.6473, + "step": 52010 + }, + { + "epoch": 1.8494356057239356, + "grad_norm": 1.9478405714035034, + "learning_rate": 7.0205877271814914e-06, + "loss": 1.6152, + "step": 52020 + }, + { + "epoch": 1.849791129677362, + "grad_norm": 1.9415020942687988, + "learning_rate": 7.016846841630074e-06, + "loss": 1.5909, + "step": 52030 + }, + { + "epoch": 1.8501466536307882, + "grad_norm": 1.7721590995788574, + "learning_rate": 7.0131064142722775e-06, + "loss": 1.595, + "step": 52040 + }, + { + "epoch": 1.8505021775842148, + "grad_norm": 1.9393048286437988, + "learning_rate": 7.00936644568261e-06, + "loss": 1.6165, + "step": 52050 + }, + { + "epoch": 1.850857701537641, + "grad_norm": 1.960496425628662, + "learning_rate": 7.005626936435501e-06, + "loss": 1.6196, + "step": 52060 + }, + { + "epoch": 1.8512132254910676, + "grad_norm": 1.8558365106582642, + "learning_rate": 7.0018878871053205e-06, + "loss": 1.6447, + "step": 52070 + }, + { + "epoch": 1.851568749444494, + "grad_norm": 1.9220082759857178, + "learning_rate": 6.998149298266364e-06, + "loss": 1.6293, + "step": 52080 + }, + { + "epoch": 1.8519242733979202, + "grad_norm": 2.0755198001861572, + "learning_rate": 6.994411170492852e-06, + "loss": 1.6548, + "step": 52090 + }, + { + "epoch": 1.8522797973513465, + "grad_norm": 1.7107176780700684, + "learning_rate": 6.99067350435894e-06, + "loss": 1.6267, + "step": 52100 + }, + { + "epoch": 1.8526353213047728, + "grad_norm": 2.0055670738220215, + "learning_rate": 6.986936300438709e-06, + "loss": 1.6632, + "step": 52110 + }, + { + "epoch": 1.8529908452581991, + "grad_norm": 1.9524128437042236, + "learning_rate": 6.9831995593061695e-06, + "loss": 1.6436, + "step": 52120 + }, + { + "epoch": 1.8533463692116257, + "grad_norm": 1.8308955430984497, + "learning_rate": 6.979463281535263e-06, + "loss": 1.6158, + "step": 52130 + }, + { + "epoch": 1.853701893165052, + "grad_norm": 1.8957483768463135, + "learning_rate": 6.975727467699856e-06, + "loss": 1.6426, + "step": 52140 + }, + { + "epoch": 1.8540574171184785, + "grad_norm": 1.9096064567565918, + "learning_rate": 6.971992118373751e-06, + "loss": 1.6047, + "step": 52150 + }, + { + "epoch": 1.8544129410719048, + "grad_norm": 1.9696846008300781, + "learning_rate": 6.96825723413067e-06, + "loss": 1.6335, + "step": 52160 + }, + { + "epoch": 1.854768465025331, + "grad_norm": 1.9123235940933228, + "learning_rate": 6.964522815544267e-06, + "loss": 1.6384, + "step": 52170 + }, + { + "epoch": 1.8551239889787574, + "grad_norm": 1.8498286008834839, + "learning_rate": 6.960788863188128e-06, + "loss": 1.69, + "step": 52180 + }, + { + "epoch": 1.8554795129321837, + "grad_norm": 1.8621598482131958, + "learning_rate": 6.957055377635763e-06, + "loss": 1.6193, + "step": 52190 + }, + { + "epoch": 1.85583503688561, + "grad_norm": 2.083085298538208, + "learning_rate": 6.953322359460613e-06, + "loss": 1.6396, + "step": 52200 + }, + { + "epoch": 1.8561905608390366, + "grad_norm": 1.911636233329773, + "learning_rate": 6.949589809236044e-06, + "loss": 1.614, + "step": 52210 + }, + { + "epoch": 1.8565460847924629, + "grad_norm": 1.8892377614974976, + "learning_rate": 6.945857727535355e-06, + "loss": 1.6627, + "step": 52220 + }, + { + "epoch": 1.8569016087458894, + "grad_norm": 1.8188692331314087, + "learning_rate": 6.942126114931771e-06, + "loss": 1.7005, + "step": 52230 + }, + { + "epoch": 1.8572571326993157, + "grad_norm": 1.85429048538208, + "learning_rate": 6.938394971998441e-06, + "loss": 1.6463, + "step": 52240 + }, + { + "epoch": 1.857612656652742, + "grad_norm": 2.153946876525879, + "learning_rate": 6.934664299308447e-06, + "loss": 1.6127, + "step": 52250 + }, + { + "epoch": 1.8579681806061683, + "grad_norm": 1.8729736804962158, + "learning_rate": 6.930934097434798e-06, + "loss": 1.6296, + "step": 52260 + }, + { + "epoch": 1.8583237045595946, + "grad_norm": 2.1069846153259277, + "learning_rate": 6.927204366950426e-06, + "loss": 1.64, + "step": 52270 + }, + { + "epoch": 1.858679228513021, + "grad_norm": 2.0351064205169678, + "learning_rate": 6.9234751084282e-06, + "loss": 1.5966, + "step": 52280 + }, + { + "epoch": 1.8590347524664474, + "grad_norm": 1.7966029644012451, + "learning_rate": 6.919746322440905e-06, + "loss": 1.6567, + "step": 52290 + }, + { + "epoch": 1.8593902764198738, + "grad_norm": 1.9065757989883423, + "learning_rate": 6.916018009561263e-06, + "loss": 1.6614, + "step": 52300 + }, + { + "epoch": 1.8597458003733003, + "grad_norm": 1.963813066482544, + "learning_rate": 6.9122901703619164e-06, + "loss": 1.6268, + "step": 52310 + }, + { + "epoch": 1.8601013243267266, + "grad_norm": 1.971492052078247, + "learning_rate": 6.908562805415444e-06, + "loss": 1.6525, + "step": 52320 + }, + { + "epoch": 1.860456848280153, + "grad_norm": 2.0255093574523926, + "learning_rate": 6.904835915294337e-06, + "loss": 1.6332, + "step": 52330 + }, + { + "epoch": 1.8608123722335792, + "grad_norm": 2.054584503173828, + "learning_rate": 6.901109500571026e-06, + "loss": 1.6178, + "step": 52340 + }, + { + "epoch": 1.8611678961870055, + "grad_norm": 1.9593502283096313, + "learning_rate": 6.897383561817867e-06, + "loss": 1.6165, + "step": 52350 + }, + { + "epoch": 1.8615234201404318, + "grad_norm": 1.9384634494781494, + "learning_rate": 6.893658099607141e-06, + "loss": 1.656, + "step": 52360 + }, + { + "epoch": 1.8618789440938583, + "grad_norm": 2.130075454711914, + "learning_rate": 6.889933114511052e-06, + "loss": 1.6586, + "step": 52370 + }, + { + "epoch": 1.8622344680472847, + "grad_norm": 2.0572383403778076, + "learning_rate": 6.886208607101734e-06, + "loss": 1.6845, + "step": 52380 + }, + { + "epoch": 1.8625899920007112, + "grad_norm": 2.1621816158294678, + "learning_rate": 6.882484577951254e-06, + "loss": 1.6646, + "step": 52390 + }, + { + "epoch": 1.8629455159541375, + "grad_norm": 1.9399415254592896, + "learning_rate": 6.878761027631593e-06, + "loss": 1.6042, + "step": 52400 + }, + { + "epoch": 1.8633010399075638, + "grad_norm": 1.8925954103469849, + "learning_rate": 6.875037956714672e-06, + "loss": 1.619, + "step": 52410 + }, + { + "epoch": 1.86365656386099, + "grad_norm": 1.9047517776489258, + "learning_rate": 6.871315365772324e-06, + "loss": 1.6381, + "step": 52420 + }, + { + "epoch": 1.8640120878144164, + "grad_norm": 2.014160394668579, + "learning_rate": 6.867593255376319e-06, + "loss": 1.6567, + "step": 52430 + }, + { + "epoch": 1.8643676117678427, + "grad_norm": 1.8610714673995972, + "learning_rate": 6.863871626098349e-06, + "loss": 1.6461, + "step": 52440 + }, + { + "epoch": 1.8647231357212692, + "grad_norm": 1.9440066814422607, + "learning_rate": 6.860150478510035e-06, + "loss": 1.618, + "step": 52450 + }, + { + "epoch": 1.8650786596746955, + "grad_norm": 1.9076669216156006, + "learning_rate": 6.856429813182919e-06, + "loss": 1.6221, + "step": 52460 + }, + { + "epoch": 1.865434183628122, + "grad_norm": 1.999401330947876, + "learning_rate": 6.852709630688477e-06, + "loss": 1.6114, + "step": 52470 + }, + { + "epoch": 1.8657897075815484, + "grad_norm": 1.7853844165802002, + "learning_rate": 6.8489899315981e-06, + "loss": 1.5854, + "step": 52480 + }, + { + "epoch": 1.8661452315349747, + "grad_norm": 1.9500246047973633, + "learning_rate": 6.845270716483114e-06, + "loss": 1.6231, + "step": 52490 + }, + { + "epoch": 1.866500755488401, + "grad_norm": 1.7870290279388428, + "learning_rate": 6.841551985914763e-06, + "loss": 1.6524, + "step": 52500 + }, + { + "epoch": 1.8668562794418273, + "grad_norm": 1.970482587814331, + "learning_rate": 6.837833740464224e-06, + "loss": 1.6487, + "step": 52510 + }, + { + "epoch": 1.8672118033952536, + "grad_norm": 1.7473478317260742, + "learning_rate": 6.834115980702595e-06, + "loss": 1.6268, + "step": 52520 + }, + { + "epoch": 1.8675673273486801, + "grad_norm": 1.974705696105957, + "learning_rate": 6.8303987072009005e-06, + "loss": 1.6069, + "step": 52530 + }, + { + "epoch": 1.8679228513021064, + "grad_norm": 1.9087203741073608, + "learning_rate": 6.826681920530093e-06, + "loss": 1.611, + "step": 52540 + }, + { + "epoch": 1.868278375255533, + "grad_norm": 1.8926479816436768, + "learning_rate": 6.8229656212610465e-06, + "loss": 1.6678, + "step": 52550 + }, + { + "epoch": 1.8686338992089593, + "grad_norm": 1.916961669921875, + "learning_rate": 6.819249809964557e-06, + "loss": 1.6275, + "step": 52560 + }, + { + "epoch": 1.8689894231623856, + "grad_norm": 2.004983425140381, + "learning_rate": 6.815534487211355e-06, + "loss": 1.629, + "step": 52570 + }, + { + "epoch": 1.869344947115812, + "grad_norm": 1.826501488685608, + "learning_rate": 6.811819653572088e-06, + "loss": 1.6336, + "step": 52580 + }, + { + "epoch": 1.8697004710692382, + "grad_norm": 1.9561131000518799, + "learning_rate": 6.808105309617334e-06, + "loss": 1.6884, + "step": 52590 + }, + { + "epoch": 1.8700559950226645, + "grad_norm": 1.9927582740783691, + "learning_rate": 6.804391455917591e-06, + "loss": 1.6628, + "step": 52600 + }, + { + "epoch": 1.870411518976091, + "grad_norm": 1.850220799446106, + "learning_rate": 6.8006780930432825e-06, + "loss": 1.6724, + "step": 52610 + }, + { + "epoch": 1.8707670429295173, + "grad_norm": 1.8439356088638306, + "learning_rate": 6.7969652215647595e-06, + "loss": 1.6149, + "step": 52620 + }, + { + "epoch": 1.8711225668829439, + "grad_norm": 1.9668513536453247, + "learning_rate": 6.793252842052294e-06, + "loss": 1.6607, + "step": 52630 + }, + { + "epoch": 1.8714780908363702, + "grad_norm": 1.8955330848693848, + "learning_rate": 6.78954095507609e-06, + "loss": 1.6587, + "step": 52640 + }, + { + "epoch": 1.8718336147897965, + "grad_norm": 1.7684237957000732, + "learning_rate": 6.785829561206263e-06, + "loss": 1.6449, + "step": 52650 + }, + { + "epoch": 1.8721891387432228, + "grad_norm": 1.9502431154251099, + "learning_rate": 6.782118661012861e-06, + "loss": 1.6575, + "step": 52660 + }, + { + "epoch": 1.872544662696649, + "grad_norm": 2.061988353729248, + "learning_rate": 6.778408255065858e-06, + "loss": 1.68, + "step": 52670 + }, + { + "epoch": 1.8729001866500754, + "grad_norm": 1.8865063190460205, + "learning_rate": 6.7746983439351465e-06, + "loss": 1.6484, + "step": 52680 + }, + { + "epoch": 1.873255710603502, + "grad_norm": 1.9195656776428223, + "learning_rate": 6.770988928190547e-06, + "loss": 1.6131, + "step": 52690 + }, + { + "epoch": 1.8736112345569282, + "grad_norm": 1.9573023319244385, + "learning_rate": 6.767280008401801e-06, + "loss": 1.6546, + "step": 52700 + }, + { + "epoch": 1.8739667585103548, + "grad_norm": 1.9096479415893555, + "learning_rate": 6.763571585138578e-06, + "loss": 1.6396, + "step": 52710 + }, + { + "epoch": 1.874322282463781, + "grad_norm": 1.8641668558120728, + "learning_rate": 6.759863658970467e-06, + "loss": 1.6524, + "step": 52720 + }, + { + "epoch": 1.8746778064172074, + "grad_norm": 2.046111822128296, + "learning_rate": 6.756156230466981e-06, + "loss": 1.6586, + "step": 52730 + }, + { + "epoch": 1.8750333303706337, + "grad_norm": 2.072157382965088, + "learning_rate": 6.752449300197559e-06, + "loss": 1.6432, + "step": 52740 + }, + { + "epoch": 1.87538885432406, + "grad_norm": 1.7518972158432007, + "learning_rate": 6.7487428687315615e-06, + "loss": 1.6357, + "step": 52750 + }, + { + "epoch": 1.8757443782774863, + "grad_norm": 2.038274049758911, + "learning_rate": 6.745036936638274e-06, + "loss": 1.6059, + "step": 52760 + }, + { + "epoch": 1.8760999022309128, + "grad_norm": 1.9318206310272217, + "learning_rate": 6.741331504486905e-06, + "loss": 1.648, + "step": 52770 + }, + { + "epoch": 1.8764554261843391, + "grad_norm": 2.032932758331299, + "learning_rate": 6.7376265728465865e-06, + "loss": 1.6737, + "step": 52780 + }, + { + "epoch": 1.8768109501377657, + "grad_norm": 2.2127063274383545, + "learning_rate": 6.733922142286368e-06, + "loss": 1.6334, + "step": 52790 + }, + { + "epoch": 1.877166474091192, + "grad_norm": 1.8505887985229492, + "learning_rate": 6.730218213375237e-06, + "loss": 1.6206, + "step": 52800 + }, + { + "epoch": 1.8775219980446183, + "grad_norm": 2.0579097270965576, + "learning_rate": 6.72651478668208e-06, + "loss": 1.6816, + "step": 52810 + }, + { + "epoch": 1.8778775219980446, + "grad_norm": 1.8971630334854126, + "learning_rate": 6.7228118627757275e-06, + "loss": 1.5917, + "step": 52820 + }, + { + "epoch": 1.878233045951471, + "grad_norm": 1.8610026836395264, + "learning_rate": 6.7191094422249244e-06, + "loss": 1.6413, + "step": 52830 + }, + { + "epoch": 1.8785885699048972, + "grad_norm": 1.9640917778015137, + "learning_rate": 6.715407525598341e-06, + "loss": 1.6587, + "step": 52840 + }, + { + "epoch": 1.8789440938583237, + "grad_norm": 1.9591315984725952, + "learning_rate": 6.711706113464569e-06, + "loss": 1.638, + "step": 52850 + }, + { + "epoch": 1.87929961781175, + "grad_norm": 1.8325140476226807, + "learning_rate": 6.7080052063921166e-06, + "loss": 1.6287, + "step": 52860 + }, + { + "epoch": 1.8796551417651766, + "grad_norm": 1.9015944004058838, + "learning_rate": 6.704304804949424e-06, + "loss": 1.6104, + "step": 52870 + }, + { + "epoch": 1.8800106657186029, + "grad_norm": 1.8562361001968384, + "learning_rate": 6.700604909704851e-06, + "loss": 1.5918, + "step": 52880 + }, + { + "epoch": 1.8803661896720292, + "grad_norm": 2.0939865112304688, + "learning_rate": 6.696905521226674e-06, + "loss": 1.619, + "step": 52890 + }, + { + "epoch": 1.8807217136254555, + "grad_norm": 1.8259873390197754, + "learning_rate": 6.693206640083101e-06, + "loss": 1.5921, + "step": 52900 + }, + { + "epoch": 1.8810772375788818, + "grad_norm": 1.8358393907546997, + "learning_rate": 6.689508266842254e-06, + "loss": 1.6305, + "step": 52910 + }, + { + "epoch": 1.881432761532308, + "grad_norm": 1.8782728910446167, + "learning_rate": 6.685810402072179e-06, + "loss": 1.6674, + "step": 52920 + }, + { + "epoch": 1.8817882854857346, + "grad_norm": 2.064119338989258, + "learning_rate": 6.682113046340846e-06, + "loss": 1.6177, + "step": 52930 + }, + { + "epoch": 1.882143809439161, + "grad_norm": 1.9853843450546265, + "learning_rate": 6.6784162002161465e-06, + "loss": 1.6494, + "step": 52940 + }, + { + "epoch": 1.8824993333925875, + "grad_norm": 1.725140929222107, + "learning_rate": 6.674719864265892e-06, + "loss": 1.6455, + "step": 52950 + }, + { + "epoch": 1.8828548573460138, + "grad_norm": 1.8963104486465454, + "learning_rate": 6.671024039057821e-06, + "loss": 1.6511, + "step": 52960 + }, + { + "epoch": 1.88321038129944, + "grad_norm": 1.8338735103607178, + "learning_rate": 6.667328725159579e-06, + "loss": 1.6808, + "step": 52970 + }, + { + "epoch": 1.8835659052528664, + "grad_norm": 1.9279541969299316, + "learning_rate": 6.663633923138753e-06, + "loss": 1.6262, + "step": 52980 + }, + { + "epoch": 1.8839214292062927, + "grad_norm": 1.9289398193359375, + "learning_rate": 6.659939633562833e-06, + "loss": 1.6397, + "step": 52990 + }, + { + "epoch": 1.884276953159719, + "grad_norm": 2.0128793716430664, + "learning_rate": 6.656245856999244e-06, + "loss": 1.6602, + "step": 53000 + }, + { + "epoch": 1.8846324771131455, + "grad_norm": 1.7492691278457642, + "learning_rate": 6.6525525940153265e-06, + "loss": 1.6003, + "step": 53010 + }, + { + "epoch": 1.8849880010665718, + "grad_norm": 1.9173500537872314, + "learning_rate": 6.648859845178342e-06, + "loss": 1.626, + "step": 53020 + }, + { + "epoch": 1.8853435250199984, + "grad_norm": 1.886384129524231, + "learning_rate": 6.645167611055474e-06, + "loss": 1.6548, + "step": 53030 + }, + { + "epoch": 1.8856990489734247, + "grad_norm": 2.015211343765259, + "learning_rate": 6.641475892213824e-06, + "loss": 1.7013, + "step": 53040 + }, + { + "epoch": 1.886054572926851, + "grad_norm": 1.8351353406906128, + "learning_rate": 6.637784689220421e-06, + "loss": 1.6402, + "step": 53050 + }, + { + "epoch": 1.8864100968802773, + "grad_norm": 1.9520881175994873, + "learning_rate": 6.634094002642207e-06, + "loss": 1.6562, + "step": 53060 + }, + { + "epoch": 1.8867656208337036, + "grad_norm": 1.868396282196045, + "learning_rate": 6.63040383304605e-06, + "loss": 1.6822, + "step": 53070 + }, + { + "epoch": 1.88712114478713, + "grad_norm": 1.8462475538253784, + "learning_rate": 6.626714180998737e-06, + "loss": 1.64, + "step": 53080 + }, + { + "epoch": 1.8874766687405564, + "grad_norm": 2.012273073196411, + "learning_rate": 6.623025047066976e-06, + "loss": 1.606, + "step": 53090 + }, + { + "epoch": 1.8878321926939827, + "grad_norm": 1.8039524555206299, + "learning_rate": 6.619336431817393e-06, + "loss": 1.6002, + "step": 53100 + }, + { + "epoch": 1.8881877166474093, + "grad_norm": 1.964730143547058, + "learning_rate": 6.615648335816536e-06, + "loss": 1.6452, + "step": 53110 + }, + { + "epoch": 1.8885432406008356, + "grad_norm": 1.8994137048721313, + "learning_rate": 6.61196075963088e-06, + "loss": 1.6654, + "step": 53120 + }, + { + "epoch": 1.8888987645542619, + "grad_norm": 1.8373494148254395, + "learning_rate": 6.608273703826804e-06, + "loss": 1.6285, + "step": 53130 + }, + { + "epoch": 1.8892542885076882, + "grad_norm": 1.860703468322754, + "learning_rate": 6.60458716897062e-06, + "loss": 1.6306, + "step": 53140 + }, + { + "epoch": 1.8896098124611145, + "grad_norm": 1.8882999420166016, + "learning_rate": 6.600901155628558e-06, + "loss": 1.6727, + "step": 53150 + }, + { + "epoch": 1.8899653364145408, + "grad_norm": 1.800947904586792, + "learning_rate": 6.597215664366767e-06, + "loss": 1.6457, + "step": 53160 + }, + { + "epoch": 1.8903208603679673, + "grad_norm": 1.9370545148849487, + "learning_rate": 6.593530695751314e-06, + "loss": 1.6583, + "step": 53170 + }, + { + "epoch": 1.8906763843213936, + "grad_norm": 1.8378711938858032, + "learning_rate": 6.589846250348186e-06, + "loss": 1.6085, + "step": 53180 + }, + { + "epoch": 1.8910319082748202, + "grad_norm": 1.7905361652374268, + "learning_rate": 6.586162328723294e-06, + "loss": 1.637, + "step": 53190 + }, + { + "epoch": 1.8913874322282465, + "grad_norm": 2.0296435356140137, + "learning_rate": 6.582478931442462e-06, + "loss": 1.6363, + "step": 53200 + }, + { + "epoch": 1.8917429561816728, + "grad_norm": 1.907057523727417, + "learning_rate": 6.578796059071437e-06, + "loss": 1.6487, + "step": 53210 + }, + { + "epoch": 1.892098480135099, + "grad_norm": 1.8881608247756958, + "learning_rate": 6.57511371217589e-06, + "loss": 1.6877, + "step": 53220 + }, + { + "epoch": 1.8924540040885254, + "grad_norm": 1.935365915298462, + "learning_rate": 6.571431891321401e-06, + "loss": 1.6318, + "step": 53230 + }, + { + "epoch": 1.8928095280419517, + "grad_norm": 2.105151653289795, + "learning_rate": 6.5677505970734745e-06, + "loss": 1.6447, + "step": 53240 + }, + { + "epoch": 1.8931650519953782, + "grad_norm": 1.9732235670089722, + "learning_rate": 6.564069829997537e-06, + "loss": 1.6506, + "step": 53250 + }, + { + "epoch": 1.8935205759488045, + "grad_norm": 1.8154934644699097, + "learning_rate": 6.5603895906589286e-06, + "loss": 1.6714, + "step": 53260 + }, + { + "epoch": 1.893876099902231, + "grad_norm": 1.8948172330856323, + "learning_rate": 6.556709879622916e-06, + "loss": 1.6852, + "step": 53270 + }, + { + "epoch": 1.8942316238556574, + "grad_norm": 1.943363070487976, + "learning_rate": 6.553030697454677e-06, + "loss": 1.6407, + "step": 53280 + }, + { + "epoch": 1.8945871478090837, + "grad_norm": 1.8452975749969482, + "learning_rate": 6.5493520447193085e-06, + "loss": 1.6285, + "step": 53290 + }, + { + "epoch": 1.89494267176251, + "grad_norm": 1.944027066230774, + "learning_rate": 6.54567392198183e-06, + "loss": 1.6307, + "step": 53300 + }, + { + "epoch": 1.8952981957159363, + "grad_norm": 1.9325518608093262, + "learning_rate": 6.541996329807177e-06, + "loss": 1.6061, + "step": 53310 + }, + { + "epoch": 1.8956537196693626, + "grad_norm": 2.05012845993042, + "learning_rate": 6.538319268760205e-06, + "loss": 1.62, + "step": 53320 + }, + { + "epoch": 1.896009243622789, + "grad_norm": 1.8536720275878906, + "learning_rate": 6.53464273940569e-06, + "loss": 1.6188, + "step": 53330 + }, + { + "epoch": 1.8963647675762154, + "grad_norm": 1.8141745328903198, + "learning_rate": 6.530966742308322e-06, + "loss": 1.6328, + "step": 53340 + }, + { + "epoch": 1.896720291529642, + "grad_norm": 1.8708752393722534, + "learning_rate": 6.5272912780327125e-06, + "loss": 1.627, + "step": 53350 + }, + { + "epoch": 1.8970758154830683, + "grad_norm": 1.889166235923767, + "learning_rate": 6.523616347143388e-06, + "loss": 1.6239, + "step": 53360 + }, + { + "epoch": 1.8974313394364946, + "grad_norm": 1.9003798961639404, + "learning_rate": 6.519941950204796e-06, + "loss": 1.6434, + "step": 53370 + }, + { + "epoch": 1.8977868633899209, + "grad_norm": 1.793718934059143, + "learning_rate": 6.516268087781298e-06, + "loss": 1.6483, + "step": 53380 + }, + { + "epoch": 1.8981423873433472, + "grad_norm": 1.900206208229065, + "learning_rate": 6.51259476043718e-06, + "loss": 1.6316, + "step": 53390 + }, + { + "epoch": 1.8984979112967735, + "grad_norm": 1.7711083889007568, + "learning_rate": 6.508921968736641e-06, + "loss": 1.6419, + "step": 53400 + }, + { + "epoch": 1.8988534352502, + "grad_norm": 2.0549914836883545, + "learning_rate": 6.505249713243798e-06, + "loss": 1.685, + "step": 53410 + }, + { + "epoch": 1.8992089592036263, + "grad_norm": 2.0399253368377686, + "learning_rate": 6.501577994522687e-06, + "loss": 1.6453, + "step": 53420 + }, + { + "epoch": 1.8995644831570528, + "grad_norm": 1.9935857057571411, + "learning_rate": 6.49790681313726e-06, + "loss": 1.6456, + "step": 53430 + }, + { + "epoch": 1.8999200071104791, + "grad_norm": 1.8322583436965942, + "learning_rate": 6.4942361696513936e-06, + "loss": 1.5947, + "step": 53440 + }, + { + "epoch": 1.9002755310639055, + "grad_norm": 1.8570095300674438, + "learning_rate": 6.490566064628865e-06, + "loss": 1.6, + "step": 53450 + }, + { + "epoch": 1.9006310550173318, + "grad_norm": 1.8600091934204102, + "learning_rate": 6.486896498633384e-06, + "loss": 1.662, + "step": 53460 + }, + { + "epoch": 1.900986578970758, + "grad_norm": 2.0022993087768555, + "learning_rate": 6.483227472228576e-06, + "loss": 1.5742, + "step": 53470 + }, + { + "epoch": 1.9013421029241844, + "grad_norm": 1.7619541883468628, + "learning_rate": 6.479558985977976e-06, + "loss": 1.6417, + "step": 53480 + }, + { + "epoch": 1.901697626877611, + "grad_norm": 1.8340739011764526, + "learning_rate": 6.475891040445043e-06, + "loss": 1.6716, + "step": 53490 + }, + { + "epoch": 1.9020531508310372, + "grad_norm": 1.8696131706237793, + "learning_rate": 6.472223636193149e-06, + "loss": 1.6595, + "step": 53500 + }, + { + "epoch": 1.9024086747844637, + "grad_norm": 1.9896914958953857, + "learning_rate": 6.468556773785585e-06, + "loss": 1.6487, + "step": 53510 + }, + { + "epoch": 1.90276419873789, + "grad_norm": 1.9387229681015015, + "learning_rate": 6.464890453785559e-06, + "loss": 1.6067, + "step": 53520 + }, + { + "epoch": 1.9031197226913164, + "grad_norm": 2.0417120456695557, + "learning_rate": 6.461224676756195e-06, + "loss": 1.6311, + "step": 53530 + }, + { + "epoch": 1.9034752466447427, + "grad_norm": 1.9006065130233765, + "learning_rate": 6.457559443260531e-06, + "loss": 1.653, + "step": 53540 + }, + { + "epoch": 1.903830770598169, + "grad_norm": 1.9895280599594116, + "learning_rate": 6.453894753861525e-06, + "loss": 1.5814, + "step": 53550 + }, + { + "epoch": 1.9041862945515953, + "grad_norm": 2.510457754135132, + "learning_rate": 6.450230609122052e-06, + "loss": 1.6378, + "step": 53560 + }, + { + "epoch": 1.9045418185050218, + "grad_norm": 1.9443553686141968, + "learning_rate": 6.446567009604898e-06, + "loss": 1.6472, + "step": 53570 + }, + { + "epoch": 1.904897342458448, + "grad_norm": 2.010951519012451, + "learning_rate": 6.442903955872775e-06, + "loss": 1.6459, + "step": 53580 + }, + { + "epoch": 1.9052528664118746, + "grad_norm": 1.9579919576644897, + "learning_rate": 6.439241448488298e-06, + "loss": 1.6313, + "step": 53590 + }, + { + "epoch": 1.905608390365301, + "grad_norm": 1.821547031402588, + "learning_rate": 6.4355794880140124e-06, + "loss": 1.6367, + "step": 53600 + }, + { + "epoch": 1.9059639143187272, + "grad_norm": 1.8498295545578003, + "learning_rate": 6.431918075012365e-06, + "loss": 1.6766, + "step": 53610 + }, + { + "epoch": 1.9063194382721536, + "grad_norm": 1.9463131427764893, + "learning_rate": 6.42825721004573e-06, + "loss": 1.6605, + "step": 53620 + }, + { + "epoch": 1.9066749622255799, + "grad_norm": 1.7999359369277954, + "learning_rate": 6.4245968936763905e-06, + "loss": 1.6405, + "step": 53630 + }, + { + "epoch": 1.9070304861790062, + "grad_norm": 1.837437629699707, + "learning_rate": 6.420937126466551e-06, + "loss": 1.6095, + "step": 53640 + }, + { + "epoch": 1.9073860101324327, + "grad_norm": 1.9130752086639404, + "learning_rate": 6.4172779089783276e-06, + "loss": 1.5937, + "step": 53650 + }, + { + "epoch": 1.907741534085859, + "grad_norm": 1.8691471815109253, + "learning_rate": 6.413619241773757e-06, + "loss": 1.6569, + "step": 53660 + }, + { + "epoch": 1.9080970580392855, + "grad_norm": 1.8947511911392212, + "learning_rate": 6.409961125414781e-06, + "loss": 1.6334, + "step": 53670 + }, + { + "epoch": 1.9084525819927118, + "grad_norm": 1.9409008026123047, + "learning_rate": 6.406303560463267e-06, + "loss": 1.6369, + "step": 53680 + }, + { + "epoch": 1.9088081059461381, + "grad_norm": 1.8441576957702637, + "learning_rate": 6.402646547480993e-06, + "loss": 1.6131, + "step": 53690 + }, + { + "epoch": 1.9091636298995645, + "grad_norm": 1.829941749572754, + "learning_rate": 6.398990087029653e-06, + "loss": 1.6402, + "step": 53700 + }, + { + "epoch": 1.9095191538529908, + "grad_norm": 1.9568201303482056, + "learning_rate": 6.39533417967086e-06, + "loss": 1.6263, + "step": 53710 + }, + { + "epoch": 1.909874677806417, + "grad_norm": 1.840928554534912, + "learning_rate": 6.391678825966134e-06, + "loss": 1.6257, + "step": 53720 + }, + { + "epoch": 1.9102302017598436, + "grad_norm": 2.085839033126831, + "learning_rate": 6.388024026476915e-06, + "loss": 1.6544, + "step": 53730 + }, + { + "epoch": 1.91058572571327, + "grad_norm": 1.9179319143295288, + "learning_rate": 6.384369781764559e-06, + "loss": 1.6979, + "step": 53740 + }, + { + "epoch": 1.9109412496666964, + "grad_norm": 1.9086040258407593, + "learning_rate": 6.380716092390333e-06, + "loss": 1.6438, + "step": 53750 + }, + { + "epoch": 1.9112967736201227, + "grad_norm": 2.0124614238739014, + "learning_rate": 6.3770629589154275e-06, + "loss": 1.6385, + "step": 53760 + }, + { + "epoch": 1.911652297573549, + "grad_norm": 1.9612501859664917, + "learning_rate": 6.3734103819009315e-06, + "loss": 1.6497, + "step": 53770 + }, + { + "epoch": 1.9120078215269753, + "grad_norm": 2.2222492694854736, + "learning_rate": 6.369758361907861e-06, + "loss": 1.6181, + "step": 53780 + }, + { + "epoch": 1.9123633454804017, + "grad_norm": 2.0280916690826416, + "learning_rate": 6.366106899497149e-06, + "loss": 1.6248, + "step": 53790 + }, + { + "epoch": 1.912718869433828, + "grad_norm": 1.8456861972808838, + "learning_rate": 6.3624559952296275e-06, + "loss": 1.6181, + "step": 53800 + }, + { + "epoch": 1.9130743933872545, + "grad_norm": 1.9914888143539429, + "learning_rate": 6.358805649666058e-06, + "loss": 1.6349, + "step": 53810 + }, + { + "epoch": 1.9134299173406808, + "grad_norm": 2.0794036388397217, + "learning_rate": 6.35515586336711e-06, + "loss": 1.632, + "step": 53820 + }, + { + "epoch": 1.9137854412941073, + "grad_norm": 1.9409990310668945, + "learning_rate": 6.35150663689337e-06, + "loss": 1.5857, + "step": 53830 + }, + { + "epoch": 1.9141409652475336, + "grad_norm": 1.886592149734497, + "learning_rate": 6.347857970805336e-06, + "loss": 1.6556, + "step": 53840 + }, + { + "epoch": 1.91449648920096, + "grad_norm": 1.7917068004608154, + "learning_rate": 6.3442098656634155e-06, + "loss": 1.6261, + "step": 53850 + }, + { + "epoch": 1.9148520131543862, + "grad_norm": 1.9958102703094482, + "learning_rate": 6.340562322027936e-06, + "loss": 1.6755, + "step": 53860 + }, + { + "epoch": 1.9152075371078126, + "grad_norm": 1.9651449918746948, + "learning_rate": 6.336915340459142e-06, + "loss": 1.6133, + "step": 53870 + }, + { + "epoch": 1.9155630610612389, + "grad_norm": 1.9258513450622559, + "learning_rate": 6.333268921517184e-06, + "loss": 1.6422, + "step": 53880 + }, + { + "epoch": 1.9159185850146654, + "grad_norm": 1.9306154251098633, + "learning_rate": 6.329623065762129e-06, + "loss": 1.623, + "step": 53890 + }, + { + "epoch": 1.9162741089680917, + "grad_norm": 1.9108705520629883, + "learning_rate": 6.325977773753957e-06, + "loss": 1.6683, + "step": 53900 + }, + { + "epoch": 1.9166296329215182, + "grad_norm": 1.916987419128418, + "learning_rate": 6.322333046052562e-06, + "loss": 1.6664, + "step": 53910 + }, + { + "epoch": 1.9169851568749445, + "grad_norm": 1.8760071992874146, + "learning_rate": 6.318688883217756e-06, + "loss": 1.6102, + "step": 53920 + }, + { + "epoch": 1.9173406808283708, + "grad_norm": 2.035249710083008, + "learning_rate": 6.315045285809251e-06, + "loss": 1.6147, + "step": 53930 + }, + { + "epoch": 1.9176962047817971, + "grad_norm": 1.8555046319961548, + "learning_rate": 6.311402254386687e-06, + "loss": 1.6164, + "step": 53940 + }, + { + "epoch": 1.9180517287352234, + "grad_norm": 1.8743116855621338, + "learning_rate": 6.307759789509609e-06, + "loss": 1.6632, + "step": 53950 + }, + { + "epoch": 1.9184072526886498, + "grad_norm": 1.9584312438964844, + "learning_rate": 6.304117891737475e-06, + "loss": 1.6478, + "step": 53960 + }, + { + "epoch": 1.9187627766420763, + "grad_norm": 1.9928791522979736, + "learning_rate": 6.300476561629662e-06, + "loss": 1.6504, + "step": 53970 + }, + { + "epoch": 1.9191183005955026, + "grad_norm": 2.083934783935547, + "learning_rate": 6.296835799745452e-06, + "loss": 1.6567, + "step": 53980 + }, + { + "epoch": 1.9194738245489291, + "grad_norm": 1.8650946617126465, + "learning_rate": 6.293195606644044e-06, + "loss": 1.6624, + "step": 53990 + }, + { + "epoch": 1.9198293485023554, + "grad_norm": 1.9788693189620972, + "learning_rate": 6.289555982884548e-06, + "loss": 1.6915, + "step": 54000 + }, + { + "epoch": 1.9201848724557817, + "grad_norm": 2.0542051792144775, + "learning_rate": 6.285916929025988e-06, + "loss": 1.6699, + "step": 54010 + }, + { + "epoch": 1.920540396409208, + "grad_norm": 1.9393855333328247, + "learning_rate": 6.282278445627304e-06, + "loss": 1.6726, + "step": 54020 + }, + { + "epoch": 1.9208959203626343, + "grad_norm": 1.938590407371521, + "learning_rate": 6.278640533247338e-06, + "loss": 1.6455, + "step": 54030 + }, + { + "epoch": 1.9212514443160607, + "grad_norm": 2.0474417209625244, + "learning_rate": 6.275003192444852e-06, + "loss": 1.6008, + "step": 54040 + }, + { + "epoch": 1.9216069682694872, + "grad_norm": 1.933955430984497, + "learning_rate": 6.2713664237785195e-06, + "loss": 1.6037, + "step": 54050 + }, + { + "epoch": 1.9219624922229135, + "grad_norm": 1.941810965538025, + "learning_rate": 6.2677302278069266e-06, + "loss": 1.6202, + "step": 54060 + }, + { + "epoch": 1.92231801617634, + "grad_norm": 2.1382062435150146, + "learning_rate": 6.2640946050885705e-06, + "loss": 1.6369, + "step": 54070 + }, + { + "epoch": 1.9226735401297663, + "grad_norm": 1.9926743507385254, + "learning_rate": 6.2604595561818595e-06, + "loss": 1.624, + "step": 54080 + }, + { + "epoch": 1.9230290640831926, + "grad_norm": 1.8309038877487183, + "learning_rate": 6.25682508164511e-06, + "loss": 1.6487, + "step": 54090 + }, + { + "epoch": 1.923384588036619, + "grad_norm": 2.033742904663086, + "learning_rate": 6.253191182036562e-06, + "loss": 1.6486, + "step": 54100 + }, + { + "epoch": 1.9237401119900452, + "grad_norm": 1.9058263301849365, + "learning_rate": 6.249557857914354e-06, + "loss": 1.642, + "step": 54110 + }, + { + "epoch": 1.9240956359434715, + "grad_norm": 2.0782461166381836, + "learning_rate": 6.245925109836542e-06, + "loss": 1.6093, + "step": 54120 + }, + { + "epoch": 1.924451159896898, + "grad_norm": 1.862004041671753, + "learning_rate": 6.242292938361096e-06, + "loss": 1.6631, + "step": 54130 + }, + { + "epoch": 1.9248066838503244, + "grad_norm": 1.8530863523483276, + "learning_rate": 6.2386613440458936e-06, + "loss": 1.6516, + "step": 54140 + }, + { + "epoch": 1.925162207803751, + "grad_norm": 1.9508360624313354, + "learning_rate": 6.235030327448726e-06, + "loss": 1.6512, + "step": 54150 + }, + { + "epoch": 1.9255177317571772, + "grad_norm": 1.7762582302093506, + "learning_rate": 6.231399889127292e-06, + "loss": 1.624, + "step": 54160 + }, + { + "epoch": 1.9258732557106035, + "grad_norm": 2.1135988235473633, + "learning_rate": 6.227770029639206e-06, + "loss": 1.6372, + "step": 54170 + }, + { + "epoch": 1.9262287796640298, + "grad_norm": 1.9216655492782593, + "learning_rate": 6.2241407495419916e-06, + "loss": 1.6046, + "step": 54180 + }, + { + "epoch": 1.9265843036174561, + "grad_norm": 1.995802879333496, + "learning_rate": 6.220512049393082e-06, + "loss": 1.6022, + "step": 54190 + }, + { + "epoch": 1.9269398275708824, + "grad_norm": 1.8913733959197998, + "learning_rate": 6.216883929749826e-06, + "loss": 1.6214, + "step": 54200 + }, + { + "epoch": 1.927295351524309, + "grad_norm": 1.824079155921936, + "learning_rate": 6.213256391169478e-06, + "loss": 1.6018, + "step": 54210 + }, + { + "epoch": 1.9276508754777353, + "grad_norm": 1.9052177667617798, + "learning_rate": 6.209629434209203e-06, + "loss": 1.6245, + "step": 54220 + }, + { + "epoch": 1.9280063994311618, + "grad_norm": 2.0674831867218018, + "learning_rate": 6.2060030594260836e-06, + "loss": 1.6535, + "step": 54230 + }, + { + "epoch": 1.9283619233845881, + "grad_norm": 1.8703824281692505, + "learning_rate": 6.202377267377104e-06, + "loss": 1.601, + "step": 54240 + }, + { + "epoch": 1.9287174473380144, + "grad_norm": 1.8373008966445923, + "learning_rate": 6.1987520586191694e-06, + "loss": 1.5962, + "step": 54250 + }, + { + "epoch": 1.9290729712914407, + "grad_norm": 1.872745156288147, + "learning_rate": 6.19512743370908e-06, + "loss": 1.6289, + "step": 54260 + }, + { + "epoch": 1.929428495244867, + "grad_norm": 1.81459641456604, + "learning_rate": 6.191503393203561e-06, + "loss": 1.6279, + "step": 54270 + }, + { + "epoch": 1.9297840191982933, + "grad_norm": 2.0255491733551025, + "learning_rate": 6.187879937659242e-06, + "loss": 1.6352, + "step": 54280 + }, + { + "epoch": 1.9301395431517199, + "grad_norm": 1.938364863395691, + "learning_rate": 6.184257067632662e-06, + "loss": 1.6082, + "step": 54290 + }, + { + "epoch": 1.9304950671051462, + "grad_norm": 2.025392532348633, + "learning_rate": 6.180634783680272e-06, + "loss": 1.6319, + "step": 54300 + }, + { + "epoch": 1.9308505910585727, + "grad_norm": 2.0753066539764404, + "learning_rate": 6.17701308635843e-06, + "loss": 1.6358, + "step": 54310 + }, + { + "epoch": 1.931206115011999, + "grad_norm": 1.9253500699996948, + "learning_rate": 6.173391976223408e-06, + "loss": 1.603, + "step": 54320 + }, + { + "epoch": 1.9315616389654253, + "grad_norm": 1.9665428400039673, + "learning_rate": 6.169771453831388e-06, + "loss": 1.6557, + "step": 54330 + }, + { + "epoch": 1.9319171629188516, + "grad_norm": 1.876198649406433, + "learning_rate": 6.166151519738454e-06, + "loss": 1.659, + "step": 54340 + }, + { + "epoch": 1.932272686872278, + "grad_norm": 2.0440635681152344, + "learning_rate": 6.162532174500608e-06, + "loss": 1.5992, + "step": 54350 + }, + { + "epoch": 1.9326282108257042, + "grad_norm": 1.954292893409729, + "learning_rate": 6.1589134186737594e-06, + "loss": 1.6207, + "step": 54360 + }, + { + "epoch": 1.9329837347791308, + "grad_norm": 1.9511399269104004, + "learning_rate": 6.155295252813726e-06, + "loss": 1.644, + "step": 54370 + }, + { + "epoch": 1.933339258732557, + "grad_norm": 1.899208426475525, + "learning_rate": 6.151677677476235e-06, + "loss": 1.6321, + "step": 54380 + }, + { + "epoch": 1.9336947826859836, + "grad_norm": 1.962569236755371, + "learning_rate": 6.148060693216926e-06, + "loss": 1.6182, + "step": 54390 + }, + { + "epoch": 1.93405030663941, + "grad_norm": 1.9431875944137573, + "learning_rate": 6.144444300591341e-06, + "loss": 1.6533, + "step": 54400 + }, + { + "epoch": 1.9344058305928362, + "grad_norm": 1.917087435722351, + "learning_rate": 6.14082850015494e-06, + "loss": 1.6401, + "step": 54410 + }, + { + "epoch": 1.9347613545462625, + "grad_norm": 2.0295639038085938, + "learning_rate": 6.137213292463081e-06, + "loss": 1.6772, + "step": 54420 + }, + { + "epoch": 1.9351168784996888, + "grad_norm": 2.0047333240509033, + "learning_rate": 6.13359867807104e-06, + "loss": 1.6686, + "step": 54430 + }, + { + "epoch": 1.9354724024531151, + "grad_norm": 1.8973724842071533, + "learning_rate": 6.1299846575339995e-06, + "loss": 1.6447, + "step": 54440 + }, + { + "epoch": 1.9358279264065417, + "grad_norm": 1.8601739406585693, + "learning_rate": 6.126371231407051e-06, + "loss": 1.6251, + "step": 54450 + }, + { + "epoch": 1.936183450359968, + "grad_norm": 1.846688151359558, + "learning_rate": 6.122758400245195e-06, + "loss": 1.6743, + "step": 54460 + }, + { + "epoch": 1.9365389743133945, + "grad_norm": 1.6897163391113281, + "learning_rate": 6.119146164603335e-06, + "loss": 1.5958, + "step": 54470 + }, + { + "epoch": 1.9368944982668208, + "grad_norm": 1.7965532541275024, + "learning_rate": 6.115534525036293e-06, + "loss": 1.6697, + "step": 54480 + }, + { + "epoch": 1.9372500222202471, + "grad_norm": 1.9531224966049194, + "learning_rate": 6.111923482098791e-06, + "loss": 1.6911, + "step": 54490 + }, + { + "epoch": 1.9376055461736734, + "grad_norm": 1.8963346481323242, + "learning_rate": 6.108313036345465e-06, + "loss": 1.6553, + "step": 54500 + }, + { + "epoch": 1.9379610701270997, + "grad_norm": 1.7927358150482178, + "learning_rate": 6.1047031883308565e-06, + "loss": 1.635, + "step": 54510 + }, + { + "epoch": 1.938316594080526, + "grad_norm": 1.7936055660247803, + "learning_rate": 6.101093938609413e-06, + "loss": 1.6526, + "step": 54520 + }, + { + "epoch": 1.9386721180339526, + "grad_norm": 1.9310426712036133, + "learning_rate": 6.097485287735493e-06, + "loss": 1.6477, + "step": 54530 + }, + { + "epoch": 1.9390276419873789, + "grad_norm": 2.0859527587890625, + "learning_rate": 6.093877236263365e-06, + "loss": 1.6356, + "step": 54540 + }, + { + "epoch": 1.9393831659408054, + "grad_norm": 1.9422963857650757, + "learning_rate": 6.090269784747201e-06, + "loss": 1.6431, + "step": 54550 + }, + { + "epoch": 1.9397386898942317, + "grad_norm": 1.8831487894058228, + "learning_rate": 6.086662933741085e-06, + "loss": 1.6067, + "step": 54560 + }, + { + "epoch": 1.940094213847658, + "grad_norm": 2.097904682159424, + "learning_rate": 6.083056683799009e-06, + "loss": 1.6668, + "step": 54570 + }, + { + "epoch": 1.9404497378010843, + "grad_norm": 1.8286104202270508, + "learning_rate": 6.079451035474864e-06, + "loss": 1.5989, + "step": 54580 + }, + { + "epoch": 1.9408052617545106, + "grad_norm": 2.1394026279449463, + "learning_rate": 6.075845989322457e-06, + "loss": 1.6453, + "step": 54590 + }, + { + "epoch": 1.941160785707937, + "grad_norm": 2.041598320007324, + "learning_rate": 6.072241545895503e-06, + "loss": 1.6452, + "step": 54600 + }, + { + "epoch": 1.9415163096613635, + "grad_norm": 1.9624253511428833, + "learning_rate": 6.068637705747617e-06, + "loss": 1.6554, + "step": 54610 + }, + { + "epoch": 1.9418718336147898, + "grad_norm": 1.9965091943740845, + "learning_rate": 6.065034469432332e-06, + "loss": 1.6203, + "step": 54620 + }, + { + "epoch": 1.9422273575682163, + "grad_norm": 1.9117686748504639, + "learning_rate": 6.06143183750308e-06, + "loss": 1.6907, + "step": 54630 + }, + { + "epoch": 1.9425828815216426, + "grad_norm": 1.9436414241790771, + "learning_rate": 6.057829810513204e-06, + "loss": 1.6389, + "step": 54640 + }, + { + "epoch": 1.942938405475069, + "grad_norm": 2.0671372413635254, + "learning_rate": 6.05422838901595e-06, + "loss": 1.629, + "step": 54650 + }, + { + "epoch": 1.9432939294284952, + "grad_norm": 2.0150489807128906, + "learning_rate": 6.050627573564474e-06, + "loss": 1.6819, + "step": 54660 + }, + { + "epoch": 1.9436494533819215, + "grad_norm": 2.0774424076080322, + "learning_rate": 6.047027364711842e-06, + "loss": 1.6285, + "step": 54670 + }, + { + "epoch": 1.9440049773353478, + "grad_norm": 1.8574728965759277, + "learning_rate": 6.0434277630110195e-06, + "loss": 1.6132, + "step": 54680 + }, + { + "epoch": 1.9443605012887744, + "grad_norm": 1.8759816884994507, + "learning_rate": 6.0398287690148864e-06, + "loss": 1.6227, + "step": 54690 + }, + { + "epoch": 1.9447160252422007, + "grad_norm": 2.022251844406128, + "learning_rate": 6.036230383276224e-06, + "loss": 1.6282, + "step": 54700 + }, + { + "epoch": 1.9450715491956272, + "grad_norm": 1.8638499975204468, + "learning_rate": 6.03263260634772e-06, + "loss": 1.6657, + "step": 54710 + }, + { + "epoch": 1.9454270731490535, + "grad_norm": 1.9763329029083252, + "learning_rate": 6.029035438781973e-06, + "loss": 1.6613, + "step": 54720 + }, + { + "epoch": 1.9457825971024798, + "grad_norm": 2.0072896480560303, + "learning_rate": 6.025438881131489e-06, + "loss": 1.6344, + "step": 54730 + }, + { + "epoch": 1.9461381210559061, + "grad_norm": 2.0064783096313477, + "learning_rate": 6.021842933948667e-06, + "loss": 1.6461, + "step": 54740 + }, + { + "epoch": 1.9464936450093324, + "grad_norm": 1.7632877826690674, + "learning_rate": 6.018247597785827e-06, + "loss": 1.671, + "step": 54750 + }, + { + "epoch": 1.9468491689627587, + "grad_norm": 1.792749285697937, + "learning_rate": 6.01465287319519e-06, + "loss": 1.6333, + "step": 54760 + }, + { + "epoch": 1.9472046929161853, + "grad_norm": 1.7539933919906616, + "learning_rate": 6.011058760728887e-06, + "loss": 1.6139, + "step": 54770 + }, + { + "epoch": 1.9475602168696116, + "grad_norm": 2.02398943901062, + "learning_rate": 6.007465260938945e-06, + "loss": 1.6289, + "step": 54780 + }, + { + "epoch": 1.947915740823038, + "grad_norm": 1.932620644569397, + "learning_rate": 6.003872374377304e-06, + "loss": 1.6382, + "step": 54790 + }, + { + "epoch": 1.9482712647764644, + "grad_norm": 1.8369656801223755, + "learning_rate": 6.000280101595812e-06, + "loss": 1.6346, + "step": 54800 + }, + { + "epoch": 1.9486267887298907, + "grad_norm": 1.9664580821990967, + "learning_rate": 5.996688443146217e-06, + "loss": 1.5874, + "step": 54810 + }, + { + "epoch": 1.948982312683317, + "grad_norm": 1.980931282043457, + "learning_rate": 5.993097399580177e-06, + "loss": 1.6441, + "step": 54820 + }, + { + "epoch": 1.9493378366367433, + "grad_norm": 2.005143880844116, + "learning_rate": 5.989506971449255e-06, + "loss": 1.6219, + "step": 54830 + }, + { + "epoch": 1.9496933605901696, + "grad_norm": 1.9172461032867432, + "learning_rate": 5.985917159304912e-06, + "loss": 1.6612, + "step": 54840 + }, + { + "epoch": 1.9500488845435962, + "grad_norm": 1.9985624551773071, + "learning_rate": 5.982327963698528e-06, + "loss": 1.6124, + "step": 54850 + }, + { + "epoch": 1.9504044084970225, + "grad_norm": 1.9466190338134766, + "learning_rate": 5.978739385181376e-06, + "loss": 1.6695, + "step": 54860 + }, + { + "epoch": 1.950759932450449, + "grad_norm": 1.8929356336593628, + "learning_rate": 5.975151424304641e-06, + "loss": 1.6219, + "step": 54870 + }, + { + "epoch": 1.9511154564038753, + "grad_norm": 1.8140504360198975, + "learning_rate": 5.971564081619414e-06, + "loss": 1.6497, + "step": 54880 + }, + { + "epoch": 1.9514709803573016, + "grad_norm": 1.8329968452453613, + "learning_rate": 5.967977357676684e-06, + "loss": 1.6418, + "step": 54890 + }, + { + "epoch": 1.951826504310728, + "grad_norm": 1.822810173034668, + "learning_rate": 5.964391253027353e-06, + "loss": 1.6689, + "step": 54900 + }, + { + "epoch": 1.9521820282641542, + "grad_norm": 1.939647912979126, + "learning_rate": 5.9608057682222194e-06, + "loss": 1.6432, + "step": 54910 + }, + { + "epoch": 1.9525375522175805, + "grad_norm": 2.139228343963623, + "learning_rate": 5.957220903811993e-06, + "loss": 1.6232, + "step": 54920 + }, + { + "epoch": 1.952893076171007, + "grad_norm": 1.8620997667312622, + "learning_rate": 5.953636660347288e-06, + "loss": 1.6186, + "step": 54930 + }, + { + "epoch": 1.9532486001244334, + "grad_norm": 1.9274126291275024, + "learning_rate": 5.9500530383786205e-06, + "loss": 1.6124, + "step": 54940 + }, + { + "epoch": 1.9536041240778599, + "grad_norm": 1.7978813648223877, + "learning_rate": 5.946470038456416e-06, + "loss": 1.5766, + "step": 54950 + }, + { + "epoch": 1.9539596480312862, + "grad_norm": 1.954411268234253, + "learning_rate": 5.942887661130995e-06, + "loss": 1.6011, + "step": 54960 + }, + { + "epoch": 1.9543151719847125, + "grad_norm": 2.41074800491333, + "learning_rate": 5.93930590695259e-06, + "loss": 1.6419, + "step": 54970 + }, + { + "epoch": 1.9546706959381388, + "grad_norm": 1.847296953201294, + "learning_rate": 5.935724776471339e-06, + "loss": 1.6543, + "step": 54980 + }, + { + "epoch": 1.955026219891565, + "grad_norm": 1.9085177183151245, + "learning_rate": 5.932144270237279e-06, + "loss": 1.6094, + "step": 54990 + }, + { + "epoch": 1.9553817438449914, + "grad_norm": 1.9735503196716309, + "learning_rate": 5.9285643888003516e-06, + "loss": 1.6406, + "step": 55000 + }, + { + "epoch": 1.955737267798418, + "grad_norm": 1.8311965465545654, + "learning_rate": 5.924985132710409e-06, + "loss": 1.6452, + "step": 55010 + }, + { + "epoch": 1.9560927917518443, + "grad_norm": 2.274158239364624, + "learning_rate": 5.921406502517197e-06, + "loss": 1.6486, + "step": 55020 + }, + { + "epoch": 1.9564483157052708, + "grad_norm": 1.9504927396774292, + "learning_rate": 5.917828498770375e-06, + "loss": 1.6552, + "step": 55030 + }, + { + "epoch": 1.956803839658697, + "grad_norm": 1.9453336000442505, + "learning_rate": 5.9142511220194985e-06, + "loss": 1.614, + "step": 55040 + }, + { + "epoch": 1.9571593636121234, + "grad_norm": 1.955653190612793, + "learning_rate": 5.910674372814036e-06, + "loss": 1.62, + "step": 55050 + }, + { + "epoch": 1.9575148875655497, + "grad_norm": 1.9923677444458008, + "learning_rate": 5.907098251703345e-06, + "loss": 1.6377, + "step": 55060 + }, + { + "epoch": 1.957870411518976, + "grad_norm": 1.9610176086425781, + "learning_rate": 5.903522759236702e-06, + "loss": 1.6179, + "step": 55070 + }, + { + "epoch": 1.9582259354724023, + "grad_norm": 1.9300296306610107, + "learning_rate": 5.899947895963279e-06, + "loss": 1.6107, + "step": 55080 + }, + { + "epoch": 1.9585814594258288, + "grad_norm": 2.070293426513672, + "learning_rate": 5.896373662432149e-06, + "loss": 1.6164, + "step": 55090 + }, + { + "epoch": 1.9589369833792551, + "grad_norm": 1.903748631477356, + "learning_rate": 5.892800059192294e-06, + "loss": 1.6447, + "step": 55100 + }, + { + "epoch": 1.9592925073326817, + "grad_norm": 2.0441808700561523, + "learning_rate": 5.889227086792598e-06, + "loss": 1.6361, + "step": 55110 + }, + { + "epoch": 1.959648031286108, + "grad_norm": 1.9533919095993042, + "learning_rate": 5.885654745781848e-06, + "loss": 1.6185, + "step": 55120 + }, + { + "epoch": 1.9600035552395343, + "grad_norm": 2.0124576091766357, + "learning_rate": 5.88208303670873e-06, + "loss": 1.6174, + "step": 55130 + }, + { + "epoch": 1.9603590791929606, + "grad_norm": 2.0306930541992188, + "learning_rate": 5.878511960121842e-06, + "loss": 1.6283, + "step": 55140 + }, + { + "epoch": 1.960714603146387, + "grad_norm": 1.9483821392059326, + "learning_rate": 5.8749415165696725e-06, + "loss": 1.6634, + "step": 55150 + }, + { + "epoch": 1.9610701270998132, + "grad_norm": 2.05631422996521, + "learning_rate": 5.871371706600621e-06, + "loss": 1.6079, + "step": 55160 + }, + { + "epoch": 1.9614256510532397, + "grad_norm": 1.9040125608444214, + "learning_rate": 5.867802530762988e-06, + "loss": 1.6189, + "step": 55170 + }, + { + "epoch": 1.961781175006666, + "grad_norm": 1.896045446395874, + "learning_rate": 5.864233989604978e-06, + "loss": 1.605, + "step": 55180 + }, + { + "epoch": 1.9621366989600926, + "grad_norm": 1.9672343730926514, + "learning_rate": 5.8606660836746985e-06, + "loss": 1.6656, + "step": 55190 + }, + { + "epoch": 1.9624922229135189, + "grad_norm": 2.0945780277252197, + "learning_rate": 5.857098813520152e-06, + "loss": 1.6252, + "step": 55200 + }, + { + "epoch": 1.9628477468669452, + "grad_norm": 1.7114602327346802, + "learning_rate": 5.853532179689256e-06, + "loss": 1.657, + "step": 55210 + }, + { + "epoch": 1.9632032708203715, + "grad_norm": 2.0863399505615234, + "learning_rate": 5.849966182729814e-06, + "loss": 1.649, + "step": 55220 + }, + { + "epoch": 1.9635587947737978, + "grad_norm": 1.833872675895691, + "learning_rate": 5.846400823189546e-06, + "loss": 1.6404, + "step": 55230 + }, + { + "epoch": 1.963914318727224, + "grad_norm": 1.8471561670303345, + "learning_rate": 5.842836101616067e-06, + "loss": 1.6303, + "step": 55240 + }, + { + "epoch": 1.9642698426806506, + "grad_norm": 1.9319822788238525, + "learning_rate": 5.839272018556899e-06, + "loss": 1.6363, + "step": 55250 + }, + { + "epoch": 1.964625366634077, + "grad_norm": 1.969245433807373, + "learning_rate": 5.83570857455946e-06, + "loss": 1.5972, + "step": 55260 + }, + { + "epoch": 1.9649808905875035, + "grad_norm": 1.8855727910995483, + "learning_rate": 5.832145770171074e-06, + "loss": 1.6405, + "step": 55270 + }, + { + "epoch": 1.9653364145409298, + "grad_norm": 1.7602448463439941, + "learning_rate": 5.828583605938964e-06, + "loss": 1.6767, + "step": 55280 + }, + { + "epoch": 1.965691938494356, + "grad_norm": 1.832932710647583, + "learning_rate": 5.825022082410263e-06, + "loss": 1.6609, + "step": 55290 + }, + { + "epoch": 1.9660474624477824, + "grad_norm": 1.9036272764205933, + "learning_rate": 5.821461200131988e-06, + "loss": 1.6552, + "step": 55300 + }, + { + "epoch": 1.9664029864012087, + "grad_norm": 2.110887050628662, + "learning_rate": 5.817900959651072e-06, + "loss": 1.6909, + "step": 55310 + }, + { + "epoch": 1.966758510354635, + "grad_norm": 1.8974913358688354, + "learning_rate": 5.814341361514349e-06, + "loss": 1.61, + "step": 55320 + }, + { + "epoch": 1.9671140343080615, + "grad_norm": 1.991543173789978, + "learning_rate": 5.810782406268546e-06, + "loss": 1.6339, + "step": 55330 + }, + { + "epoch": 1.9674695582614878, + "grad_norm": 1.9699667692184448, + "learning_rate": 5.8072240944603e-06, + "loss": 1.5963, + "step": 55340 + }, + { + "epoch": 1.9678250822149144, + "grad_norm": 1.982343316078186, + "learning_rate": 5.803666426636145e-06, + "loss": 1.7, + "step": 55350 + }, + { + "epoch": 1.9681806061683407, + "grad_norm": 1.8685624599456787, + "learning_rate": 5.800109403342513e-06, + "loss": 1.6561, + "step": 55360 + }, + { + "epoch": 1.968536130121767, + "grad_norm": 1.9771759510040283, + "learning_rate": 5.7965530251257495e-06, + "loss": 1.6414, + "step": 55370 + }, + { + "epoch": 1.9688916540751933, + "grad_norm": 1.860378384590149, + "learning_rate": 5.792997292532081e-06, + "loss": 1.5819, + "step": 55380 + }, + { + "epoch": 1.9692471780286196, + "grad_norm": 1.8745434284210205, + "learning_rate": 5.789442206107649e-06, + "loss": 1.6408, + "step": 55390 + }, + { + "epoch": 1.969602701982046, + "grad_norm": 1.8563114404678345, + "learning_rate": 5.785887766398496e-06, + "loss": 1.6107, + "step": 55400 + }, + { + "epoch": 1.9699582259354724, + "grad_norm": 2.024580240249634, + "learning_rate": 5.782333973950558e-06, + "loss": 1.6539, + "step": 55410 + }, + { + "epoch": 1.9703137498888987, + "grad_norm": 1.8910824060440063, + "learning_rate": 5.7787808293096815e-06, + "loss": 1.6408, + "step": 55420 + }, + { + "epoch": 1.9706692738423253, + "grad_norm": 1.897397756576538, + "learning_rate": 5.775228333021597e-06, + "loss": 1.6104, + "step": 55430 + }, + { + "epoch": 1.9710247977957516, + "grad_norm": 1.880532145500183, + "learning_rate": 5.771676485631952e-06, + "loss": 1.6784, + "step": 55440 + }, + { + "epoch": 1.9713803217491779, + "grad_norm": 1.9245859384536743, + "learning_rate": 5.768125287686287e-06, + "loss": 1.6261, + "step": 55450 + }, + { + "epoch": 1.9717358457026042, + "grad_norm": 1.824560523033142, + "learning_rate": 5.764574739730043e-06, + "loss": 1.6433, + "step": 55460 + }, + { + "epoch": 1.9720913696560305, + "grad_norm": 1.9456140995025635, + "learning_rate": 5.761024842308564e-06, + "loss": 1.6675, + "step": 55470 + }, + { + "epoch": 1.9724468936094568, + "grad_norm": 1.9762824773788452, + "learning_rate": 5.7574755959670906e-06, + "loss": 1.6517, + "step": 55480 + }, + { + "epoch": 1.9728024175628833, + "grad_norm": 1.9311729669570923, + "learning_rate": 5.753927001250763e-06, + "loss": 1.6418, + "step": 55490 + }, + { + "epoch": 1.9731579415163096, + "grad_norm": 2.19989013671875, + "learning_rate": 5.750379058704626e-06, + "loss": 1.6751, + "step": 55500 + }, + { + "epoch": 1.9735134654697362, + "grad_norm": 1.8706977367401123, + "learning_rate": 5.746831768873619e-06, + "loss": 1.589, + "step": 55510 + }, + { + "epoch": 1.9738689894231625, + "grad_norm": 2.1740660667419434, + "learning_rate": 5.743285132302588e-06, + "loss": 1.6747, + "step": 55520 + }, + { + "epoch": 1.9742245133765888, + "grad_norm": 1.9190523624420166, + "learning_rate": 5.739739149536272e-06, + "loss": 1.6302, + "step": 55530 + }, + { + "epoch": 1.974580037330015, + "grad_norm": 1.9370909929275513, + "learning_rate": 5.736193821119307e-06, + "loss": 1.6445, + "step": 55540 + }, + { + "epoch": 1.9749355612834414, + "grad_norm": 1.9816778898239136, + "learning_rate": 5.732649147596242e-06, + "loss": 1.651, + "step": 55550 + }, + { + "epoch": 1.9752910852368677, + "grad_norm": 1.9557394981384277, + "learning_rate": 5.7291051295115065e-06, + "loss": 1.6275, + "step": 55560 + }, + { + "epoch": 1.9756466091902942, + "grad_norm": 1.9068324565887451, + "learning_rate": 5.7255617674094445e-06, + "loss": 1.6153, + "step": 55570 + }, + { + "epoch": 1.9760021331437205, + "grad_norm": 1.999617338180542, + "learning_rate": 5.7220190618342944e-06, + "loss": 1.6201, + "step": 55580 + }, + { + "epoch": 1.976357657097147, + "grad_norm": 1.916778326034546, + "learning_rate": 5.718477013330193e-06, + "loss": 1.6376, + "step": 55590 + }, + { + "epoch": 1.9767131810505734, + "grad_norm": 1.9432013034820557, + "learning_rate": 5.714935622441177e-06, + "loss": 1.6546, + "step": 55600 + }, + { + "epoch": 1.9770687050039997, + "grad_norm": 1.8765946626663208, + "learning_rate": 5.711394889711181e-06, + "loss": 1.6364, + "step": 55610 + }, + { + "epoch": 1.977424228957426, + "grad_norm": 2.0579068660736084, + "learning_rate": 5.707854815684042e-06, + "loss": 1.624, + "step": 55620 + }, + { + "epoch": 1.9777797529108523, + "grad_norm": 1.9126039743423462, + "learning_rate": 5.704315400903491e-06, + "loss": 1.658, + "step": 55630 + }, + { + "epoch": 1.9781352768642786, + "grad_norm": 1.9604213237762451, + "learning_rate": 5.700776645913159e-06, + "loss": 1.6575, + "step": 55640 + }, + { + "epoch": 1.9784908008177051, + "grad_norm": 1.833206295967102, + "learning_rate": 5.697238551256579e-06, + "loss": 1.5933, + "step": 55650 + }, + { + "epoch": 1.9788463247711314, + "grad_norm": 2.0333101749420166, + "learning_rate": 5.693701117477182e-06, + "loss": 1.7015, + "step": 55660 + }, + { + "epoch": 1.979201848724558, + "grad_norm": 1.8384674787521362, + "learning_rate": 5.690164345118289e-06, + "loss": 1.5944, + "step": 55670 + }, + { + "epoch": 1.9795573726779843, + "grad_norm": 1.8767904043197632, + "learning_rate": 5.68662823472313e-06, + "loss": 1.6359, + "step": 55680 + }, + { + "epoch": 1.9799128966314106, + "grad_norm": 2.3191983699798584, + "learning_rate": 5.683092786834833e-06, + "loss": 1.6399, + "step": 55690 + }, + { + "epoch": 1.9802684205848369, + "grad_norm": 1.9148094654083252, + "learning_rate": 5.6795580019964125e-06, + "loss": 1.6165, + "step": 55700 + }, + { + "epoch": 1.9806239445382632, + "grad_norm": 1.9649780988693237, + "learning_rate": 5.676023880750794e-06, + "loss": 1.6495, + "step": 55710 + }, + { + "epoch": 1.9809794684916895, + "grad_norm": 1.7389864921569824, + "learning_rate": 5.672490423640795e-06, + "loss": 1.6209, + "step": 55720 + }, + { + "epoch": 1.981334992445116, + "grad_norm": 1.940738558769226, + "learning_rate": 5.668957631209133e-06, + "loss": 1.5906, + "step": 55730 + }, + { + "epoch": 1.9816905163985423, + "grad_norm": 1.726500153541565, + "learning_rate": 5.6654255039984256e-06, + "loss": 1.5935, + "step": 55740 + }, + { + "epoch": 1.9820460403519689, + "grad_norm": 1.841575026512146, + "learning_rate": 5.661894042551181e-06, + "loss": 1.6638, + "step": 55750 + }, + { + "epoch": 1.9824015643053952, + "grad_norm": 1.9646481275558472, + "learning_rate": 5.658363247409811e-06, + "loss": 1.6413, + "step": 55760 + }, + { + "epoch": 1.9827570882588215, + "grad_norm": 1.8491153717041016, + "learning_rate": 5.654833119116624e-06, + "loss": 1.6231, + "step": 55770 + }, + { + "epoch": 1.9831126122122478, + "grad_norm": 1.9587262868881226, + "learning_rate": 5.651303658213825e-06, + "loss": 1.6477, + "step": 55780 + }, + { + "epoch": 1.983468136165674, + "grad_norm": 1.9501127004623413, + "learning_rate": 5.647774865243523e-06, + "loss": 1.6247, + "step": 55790 + }, + { + "epoch": 1.9838236601191004, + "grad_norm": 1.8898067474365234, + "learning_rate": 5.644246740747707e-06, + "loss": 1.6378, + "step": 55800 + }, + { + "epoch": 1.984179184072527, + "grad_norm": 1.9823150634765625, + "learning_rate": 5.640719285268284e-06, + "loss": 1.6399, + "step": 55810 + }, + { + "epoch": 1.9845347080259532, + "grad_norm": 1.9178792238235474, + "learning_rate": 5.637192499347044e-06, + "loss": 1.6473, + "step": 55820 + }, + { + "epoch": 1.9848902319793797, + "grad_norm": 2.044755458831787, + "learning_rate": 5.63366638352568e-06, + "loss": 1.6059, + "step": 55830 + }, + { + "epoch": 1.985245755932806, + "grad_norm": 2.0040781497955322, + "learning_rate": 5.630140938345784e-06, + "loss": 1.6196, + "step": 55840 + }, + { + "epoch": 1.9856012798862324, + "grad_norm": 1.8934400081634521, + "learning_rate": 5.626616164348844e-06, + "loss": 1.6292, + "step": 55850 + }, + { + "epoch": 1.9859568038396587, + "grad_norm": 1.950329303741455, + "learning_rate": 5.623092062076236e-06, + "loss": 1.6541, + "step": 55860 + }, + { + "epoch": 1.986312327793085, + "grad_norm": 1.837617039680481, + "learning_rate": 5.619568632069243e-06, + "loss": 1.6316, + "step": 55870 + }, + { + "epoch": 1.9866678517465113, + "grad_norm": 1.951453685760498, + "learning_rate": 5.616045874869045e-06, + "loss": 1.6863, + "step": 55880 + }, + { + "epoch": 1.9870233756999378, + "grad_norm": 1.8173454999923706, + "learning_rate": 5.612523791016711e-06, + "loss": 1.5941, + "step": 55890 + }, + { + "epoch": 1.9873788996533641, + "grad_norm": 1.9951165914535522, + "learning_rate": 5.609002381053213e-06, + "loss": 1.6386, + "step": 55900 + }, + { + "epoch": 1.9877344236067906, + "grad_norm": 1.8609397411346436, + "learning_rate": 5.605481645519422e-06, + "loss": 1.6426, + "step": 55910 + }, + { + "epoch": 1.988089947560217, + "grad_norm": 1.9778833389282227, + "learning_rate": 5.601961584956093e-06, + "loss": 1.6824, + "step": 55920 + }, + { + "epoch": 1.9884454715136433, + "grad_norm": 1.9272605180740356, + "learning_rate": 5.598442199903887e-06, + "loss": 1.63, + "step": 55930 + }, + { + "epoch": 1.9888009954670696, + "grad_norm": 2.024440050125122, + "learning_rate": 5.594923490903363e-06, + "loss": 1.6416, + "step": 55940 + }, + { + "epoch": 1.9891565194204959, + "grad_norm": 1.9837208986282349, + "learning_rate": 5.591405458494969e-06, + "loss": 1.6337, + "step": 55950 + }, + { + "epoch": 1.9895120433739222, + "grad_norm": 1.9372029304504395, + "learning_rate": 5.5878881032190555e-06, + "loss": 1.6284, + "step": 55960 + }, + { + "epoch": 1.9898675673273487, + "grad_norm": 2.217460870742798, + "learning_rate": 5.584371425615865e-06, + "loss": 1.6289, + "step": 55970 + }, + { + "epoch": 1.990223091280775, + "grad_norm": 1.9019287824630737, + "learning_rate": 5.580855426225538e-06, + "loss": 1.6128, + "step": 55980 + }, + { + "epoch": 1.9905786152342015, + "grad_norm": 2.099592924118042, + "learning_rate": 5.577340105588109e-06, + "loss": 1.6096, + "step": 55990 + }, + { + "epoch": 1.9909341391876278, + "grad_norm": 1.902306079864502, + "learning_rate": 5.573825464243508e-06, + "loss": 1.6055, + "step": 56000 + }, + { + "epoch": 1.9912896631410542, + "grad_norm": 1.955054521560669, + "learning_rate": 5.570311502731569e-06, + "loss": 1.627, + "step": 56010 + }, + { + "epoch": 1.9916451870944805, + "grad_norm": 1.910407304763794, + "learning_rate": 5.566798221592005e-06, + "loss": 1.5674, + "step": 56020 + }, + { + "epoch": 1.9920007110479068, + "grad_norm": 1.9375877380371094, + "learning_rate": 5.563285621364436e-06, + "loss": 1.6361, + "step": 56030 + }, + { + "epoch": 1.992356235001333, + "grad_norm": 1.9428280591964722, + "learning_rate": 5.559773702588382e-06, + "loss": 1.6032, + "step": 56040 + }, + { + "epoch": 1.9927117589547596, + "grad_norm": 2.0371007919311523, + "learning_rate": 5.556262465803244e-06, + "loss": 1.6356, + "step": 56050 + }, + { + "epoch": 1.993067282908186, + "grad_norm": 1.87294340133667, + "learning_rate": 5.552751911548328e-06, + "loss": 1.6167, + "step": 56060 + }, + { + "epoch": 1.9934228068616124, + "grad_norm": 2.201530933380127, + "learning_rate": 5.5492420403628335e-06, + "loss": 1.6139, + "step": 56070 + }, + { + "epoch": 1.9937783308150387, + "grad_norm": 1.8459148406982422, + "learning_rate": 5.5457328527858546e-06, + "loss": 1.6256, + "step": 56080 + }, + { + "epoch": 1.994133854768465, + "grad_norm": 1.781484842300415, + "learning_rate": 5.542224349356382e-06, + "loss": 1.6231, + "step": 56090 + }, + { + "epoch": 1.9944893787218914, + "grad_norm": 1.799372673034668, + "learning_rate": 5.5387165306133e-06, + "loss": 1.6064, + "step": 56100 + }, + { + "epoch": 1.9948449026753177, + "grad_norm": 1.8162728548049927, + "learning_rate": 5.5352093970953844e-06, + "loss": 1.6262, + "step": 56110 + }, + { + "epoch": 1.995200426628744, + "grad_norm": 1.9241447448730469, + "learning_rate": 5.531702949341311e-06, + "loss": 1.6323, + "step": 56120 + }, + { + "epoch": 1.9955559505821705, + "grad_norm": 1.852016806602478, + "learning_rate": 5.5281971878896505e-06, + "loss": 1.6684, + "step": 56130 + }, + { + "epoch": 1.9959114745355968, + "grad_norm": 1.8842954635620117, + "learning_rate": 5.524692113278861e-06, + "loss": 1.6066, + "step": 56140 + }, + { + "epoch": 1.9962669984890233, + "grad_norm": 1.9963688850402832, + "learning_rate": 5.521187726047308e-06, + "loss": 1.6465, + "step": 56150 + }, + { + "epoch": 1.9966225224424496, + "grad_norm": 1.9112470149993896, + "learning_rate": 5.517684026733232e-06, + "loss": 1.6166, + "step": 56160 + }, + { + "epoch": 1.996978046395876, + "grad_norm": 1.9450349807739258, + "learning_rate": 5.514181015874791e-06, + "loss": 1.6156, + "step": 56170 + }, + { + "epoch": 1.9973335703493023, + "grad_norm": 1.9551585912704468, + "learning_rate": 5.510678694010016e-06, + "loss": 1.5933, + "step": 56180 + }, + { + "epoch": 1.9976890943027286, + "grad_norm": 1.9778324365615845, + "learning_rate": 5.507177061676844e-06, + "loss": 1.6367, + "step": 56190 + }, + { + "epoch": 1.9980446182561549, + "grad_norm": 1.8860313892364502, + "learning_rate": 5.503676119413106e-06, + "loss": 1.6546, + "step": 56200 + }, + { + "epoch": 1.9984001422095814, + "grad_norm": 1.9805750846862793, + "learning_rate": 5.500175867756523e-06, + "loss": 1.6811, + "step": 56210 + }, + { + "epoch": 1.9987556661630077, + "grad_norm": 1.9156203269958496, + "learning_rate": 5.496676307244713e-06, + "loss": 1.6241, + "step": 56220 + }, + { + "epoch": 1.9991111901164342, + "grad_norm": 1.8544695377349854, + "learning_rate": 5.493177438415186e-06, + "loss": 1.6506, + "step": 56230 + }, + { + "epoch": 1.9994667140698605, + "grad_norm": 1.9403393268585205, + "learning_rate": 5.489679261805347e-06, + "loss": 1.6226, + "step": 56240 + }, + { + "epoch": 1.9998222380232868, + "grad_norm": 1.8933693170547485, + "learning_rate": 5.486181777952493e-06, + "loss": 1.6402, + "step": 56250 + }, + { + "epoch": 2.0, + "eval_loss": 1.720118761062622, + "eval_runtime": 9.6697, + "eval_samples_per_second": 105.898, + "eval_steps_per_second": 1.655, + "step": 56255 + } + ], + "logging_steps": 10, + "max_steps": 84381, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.815794246966837e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}